sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
```diff
@@ -28,8 +28,9 @@ else:
 import logging
 
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,
@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -179,13 +183,27 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )
 
-        if
+        if _use_aiter:
             assert not no_combine, "unsupported"
+            if apply_router_weight_on_input:
+                assert (
+                    topk_weights.dim() == 2
+                ), "`topk_weights` should be in shape (num_tokens, topk)"
+                _, topk = topk_weights.shape
+                assert (
+                    topk == 1
+                ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                x = x * topk_weights.to(x.dtype)
+                topk_weights = torch.ones_like(
+                    topk_weights, dtype=torch.float32
+                )  # topk_weights must be FP32 (float32)
+
             return ck_moe_2stages(
                 x,
                 layer.w13_weight,
@@ -207,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 no_combine=no_combine,
+                routed_scaling_factor=routed_scaling_factor,
             )
 
     def forward_cpu(
@@ -219,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,
@@ -232,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )
@@ -270,11 +291,13 @@ class FusedMoE(torch.nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: Optional[int] = None,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -293,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -307,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
@@ -532,7 +557,8 @@ class FusedMoE(torch.nn.Module):
         loaded_weight = loaded_weight.to(param.data.device)
 
         if (
-
+            "compressed" in self.quant_method.__class__.__name__.lower()
+            and param.data[expert_id] != 1
             and (param.data[expert_id] - loaded_weight).abs() > 1e-5
         ):
             raise ValueError(
@@ -556,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return
 
         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:
@@ -637,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
```
sglang/srt/layers/moe/topk.py CHANGED
```diff
@@ -18,7 +18,15 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.managers
+from sglang.srt.managers import expert_location_dispatch
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+)
+from sglang.srt.managers.expert_location_dispatch import (
+    ExpertLocationDispatchInfo,
+    topk_ids_logical_to_physical,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip
 
@@ -32,9 +40,6 @@ if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
 
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
-
 def fused_topk_native(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -61,6 +66,8 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -82,9 +89,27 @@ def fused_topk(
     )
     del token_expert_indicies
 
+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
 
 
@@ -97,8 +122,10 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -120,10 +147,10 @@ def grouped_topk(
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -133,12 +160,15 @@ def grouped_topk(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def biased_grouped_topk_impl(
@@ -149,8 +179,10 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -179,10 +211,10 @@ def biased_grouped_topk_impl(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)
 
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -192,18 +224,31 @@ def biased_grouped_topk_impl(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def is_power_of_two(n):
     return n > 0 and math.log2(n).is_integer()
 
 
+def _mask_topk_ids_padded_region(
+    topk_ids: torch.Tensor,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+):
+    if num_token_non_padded is None:
+        return
+    indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
+    topk_ids[indices >= num_token_non_padded, :] = -1
+
+
 def biased_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -213,28 +258,39 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group
        <= 32  # moe_fused_gate kernel ensure that num_experts/num_expert_group does not exceed MAX_VPT=32 now. And when kernel can handle MAX_VPT > 32, we can remove this assertion.
         and is_power_of_two(correction_bias.shape[0])
     ):
-
+        topk_weights, topk_ids = moe_fused_gate(
             gating_output,
             correction_bias,
             num_expert_group,
             topk_group,
             topk,
-
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
+        # TODO merge into kernel for this branch
+        topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+        # TODO will fuse this into kernel, thus use slow manual operation now
+        if num_token_non_padded is None:
+            return topk_weights, topk_ids
+        torch.compile(
+            _mask_topk_ids_padded_region, dynamic=True, backend=get_compiler_backend()
+        )(topk_ids, num_token_non_padded)
+        return topk_weights, topk_ids
     else:
         biased_grouped_topk_fn = (
             torch.compile(
@@ -251,8 +307,10 @@ def biased_grouped_topk(
             renormalize,
             num_expert_group,
             topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
         )
 
 
@@ -264,12 +322,22 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
    correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
@@ -282,8 +350,10 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
             )
         else:
             topk_weights, topk_ids = biased_grouped_topk(
@@ -294,10 +364,16 @@ def select_experts(
                 renormalize=renormalize,
                 num_expert_group=num_expert_group,
                 topk_group=topk_group,
-
+                num_fused_shared_experts=num_fused_shared_experts,
                 routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
             )
     elif torch_native and custom_routing_function is None:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in fused_topk_native"
+        assert expert_location_dispatch_info is None
         topk_weights, topk_ids = fused_topk_native(
             hidden_states=hidden_states,
             gating_output=router_logits,
@@ -305,13 +381,20 @@ def select_experts(
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
+        # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
         )
     else:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in custom_routing_function"
+        assert expert_location_dispatch_info is None
        topk_weights, topk_ids = custom_routing_function(
             hidden_states=hidden_states,
             gating_output=router_logits,
@@ -319,6 +402,6 @@ def select_experts(
             renormalize=renormalize,
         )
 
-
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
 
     return topk_weights, topk_ids
```
@@ -0,0 +1,70 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def hash_kernel(
+    input_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+    PRIME: tl.constexpr,
+    XCONST: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
+    hash_val = mixed * PRIME
+    hash_val = hash_val ^ (hash_val >> 16)
+    hash_val = hash_val * (PRIME ^ XCONST)
+    hash_val = hash_val ^ (hash_val >> 13)
+
+    tl.store(output_ptr + offsets, hash_val, mask=mask)
+
+
+PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
+PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1
+
+
+def gpu_tensor_hash(tensor: torch.Tensor) -> int:
+    assert tensor.is_cuda
+    tensor = tensor.contiguous().view(torch.int32)
+    n = tensor.numel()
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n, BLOCK_SIZE),)
+
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)
+
+    hash_kernel[grid](
+        tensor,
+        intermediate_hashes,
+        n,
+        BLOCK_SIZE=BLOCK_SIZE,
+        PRIME=PRIME_1,
+        XCONST=PRIME_2,
+    )
+
+    # TODO: threads can't be synced on triton kernel
+    final_hash = intermediate_hashes.sum().item()
+
+    return final_hash
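gpu_tensor_hash reinterprets the tensor as 32-bit words, mixes each word with its index using two 64-bit constants (PRIME_1 and PRIME_2 are the signed-int64 reinterpretations of the xxHash64 primes 0x9E3779B185EBCA87 and 0xC2B2AE3D27D4EB4F), and sums the per-element hashes. Below is a CPU-only PyTorch sketch of the same mixing, for illustration; it is not guaranteed to be bit-identical to the Triton kernel:

```python
import torch

# Signed-int64 reinterpretations of the two 64-bit primes used by hash_kernel.
PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1


def tensor_hash_reference(tensor: torch.Tensor) -> int:
    # Reinterpret as int32 words, then apply the same element-wise mix as hash_kernel.
    data = tensor.contiguous().view(torch.int32).to(torch.int64)
    offsets = torch.arange(data.numel(), dtype=torch.int64)

    mixed = data ^ (offsets + PRIME_2)      # gpu_tensor_hash passes XCONST=PRIME_2
    h = mixed * PRIME_1                     # and PRIME=PRIME_1
    h = h ^ (h >> 16)
    h = h * (PRIME_1 ^ PRIME_2)
    h = h ^ (h >> 13)
    return int(h.sum().item())


print(tensor_hash_reference(torch.randn(16, dtype=torch.float32)))
```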
@@ -25,7 +25,6 @@ try:
     from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
-        GPTQMarlinMoEMethod,
     )
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
@@ -58,12 +57,17 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import
     CompressedTensorsConfig,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
-from sglang.srt.layers.quantization.gptq import
+from sglang.srt.layers.quantization.gptq import (
+    GPTQConfig,
+    GPTQMarlinConfig,
+    GPTQMarlinMoEMethod,
+)
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
     ModelOptFp8Config,
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -77,6 +81,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "w8a8_fp8": W8A8Fp8Config,
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
+    "qoq": QoQConfig,
 }
 
 # VLLM-dependent quantization methods
@@ -109,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )
 
     return QUANTIZATION_METHODS[quantization]
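With QoQConfig registered under the "qoq" key, the lookup path shown above resolves it like any other built-in method. A minimal sketch, assuming these hunks live in sglang.srt.layers.quantization.__init__ (the file name is not shown in this diff):

```python
# Sketch only; the import path is an assumption based on the import lines in this hunk.
from sglang.srt.layers.quantization import get_quantization_config

quant_cls = get_quantization_config("qoq")
print(quant_cls)  # expected to be the newly registered QoQConfig
```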
@@ -284,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -310,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
 
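Both BlockInt8MoEMethod and CompressedTensorsW8A8Fp8MoEMethod now forward routed_scaling_factor into what appears to be the fused experts invocation as well. Conceptually, that factor scales the combined output of the routed experts; how the fused kernel applies it internally is not shown here. A toy dense (unquantized, unfused) illustration of that scaling, with made-up sizes:

```python
import torch

num_tokens, hidden, num_experts, top_k = 4, 16, 4, 2
x = torch.randn(num_tokens, hidden)
experts = [torch.nn.Linear(hidden, hidden) for _ in range(num_experts)]

topk_weights = torch.softmax(torch.randn(num_tokens, top_k), dim=-1)
topk_ids = torch.randint(0, num_experts, (num_tokens, top_k))
routed_scaling_factor = 2.5  # hypothetical value

out = torch.zeros_like(x)
for t in range(num_tokens):
    for k in range(top_k):
        out[t] += topk_weights[t, k] * experts[topk_ids[t, k]](x[t])
out = out * routed_scaling_factor  # scaling applied to the routed combination
```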
@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             correction_bias=correction_bias,