sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/ep_moe/layer.py

@@ -5,6 +5,9 @@ import torch
 from torch.nn import Module
 
 from sglang.srt.layers.quantization.deep_gemm import _ENABLE_JIT_DEEPGEMM
+from sglang.srt.managers.expert_location import get_global_expert_location_metadata
+from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.managers.schedule_batch import global_server_args_dict
 
 try:
     from deep_gemm import (
@@ -40,16 +43,19 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     tma_align_input_scale,
 )
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
-from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoEMethodBase
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE, FusedMoEMethodBase
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
-from sglang.srt.layers.quantization.fp8_kernel import
+from sglang.srt.layers.quantization.fp8_kernel import (
+    scaled_fp8_quant,
+    sglang_per_token_quant_fp8,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
-from sglang.srt.utils import DeepEPMode, is_hip, set_weight_attrs
+from sglang.srt.utils import DeepEPMode, dispose_tensor, is_hip, set_weight_attrs
 
 _is_hip = is_hip()
 
@@ -62,10 +68,16 @@ logger = logging.getLogger(__name__)
 class GroupedGemmRunner(torch.nn.Module):
     flashinfer_gemm_warpper = None
 
-    def __init__(
+    def __init__(
+        self,
+        device,
+        use_flashinfer: bool = False,
+        use_per_token_if_dynamic: bool = True,
+    ):
         super().__init__()
         self.device = device
         self.use_flashinfer = use_flashinfer
+        self.use_per_token_if_dynamic = use_per_token_if_dynamic
         if self.use_flashinfer and GroupedGemmRunner.flashinfer_gemm_warpper is None:
             GroupedGemmRunner._init_flashinfer_wrapper(device)
 
@@ -92,6 +104,7 @@ class GroupedGemmRunner(torch.nn.Module):
         scale_a: torch.Tensor = None,
         scale_b: torch.Tensor = None,
         block_shape: Optional[List[int]] = None,
+        c_dtype=None,
     ):
         if self.use_flashinfer:
             # TODO: flashinfer
@@ -119,6 +132,8 @@ class GroupedGemmRunner(torch.nn.Module):
                 scale_a,
                 scale_b,
                 block_shape=block_shape,
+                c_dtype=c_dtype,
+                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
             )
         return c
 
@@ -136,10 +151,12 @@ class EPMoE(torch.nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -148,6 +165,7 @@ class EPMoE(torch.nn.Module):
         custom_routing_function: Optional[Callable] = None,
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
+        use_per_token_if_dynamic: bool = True,
     ):
         super().__init__()
 
@@ -159,8 +177,12 @@ class EPMoE(torch.nn.Module):
         )
         self.tp_rank = get_tensor_model_parallel_rank()
 
+        self.layer_id = layer_id
         self.num_experts = num_experts
         assert self.num_experts % self.tp_size == 0
+        assert (
+            num_fused_shared_experts == 0
+        ), "num_fused_shared_experts is not supported in EP"
         self.num_experts_per_partition = self.num_experts // self.tp_size
         self.start_expert_id = self.tp_rank * self.num_experts_per_partition
         self.end_expert_id = self.start_expert_id + self.num_experts_per_partition - 1
@@ -177,6 +199,7 @@ class EPMoE(torch.nn.Module):
         self.custom_routing_function = custom_routing_function
         self.activation = activation
         self.routed_scaling_factor = routed_scaling_factor
+        self.use_per_token_if_dynamic = use_per_token_if_dynamic
 
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedEPMoEMethod()
@@ -210,12 +233,17 @@ class EPMoE(torch.nn.Module):
         self.grouped_gemm_runner = None
 
     def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        hidden_states_shape = hidden_states.shape
+        hidden_states_dtype = hidden_states.dtype
+        hidden_states_device = hidden_states.device
+
         assert self.quant_method is not None
 
         if self.grouped_gemm_runner is None:
             self.grouped_gemm_runner = GroupedGemmRunner(
                 hidden_states.device,
                 use_flashinfer=False,  # TODO: use flashinfer
+                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
             )
 
         topk_weights, topk_ids = select_experts(
@@ -229,6 +257,9 @@ class EPMoE(torch.nn.Module):
             correction_bias=self.correction_bias,
             custom_routing_function=self.custom_routing_function,
             routed_scaling_factor=self.routed_scaling_factor,
+            expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                layer_id=self.layer_id,
+            ),
         )
 
         reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
@@ -245,12 +276,16 @@ class EPMoE(torch.nn.Module):
             ),
         )
         if self.activation_scheme == "dynamic" and not self.use_block_quant:
-
-            torch.max(hidden_states)
-            .
-
-
-
+            if self.use_per_token_if_dynamic:
+                max_value = torch.max(hidden_states, dim=1).values.to(torch.float32)
+                self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
+            else:
+                max_value = (
+                    torch.max(hidden_states)
+                    .repeat(self.num_experts_per_partition)
+                    .to(torch.float32)
+                )
+                self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
 
         # PreReorder
         pre_reorder_triton_kernel[(hidden_states.shape[0],)](
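The hunk above replaces the single per-tensor `w13_input_scale` with an optional per-token scale when `use_per_token_if_dynamic` is set. As a rough, standalone illustration of the difference (plain PyTorch; the diff itself relies on fused Triton kernels such as `sglang_per_token_quant_fp8`, not this code):

```python
import torch

# Illustrative only: contrast one FP8 scale per tensor with one scale per token row.
FP8 = torch.float8_e4m3fn


def per_tensor_scale(x: torch.Tensor) -> torch.Tensor:
    # One scale shared by every token: a single outlier row inflates it for all rows.
    return (x.abs().amax().to(torch.float32) / torch.finfo(FP8).max).clamp(min=1e-12)


def per_token_scale(x: torch.Tensor) -> torch.Tensor:
    # One scale per row (token), shape [num_tokens].
    return (x.abs().amax(dim=1).to(torch.float32) / torch.finfo(FP8).max).clamp(min=1e-12)


def quantize(x: torch.Tensor, scale: torch.Tensor):
    # Broadcast the scale, clamp to the FP8 representable range, and cast.
    s = scale.reshape(-1, 1) if scale.ndim == 1 else scale
    x_q = (x.float() / s).clamp(torch.finfo(FP8).min, torch.finfo(FP8).max)
    return x_q.to(FP8), scale


x = torch.randn(4, 8, dtype=torch.bfloat16)
q, s = quantize(x, per_token_scale(x))
print(q.dtype, s.shape)  # torch.float8_e4m3fn torch.Size([4])
```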
@@ -264,26 +299,40 @@ class EPMoE(torch.nn.Module):
             self.top_k,
             hidden_states.shape[1],
             BLOCK_SIZE=512,
+            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
         )
+        dispose_tensor(hidden_states)
+
+        if (
+            self.activation_scheme == "dynamic"
+            and not self.use_block_quant
+            and self.use_per_token_if_dynamic
+        ):
+            scale = torch.empty(
+                hidden_states_shape[0] * self.top_k,
+                device=hidden_states_device,
+                dtype=torch.float32,
+            )
+            scale[src2dst] = (
+                self.w13_input_scale.unsqueeze(1)
+                .expand(hidden_states_shape[0], self.top_k)
+                .reshape(-1)
+            )
+            self.w13_input_scale = scale
 
         seg_indptr_cur_rank = seg_indptr[self.start_expert_id : self.end_expert_id + 2]
         weight_indices_cur_rank = torch.arange(
             0,
             self.num_experts_per_partition,
-            device=
+            device=hidden_states_device,
             dtype=torch.int64,
         )
         # GroupGemm-0
-        gateup_output = torch.empty(
-            gateup_input.shape[0],
-            self.w13_weight.shape[1],
-            device=hidden_states.device,
-            dtype=hidden_states.dtype,
-        )
         gateup_output = self.grouped_gemm_runner(
             a=gateup_input,
             b=self.w13_weight,
-            c=
+            c=None,
+            c_dtype=hidden_states_dtype,
             batch_size=self.num_experts_per_partition,
             weight_column_major=True,
             seg_indptr=seg_indptr_cur_rank,
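The `dispose_tensor(...)` and `del ...` calls added in this and the following hunks release each intermediate right after its last use, so the allocator can reuse that memory before the next large buffer is created. A minimal sketch of the idea behind such a helper, assuming it swaps the tensor's storage for an empty one (the real `sglang.srt.utils.dispose_tensor` may be implemented differently):

```python
import torch

# Assumed behavior for illustration: free the backing storage even while other
# Python references to the tensor object are still alive.
def dispose_tensor(x: torch.Tensor) -> None:
    x.set_(torch.empty((0,), dtype=x.dtype, device=x.device))


buf = torch.empty(1024, 1024)
alias = buf                  # the same tensor object is referenced elsewhere
dispose_tensor(buf)          # storage is released now, not at the end of the scope
print(alias.numel())         # 0
```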
@@ -297,23 +346,27 @@ class EPMoE(torch.nn.Module):
             ),
             block_shape=self.block_shape,
         )
+        del gateup_input
 
         # Act
-
-
-
-
-
-
-
-
-
-
-
-
-
-            dtype=
-
+        if self.activation_scheme == "dynamic" and not self.use_block_quant:
+            self.w2_input_scale = None
+            down_input = torch.empty(
+                gateup_output.shape[0],
+                gateup_output.shape[1] // 2,
+                device=gateup_output.device,
+                dtype=hidden_states_dtype,
+            )
+        else:
+            down_input = torch.empty(
+                gateup_output.shape[0],
+                gateup_output.shape[1] // 2,
+                device=gateup_output.device,
+                dtype=(
+                    self.fp8_dtype
+                    if (self.use_fp8_w8a8 and not self.use_block_quant)
+                    else hidden_states_dtype
+                ),
+            )
 
         if self.activation == "silu":
@@ -340,13 +393,24 @@ class EPMoE(torch.nn.Module):
             )
         else:
             raise ValueError(f"Unsupported activation: {self.activation=}")
+        del gateup_output
+
+        if self.activation_scheme == "dynamic" and not self.use_block_quant:
+            if self.use_per_token_if_dynamic:
+                down_input, self.w2_input_scale = sglang_per_token_quant_fp8(down_input)
+            else:
+                self.w2_input_scale = torch.ones(
+                    self.num_experts_per_partition,
+                    dtype=torch.float32,
+                    device=hidden_states_device,
+                )
 
         # GroupGemm-1
         down_output = torch.empty(
             down_input.shape[0],
             self.w2_weight.shape[1],
-            device=
-            dtype=
+            device=hidden_states_device,
+            dtype=hidden_states_dtype,
         )
         down_output = self.grouped_gemm_runner(
             a=down_input,
@@ -365,10 +429,13 @@ class EPMoE(torch.nn.Module):
             ),
             block_shape=self.block_shape,
         )
+        del down_input
 
         # PostReorder
-        output = torch.
-
+        output = torch.empty(
+            hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
+        )
+        post_reorder_triton_kernel[(hidden_states_shape[0],)](
             down_output,
             output,
             src2dst,
@@ -377,7 +444,7 @@ class EPMoE(torch.nn.Module):
             self.start_expert_id,
             self.end_expert_id,
             self.top_k,
-
+            hidden_states_shape[1],
             BLOCK_SIZE=512,
         )
         return output
@@ -417,6 +484,28 @@ class EPMoE(torch.nn.Module):
         weight_name: str,
         shard_id: str,
         expert_id: int,
+    ) -> None:
+        physical_expert_ids = (
+            get_global_expert_location_metadata().logical_to_all_physical(
+                self.layer_id, expert_id
+            )
+        )
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
     ) -> None:
         if expert_id < self.start_expert_id or expert_id > self.end_expert_id:
             return
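The new `weight_loader` above fans a logical expert id out to every physical replica returned by `logical_to_all_physical` before delegating to `_weight_loader_physical`. A hypothetical, self-contained illustration of that fan-out (the dict and callback here stand in for sglang's expert-location metadata, they are not its API):

```python
from typing import Callable, Dict, List

# One logical expert may be replicated into several physical expert slots (for
# example when experts are duplicated for load balancing); the checkpoint weight
# for the logical expert must be written into every replica.
def load_logical_expert(
    logical_id: int,
    logical_to_physical: Dict[int, List[int]],
    load_one_physical: Callable[[int], None],
) -> None:
    for physical_id in logical_to_physical.get(logical_id, []):
        load_one_physical(physical_id)


# Example: logical expert 3 is replicated into physical slots 3 and 67.
load_logical_expert(3, {3: [3, 67]}, lambda pid: print(f"copy weight into slot {pid}"))
```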
@@ -460,7 +549,8 @@ class EPMoE(torch.nn.Module):
         # Input scales can be loaded directly and should be equal.
         if "input_scale" in weight_name:
             if (
-
+                (shard_id == "w1" or shard_id == "w3")
+                and param_data[expert_id] != 1
                 and (param_data[expert_id] - loaded_weight).abs() > 1e-5
             ):
                 raise ValueError(
@@ -534,13 +624,10 @@ class UnquantizedEPMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
         # scale
+        layer.register_parameter("w13_input_scale", None)
+        layer.register_parameter("w13_weight_scale", None)
+
         ones_tensor = torch.ones(num_experts_per_partition, dtype=torch.float32)
-        w13_input_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_input_scale", w13_input_scale)
-        set_weight_attrs(w13_input_scale, extra_weight_attrs)
 
         w2_input_scale = torch.nn.Parameter(
             ones_tensor,
@@ -549,13 +636,6 @@ class UnquantizedEPMoEMethod(FusedMoEMethodBase, CustomOp):
         layer.register_parameter("w2_input_scale", w2_input_scale)
         set_weight_attrs(w2_input_scale, extra_weight_attrs)
 
-        w13_weight_scale = torch.nn.Parameter(
-            ones_tensor,
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_weight_scale", w13_weight_scale)
-        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
-
         w2_weight_scale = torch.nn.Parameter(
             ones_tensor,
             requires_grad=False,
@@ -802,10 +882,12 @@ class DeepEPMoE(EPMoE):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         params_dtype: Optional[torch.dtype] = None,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -817,22 +899,24 @@ class DeepEPMoE(EPMoE):
         deepep_mode: DeepEPMode = DeepEPMode.auto,
     ):
         super().__init__(
-            num_experts,
-            top_k,
-            hidden_size,
-            intermediate_size,
-
-
-
-
-
-
-
-
-
-
-
-
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            layer_id=layer_id,
+            params_dtype=params_dtype,
+            renormalize=renormalize,
+            use_grouped_topk=use_grouped_topk,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            topk_group=topk_group,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=prefix,
+            correction_bias=correction_bias,
+            custom_routing_function=custom_routing_function,
+            activation=activation,
+            routed_scaling_factor=routed_scaling_factor,
         )
         self.deepep_mode = deepep_mode
         if self.deepep_mode.enable_low_latency():
@@ -881,6 +965,9 @@ class DeepEPMoE(EPMoE):
         reorder_topk_ids: torch.Tensor,
         seg_indptr: torch.Tensor,
     ):
+        hidden_states_dtype = hidden_states.dtype
+        hidden_states_device = hidden_states.device
+
         assert self.quant_method is not None
         assert self.activation == "silu"
         if self.grouped_gemm_runner is None:
@@ -903,18 +990,12 @@ class DeepEPMoE(EPMoE):
         )
 
         # GroupGemm-0
-        gateup_output = torch.empty(
-            hidden_states.shape[0],
-            self.w13_weight.shape[1],
-            device=hidden_states.device,
-            dtype=hidden_states.dtype,
-        )
-
         if hidden_states.shape[0] > 0:
             gateup_output = self.grouped_gemm_runner(
                 a=hidden_states,
                 b=self.w13_weight,
-                c=
+                c=None,
+                c_dtype=hidden_states.dtype,
                 batch_size=self.num_experts_per_partition,
                 weight_column_major=True,
                 seg_indptr=seg_indptr,
@@ -928,6 +1009,13 @@ class DeepEPMoE(EPMoE):
                 ),
                 block_shape=self.block_shape,
             )
+        else:
+            gateup_output = torch.empty(
+                hidden_states.shape[0],
+                self.w13_weight.shape[1],
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
 
         # Act
         down_input = torch.empty(
@@ -937,14 +1025,14 @@ class DeepEPMoE(EPMoE):
             dtype=(
                 self.fp8_dtype
                 if (self.use_fp8_w8a8 and not self.use_block_quant)
-                else
+                else hidden_states_dtype
             ),
         )
         if self.w2_input_scale is None and not self.use_block_quant:
             self.w2_input_scale = torch.ones(
                 self.num_experts_per_partition,
                 dtype=torch.float32,
-                device=
+                device=hidden_states_device,
             )
 
         if self.activation == "silu":
@@ -961,12 +1049,14 @@ class DeepEPMoE(EPMoE):
         else:
             raise ValueError(f"Unsupported activation: {self.activation=}")
 
+        del gateup_output
+
         # GroupGemm-1
         down_output = torch.empty(
             down_input.shape[0],
             self.w2_weight.shape[1],
-            device=
-            dtype=
+            device=hidden_states_device,
+            dtype=hidden_states_dtype,
         )
         if down_input.shape[0] > 0:
             down_output = self.grouped_gemm_runner(
@@ -1007,11 +1097,9 @@ class DeepEPMoE(EPMoE):
         N = self.w13_weight.size(1)
         scale_block_size = 128
 
-
-
-
-            dtype=torch.bfloat16,
-        )
+        hidden_states_fp8_shape = hidden_states_fp8.shape
+        hidden_states_fp8_device = hidden_states_fp8.device
+        hidden_states_fp8_dtype = hidden_states_fp8.dtype
 
         input_tensor = [
             torch.empty(
@@ -1049,16 +1137,18 @@ class DeepEPMoE(EPMoE):
             m_indices,
             output_index,
         )
+        dispose_tensor(hidden_states_fp8)
 
         gateup_output = torch.empty(
             (all_tokens, N),
-            device=
+            device=hidden_states_fp8_device,
             dtype=torch.bfloat16,
         )
         input_tensor[1] = tma_align_input_scale(input_tensor[1])
         m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
             input_tensor, self.w13_weight_fp8, gateup_output, m_indices
         )
+        del input_tensor
         down_input = torch.empty(
             (
                 all_tokens,
@@ -1068,14 +1158,16 @@ class DeepEPMoE(EPMoE):
             dtype=torch.bfloat16,
         )
         silu_and_mul(gateup_output.view(-1, N), down_input)
+        del gateup_output
         down_output = torch.empty(
             (all_tokens, K),
-            device=
+            device=hidden_states_fp8_device,
             dtype=torch.bfloat16,
         )
         down_input_fp8, down_input_scale = sglang_per_token_group_quant_fp8(
             down_input, scale_block_size
         )
+        del down_input
         down_input_scale = tma_align_input_scale(down_input_scale)
         m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
             (down_input_fp8, down_input_scale),
@@ -1083,7 +1175,13 @@ class DeepEPMoE(EPMoE):
             down_output,
             m_indices,
         )
+        del down_input_fp8, down_input_scale
 
+        gather_out = torch.empty(
+            hidden_states_fp8_shape,
+            device=hidden_states_fp8_device,
+            dtype=torch.bfloat16,
+        )
         ep_gather(down_output, topk_idx, topk_weights, output_index, gather_out)
 
         return gather_out
@@ -1107,6 +1205,7 @@ class DeepEPMoE(EPMoE):
         m_grouped_gemm_fp8_fp8_bf16_nt_masked(
             hidden_states_fp8, self.w13_weight_fp8, gateup_output, masked_m, expected_m
         )
+        dispose_tensor(hidden_states_fp8[0])
 
         # Act
         down_input = torch.empty(
@@ -1135,6 +1234,7 @@ class DeepEPMoE(EPMoE):
             scale_block_size,
             masked_m,
         )
+        del gateup_output
 
         # GroupGemm-1
         n = self.w2_weight.size(1)
@@ -1150,3 +1250,11 @@ class DeepEPMoE(EPMoE):
         )
 
         return down_output
+
+
+def get_moe_impl_class():
+    if global_server_args_dict["enable_deepep_moe"]:
+        return DeepEPMoE
+    if global_server_args_dict["enable_ep_moe"]:
+        return EPMoE
+    return FusedMoE
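The module now ends with `get_moe_impl_class()`, which lets model code pick the MoE backend from the server arguments instead of hard-coding a class. A self-contained sketch of the same dispatch pattern (the `flags` dict below is only a stand-in for `global_server_args_dict`):

```python
# Choose the MoE implementation from feature flags, falling back to the fused
# Triton MoE; mirrors the selection order in the function added above.
class FusedMoE: ...
class EPMoE(FusedMoE): ...
class DeepEPMoE(EPMoE): ...


def get_moe_impl_class(flags: dict):
    if flags.get("enable_deepep_moe"):
        return DeepEPMoE
    if flags.get("enable_ep_moe"):
        return EPMoE
    return FusedMoE


print(get_moe_impl_class({"enable_ep_moe": True}).__name__)  # EPMoE
```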