sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/modelopt_quant.py

@@ -1,12 +1,17 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py

 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter

-from sglang.srt.layers.linear import LinearBase, LinearMethodBase
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -15,10 +20,12 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    is_sm100_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
+    is_layer_skipped,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -270,9 +277,16 @@ class ModelOptFp4Config(QuantizationConfig):
             )
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
         kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
         group_size = quant_config["group_size"]
         exclude_modules = quant_config["exclude_modules"]
         if not (group_size and kv_cache_quant_algo and exclude_modules):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
             raise ValueError(
                 "NVFP4 quantization requires group size and "
                 "kv_cache_quant_algo specified in "
@@ -285,19 +299,30 @@ class ModelOptFp4Config(QuantizationConfig):
             exclude_modules,
         )

+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        if self.exclude_modules and any(
-            module in prefix for module in self.exclude_modules
-        ):
-            return None
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

         if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
             return ModelOptFp4LinearMethod(self)
         if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
-
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
         return None

     def get_scaled_act_names(self) -> List[str]:
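The new `ModelOptFp4Config.is_layer_excluded` treats the entries of `exclude_modules` as ModelOpt-style wildcard patterns rather than plain substrings: literal dots are escaped, `*` becomes `.*`, and the whole layer prefix must match. Below is a minimal standalone sketch of that matching logic; it uses the stdlib `re` instead of the `regex` package the diff imports, and the example prefixes and patterns are made up for illustration.

```python
import re  # the diff imports "regex as re"; stdlib re behaves the same for these patterns


def is_layer_excluded(prefix: str, exclude_modules: list) -> bool:
    """Wildcard matching as added in ModelOptFp4Config.is_layer_excluded."""
    for pattern in exclude_modules:
        # Escape literal dots, turn "*" into ".*", and require a full match.
        regex_str = pattern.replace(".", r"\.").replace("*", r".*")
        if re.fullmatch(regex_str, prefix):
            return True
    return False


# Hypothetical layer prefixes, for illustration only.
print(is_layer_excluded("model.layers.0.mlp.gate", ["*.mlp.gate"]))     # True
print(is_layer_excluded("model.layers.0.mlp.up_proj", ["*.mlp.gate"]))  # False
```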
@@ -461,3 +486,305 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod:
+    """
+    MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __new__(cls, *args, **kwargs):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        layer.num_experts = num_experts
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M, K)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M, K)
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # GEMM 1
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        assert (
+            layer.w13_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w13_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+
+        layer.w13_blockscale_swizzled = Parameter(
+            w13_blockscale_swizzled, requires_grad=False
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+        # GEMM 2
+        layer.g2_alphas = Parameter(
+            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w2_input_scale_quant = Parameter(
+            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        assert (
+            layer.w2_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w2_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            w2_blockscale_swizzled, requires_grad=False
+        )
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        device = layer.w13_weight.device
+        layer.cutlass_moe_params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device,
+            num_experts=layer.num_experts,
+            intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+            hidden_size=layer.w13_weight.shape[2] * 2,
+        )  # k
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        return cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        ).to(x.dtype)
sglang/srt/layers/quantization/moe_wna16.py

@@ -341,6 +341,7 @@ class MoeWNA16Method:
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -362,6 +363,7 @@ class MoeWNA16Method:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -386,6 +388,7 @@ class MoeWNA16Method:
             w2_zp=layer.w2_qzeros if has_zp else None,
             block_shape=[0, layer.group_size],
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     @staticmethod
sglang/srt/layers/quantization/w8a8_fp8.py

@@ -287,6 +287,7 @@ class W8A8FP8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -306,6 +307,7 @@ class W8A8FP8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -326,4 +328,5 @@ class W8A8FP8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
sglang/srt/layers/quantization/w8a8_int8.py

@@ -225,6 +225,7 @@ class W8A8Int8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -245,6 +246,7 @@ class W8A8Int8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -266,4 +268,5 @@ class W8A8Int8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
sglang/srt/layers/rotary_embedding.py

@@ -8,9 +8,10 @@ import torch
 import torch.nn as nn

 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, is_hip

 _is_cuda = is_cuda()
+_is_hip = is_hip()

 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -609,6 +610,10 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
         )

+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
         pos_freqs = self.base ** (
             torch.arange(0, self.rotary_dim, 2, dtype=torch.float, device=self.device)
@@ -650,17 +655,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cache = torch.cat((cos, sin), dim=-1)
         return cache

-    def forward_hip(self, *args, **kwargs):
-        return self.forward_native(*args, **kwargs)
-
-    def forward(self, *args, **kwargs):
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
-
     def forward_native(
         self,
         positions: torch.Tensor,