sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/modelopt_quant.py:

@@ -1,12 +1,17 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 from torch.nn.parameter import Parameter
 
-from sglang.srt.layers.linear import
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -15,10 +20,12 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    is_sm100_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
+    is_layer_skipped,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -270,9 +277,16 @@ class ModelOptFp4Config(QuantizationConfig):
         )
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
         kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
         group_size = quant_config["group_size"]
         exclude_modules = quant_config["exclude_modules"]
         if not (group_size and kv_cache_quant_algo and exclude_modules):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
             raise ValueError(
                 "NVFP4 quantization requires group size and "
                 "kv_cache_quant_algo specified in "
@@ -285,19 +299,30 @@ class ModelOptFp4Config(QuantizationConfig):
             exclude_modules,
         )
 
+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-
-            module in prefix for module in self.exclude_modules
-        ):
-            return None
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
             return ModelOptFp4LinearMethod(self)
         if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
-
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
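
The new is_layer_excluded helper above treats entries in exclude_modules as glob-style patterns rather than plain substrings, which is the behavioral change compared with the removed `module in prefix` check. A minimal sketch of that matching behavior (illustration only, not part of the diff; it uses the stdlib re module instead of the third-party regex package the diff imports, and the prefixes and patterns are made-up examples):

```python
import re

def matches_exclude(prefix: str, exclude_modules: list) -> bool:
    # Convert a glob-style pattern ("model.layers.*.mlp") into a regex and
    # require a full match against the layer prefix, as the diff does.
    for pattern in exclude_modules:
        regex_str = pattern.replace(".", r"\.").replace("*", r".*")
        if re.fullmatch(regex_str, prefix):
            return True
    return False

print(matches_exclude("model.layers.3.mlp", ["model.layers.*.mlp"]))        # True
print(matches_exclude("model.layers.3.self_attn", ["model.layers.*.mlp"]))  # False
```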
@@ -461,3 +486,305 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod:
+    """
+    MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __new__(cls, *args, **kwargs):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        layer.num_experts = num_experts
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M, K)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M, K)
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # GEMM 1
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        assert (
+            layer.w13_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w13_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+
+        layer.w13_blockscale_swizzled = Parameter(
+            w13_blockscale_swizzled, requires_grad=False
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+        # GEMM 2
+        layer.g2_alphas = Parameter(
+            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w2_input_scale_quant = Parameter(
+            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        assert (
+            layer.w2_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w2_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            w2_blockscale_swizzled, requires_grad=False
+        )
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        device = layer.w13_weight.device
+        layer.cutlass_moe_params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device,
+            num_experts=layer.num_experts,
+            intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+            hidden_size=layer.w13_weight.shape[2] * 2,
+        )  # k
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        return cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        ).to(x.dtype)
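
The swizzle_blockscale helper in the new ModelOptNvFp4FusedMoEMethod pads the FP8 block-scale tensor so its row count is a multiple of 128 and its column count a multiple of 4, then interleaves 128x4 tiles with a reshape/permute before the scales feed the CUTLASS FP4 MoE path. A CPU-only shape check of that transformation (a sketch; it uses float32 and toy sizes purely to illustrate the layout, whereas the real code asserts float8_e4m3fn and moves the result to CUDA):

```python
import torch

scale = torch.zeros(1, 130, 6)  # toy (B, M, K) block-scale tensor

round_up = lambda x, m: (x + m - 1) // m * m
M_pad, K_pad = round_up(130, 128), round_up(6, 4)  # 256, 8
padded = torch.zeros(1, M_pad, K_pad)
padded[:, :130, :6] = scale

B, rows, cols = padded.shape
tiled = padded.reshape(B, rows // 128, 4, 32, cols // 4, 4)  # carve out 128x4 tiles
swizzled = tiled.permute(0, 1, 4, 3, 2, 5).contiguous()      # interleave the tile dims
print(swizzled.shape)  # torch.Size([1, 2, 2, 32, 4, 4]) -- same element count as (B, M_pad, K_pad)
```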
sglang/srt/layers/quantization/moe_wna16.py:

@@ -341,6 +341,7 @@ class MoeWNA16Method:
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -362,6 +363,7 @@ class MoeWNA16Method:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -386,6 +388,7 @@ class MoeWNA16Method:
             w2_zp=layer.w2_qzeros if has_zp else None,
             block_shape=[0, layer.group_size],
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
     @staticmethod
sglang/srt/layers/quantization/qoq.py (new file):

@@ -0,0 +1,244 @@
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import qserve_w4a8_per_chn_gemm, qserve_w4a8_per_group_gemm
+
+
+QoQ_SUPPORTED_WEIGHT_BITS = [4]
+QoQ_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+class QoQConfig(QuantizationConfig):
+    """Config class for QoQ Quantization.
+
+    - Weight: static, per-channel/group, asymmetric
+    - Activation: dynamic, per-token, symmetric
+
+    Reference: https://arxiv.org/abs/2405.04532
+        https://github.com/mit-han-lab/omniserve
+    """
+
+    def __init__(self, weight_bits: int, group_size: int) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+
+        # Verify
+        if self.weight_bits not in QoQ_SUPPORTED_WEIGHT_BITS:
+            raise ValueError(
+                f"QoQ does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {QoQ_SUPPORTED_WEIGHT_BITS} "
+                "are supported."
+            )
+        if self.group_size not in QoQ_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"QoQ does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {QoQ_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+
+        # 4 bits packed into 8 bit datatype.
+        self.pack_factor = 8 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return "QoQConfig(weight_bits={}, group_size={})".format(
+            self.weight_bits, self.group_size
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(self) -> str:
+        return "qoq"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QoQConfig":
+        weight_bits = cls.get_from_keys(config, ["wbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return QoQLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QoQLinearMethod(LinearMethodBase):
+    """Linear method for QoQ.
+
+    Args:
+        quant_config: The QoQ quantization config.
+    """
+
+    def __init__(self, quant_config: QoQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
params_dtype: torch.dtype,
|
124
|
+
**extra_weight_attrs,
|
125
|
+
):
|
126
|
+
|
127
|
+
weight_loader = extra_weight_attrs.get("weight_loader")
|
128
|
+
|
129
|
+
# Validate output_size_per_partition
|
130
|
+
output_size_per_partition = sum(output_partition_sizes)
|
131
|
+
if output_size_per_partition % 32 != 0:
|
132
|
+
raise ValueError(
|
133
|
+
f"Weight output_size_per_partition = "
|
134
|
+
f"{output_size_per_partition} is not divisible by 32."
|
135
|
+
)
|
136
|
+
|
137
|
+
# Validate input_size_per_partition
|
138
|
+
if input_size_per_partition % self.quant_config.pack_factor != 0:
|
139
|
+
raise ValueError(
|
140
|
+
f"Weight input_size_per_partition = "
|
141
|
+
f"{input_size_per_partition} is not divisible by "
|
142
|
+
f"pack_factor = {self.quant_config.pack_factor}."
|
143
|
+
)
|
144
|
+
if (
|
145
|
+
self.quant_config.group_size != -1
|
146
|
+
and input_size_per_partition % self.quant_config.group_size != 0
|
147
|
+
):
|
148
|
+
raise ValueError(
|
149
|
+
f"Weight input_size_per_partition = "
|
150
|
+
f"{input_size_per_partition} is not divisible by "
|
151
|
+
f"group_size = {self.quant_config.group_size}."
|
152
|
+
)
|
153
|
+
|
154
|
+
qweight = ModelWeightParameter(
|
155
|
+
data=torch.empty(
|
156
|
+
output_size_per_partition,
|
157
|
+
input_size_per_partition // self.quant_config.pack_factor,
|
158
|
+
dtype=torch.int8,
|
159
|
+
),
|
160
|
+
input_dim=1,
|
161
|
+
output_dim=0,
|
162
|
+
weight_loader=weight_loader,
|
163
|
+
)
|
164
|
+
layer.register_parameter("qweight", qweight)
|
165
|
+
|
166
|
+
s1_scales = ChannelQuantScaleParameter(
|
167
|
+
data=torch.empty(output_size_per_partition, dtype=torch.float16),
|
168
|
+
output_dim=0,
|
169
|
+
weight_loader=weight_loader,
|
170
|
+
)
|
171
|
+
layer.register_parameter("s1_scales", s1_scales)
|
172
|
+
|
173
|
+
if self.quant_config.group_size == -1:
|
174
|
+
s1_szeros = ChannelQuantScaleParameter(
|
175
|
+
data=torch.empty(output_size_per_partition, dtype=torch.float16),
|
176
|
+
output_dim=0,
|
177
|
+
weight_loader=weight_loader,
|
178
|
+
)
|
179
|
+
layer.register_parameter("s1_szeros", s1_szeros)
|
180
|
+
else:
|
181
|
+
s2_scales = GroupQuantScaleParameter(
|
182
|
+
data=torch.empty(
|
183
|
+
(
|
184
|
+
input_size_per_partition // self.quant_config.group_size,
|
185
|
+
output_size_per_partition,
|
186
|
+
),
|
187
|
+
dtype=torch.int8,
|
188
|
+
),
|
189
|
+
input_dim=0,
|
190
|
+
output_dim=1,
|
191
|
+
weight_loader=weight_loader,
|
192
|
+
)
|
193
|
+
layer.register_parameter("s2_scales", s2_scales)
|
194
|
+
|
195
|
+
s2_zeros = GroupQuantScaleParameter(
|
196
|
+
data=torch.empty(
|
197
|
+
(
|
198
|
+
input_size_per_partition // self.quant_config.group_size,
|
199
|
+
output_size_per_partition,
|
200
|
+
),
|
201
|
+
dtype=torch.int8,
|
202
|
+
),
|
203
|
+
input_dim=0,
|
204
|
+
output_dim=1,
|
205
|
+
weight_loader=weight_loader,
|
206
|
+
)
|
207
|
+
layer.register_parameter("s2_zeros", s2_zeros)
|
208
|
+
|
209
|
+
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
210
|
+
layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
|
211
|
+
layer.s1_scales = Parameter(layer.s1_scales.data, requires_grad=False)
|
212
|
+
if self.quant_config.group_size == -1:
|
213
|
+
layer.s1_szeros = Parameter(layer.s1_szeros.data, requires_grad=False)
|
214
|
+
else:
|
215
|
+
layer.s2_scales = Parameter(layer.s2_scales.data, requires_grad=False)
|
216
|
+
layer.s2_zeros = Parameter(layer.s2_zeros.data, requires_grad=False)
|
217
|
+
|
218
|
+
def apply(
|
219
|
+
self,
|
220
|
+
layer: torch.nn.Module,
|
221
|
+
x: torch.Tensor,
|
222
|
+
bias: Optional[torch.Tensor] = None,
|
223
|
+
):
|
224
|
+
assert x.dtype == torch.float16, "QoQ only supports float16 input now"
|
225
|
+
if self.quant_config.group_size == -1:
|
226
|
+
x_q, x_scale, x_sum = per_token_quant_int8(
|
227
|
+
x, scale_dtype=x.dtype, cal_sum=True
|
228
|
+
)
|
229
|
+
out = qserve_w4a8_per_chn_gemm(
|
230
|
+
x_q, layer.qweight, layer.s1_scales, x_scale, layer.s1_szeros, x_sum
|
231
|
+
)
|
232
|
+
else:
|
233
|
+
x_q, x_scale = per_token_quant_int8(x, scale_dtype=x.dtype)
|
234
|
+
out = qserve_w4a8_per_group_gemm(
|
235
|
+
x_q,
|
236
|
+
layer.qweight,
|
237
|
+
layer.s2_zeros,
|
238
|
+
layer.s2_scales,
|
239
|
+
layer.s1_scales,
|
240
|
+
x_scale,
|
241
|
+
)
|
242
|
+
if bias is not None:
|
243
|
+
out = out + bias
|
244
|
+
return out
|
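The new QoQ linear method stores two 4-bit weights per int8 byte, per-output-channel fp16 scales (plus scaled zeros when group_size == -1), and quantizes activations per token to int8 before calling the sgl_kernel GEMMs. The following pure-PyTorch reference of the per-channel path is a sketch only: the nibble order, the unsigned 4-bit range, the `scale * q - szeros` dequant convention, and all helper names are assumptions, and the real qserve_w4a8_per_chn_gemm kernel performs the rescaling in fused integer arithmetic using the per-token sums.

```python
import torch

def unpack_int4(qweight):
    # qweight: [out, in // 2] int8, assumed to hold two unsigned 4-bit values per byte
    lo = qweight & 0x0F
    hi = (qweight >> 4) & 0x0F
    return torch.stack((lo, hi), dim=-1).reshape(qweight.shape[0], -1).float()

def per_token_quant_int8_ref(x):
    # Dynamic, symmetric per-token int8 quantization (reference only).
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    return torch.clamp(torch.round(x / scale), -128, 127), scale

def qoq_per_chn_gemm_ref(x, qweight, s1_scales, s1_szeros):
    w = unpack_int4(qweight)                              # [out, in], values in [0, 15]
    w = w * s1_scales[:, None] - s1_szeros[:, None]       # per-channel dequant (assumed layout)
    x_q, x_scale = per_token_quant_int8_ref(x)
    return (x_q * x_scale) @ w.t()                        # [tokens, out]

x = torch.randn(4, 64)
qweight = torch.randint(-128, 128, (32, 32), dtype=torch.int8)   # out=32, in=64 packed
s1_scales = torch.rand(32) * 0.01
s1_szeros = torch.rand(32) * 0.05
print(qoq_per_chn_gemm_ref(x, qweight, s1_scales, s1_szeros).shape)  # torch.Size([4, 32])
```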
@@ -287,6 +287,7 @@ class W8A8FP8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -306,6 +307,7 @@ class W8A8FP8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -326,4 +328,5 @@ class W8A8FP8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
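In the W8A8FP8MoEMethod hunks, the surrounding context (a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale) shows the activation scales that accompany the FP8 weights. As a rough, hedged illustration of FP8 e4m3 activation quantization with an explicit scale: the per-token granularity and the helper name are assumptions for this sketch, it needs a PyTorch build with float8 dtypes, and the real path applies scales inside fused kernels.

```python
import torch

FP8_E4M3_MAX = 448.0  # largest finite value of torch.float8_e4m3fn

def per_token_quant_fp8_ref(x: torch.Tensor):
    # Hypothetical reference helper: per-token dynamic scale into the e4m3 range.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / FP8_E4M3_MAX
    x_q = (x / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)
    return x_q, scale  # keep the scale for the GEMM epilogue

x = torch.randn(4, 64)
x_q, x_scale = per_token_quant_fp8_ref(x)
print(x_q.dtype, x_scale.shape)  # torch.float8_e4m3fn torch.Size([4, 1])
```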
@@ -225,6 +225,7 @@ class W8A8Int8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -245,6 +246,7 @@ class W8A8Int8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -266,4 +268,5 @@ class W8A8Int8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
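The W8A8Int8MoEMethod changes mirror the FP8 ones, and both rest on the same dequant algebra: per-token activation scales and per-channel weight scales factor out of the integer GEMM, so the accumulator is rescaled once at the end. Below is a generic sketch of that rescaling; it is not sglang's kernel, the helper name is made up, and float accumulation stands in for the int32 accumulation the fused kernels use.

```python
import torch

def w8a8_int8_linear_ref(x_q, x_scale, w_q, w_scale):
    # x_q: [tokens, in] int8, x_scale: [tokens, 1] fp32 (per-token)
    # w_q: [out, in] int8,    w_scale: [out] fp32 (per-output-channel)
    acc = x_q.float() @ w_q.float().t()       # real kernels accumulate in int32
    return acc * x_scale * w_scale[None, :]   # rescale once in the epilogue

x_q = torch.randint(-128, 128, (4, 64), dtype=torch.int8)
x_scale = torch.rand(4, 1)
w_q = torch.randint(-128, 128, (32, 64), dtype=torch.int8)
w_scale = torch.rand(32) * 0.01
print(w8a8_int8_linear_ref(x_q, x_scale, w_q, w_scale).shape)  # torch.Size([4, 32])
```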