sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff shows the differences between publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED

@@ -28,8 +28,9 @@ else:
 import logging

 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

-if
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight

@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,

@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,

@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -179,12 +183,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )

-        if
+        if _use_aiter:
             assert not no_combine, "unsupported"
             if apply_router_weight_on_input:
                 assert (

@@ -220,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     def forward_cpu(

@@ -232,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,

@@ -245,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )

@@ -289,6 +297,7 @@ class FusedMoE(torch.nn.Module):
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,

@@ -307,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )

@@ -321,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias

@@ -546,7 +557,8 @@ class FusedMoE(torch.nn.Module):
             loaded_weight = loaded_weight.to(param.data.device)

             if (
-
+                "compressed" in self.quant_method.__class__.__name__.lower()
+                and param.data[expert_id] != 1
                 and (param.data[expert_id] - loaded_weight).abs() > 1e-5
             ):
                 raise ValueError(

@@ -570,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return

         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:

@@ -651,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
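For orientation, the `_use_aiter` gate introduced above follows a common pattern: an environment-variable flag combined with a platform check decides whether an optional backend is imported at module load time. Below is a minimal, self-contained sketch of that pattern; the helper bodies are stand-ins written for this example, not sglang's actual implementations.

```python
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Stand-in for the helper sglang exposes in sglang.srt.utils.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


def is_hip() -> bool:
    # Rough ROCm detection for the sketch; assumes torch is installed.
    try:
        import torch
        return torch.version.hip is not None
    except ImportError:
        return False


_is_hip = is_hip()
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

if _use_aiter:
    # Only reached on ROCm with SGLANG_USE_AITER enabled, mirroring the hunk above.
    from aiter.ops.shuffle import shuffle_weight  # noqa: F401
```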
sglang/srt/layers/moe/topk.py
CHANGED
@@ -18,6 +18,7 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F

+from sglang.srt.managers import expert_location_dispatch
 from sglang.srt.managers.expert_distribution import (
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,

@@ -65,6 +66,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"

@@ -87,9 +89,27 @@
     )
     del token_expert_indicies

+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids

@@ -102,7 +122,7 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -127,10 +147,10 @@
     ) # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,

@@ -140,7 +160,7 @@
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum

@@ -159,7 +179,7 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -191,10 +211,10 @@
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)

-    if
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts +
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,

@@ -204,7 +224,7 @@
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum

@@ -238,7 +258,7 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,

@@ -246,7 +266,7 @@
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group

@@ -259,7 +279,7 @@
             num_expert_group,
             topk_group,
             topk,
-
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
         # TODO merge into kernel for this branch

@@ -287,7 +307,7 @@
             renormalize,
             num_expert_group,
             topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -302,6 +322,7 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,

@@ -309,7 +330,14 @@
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None

@@ -322,7 +350,7 @@
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -336,7 +364,7 @@
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,

@@ -353,15 +381,13 @@
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
-        assert (
-            num_token_non_padded is None
-        ), "num_token_non_padded is not yet supported in fused_topk"
         # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
     else:
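The `num_fused_shared_experts` plumbing culminates in the routing change visible in `grouped_topk` and `biased_grouped_topk_impl` above: when shared experts are fused into the expert list, the last top-k slot is redirected to a random shared-expert id, and the renormalization sum skips that slot. Below is a standalone PyTorch sketch of just that step, using toy sizes and plain tensors; it mirrors the diff's logic but is not the sglang kernel path.

```python
import torch


def topk_with_fused_shared_experts(
    scores: torch.Tensor,  # [num_tokens, num_experts] routing scores
    topk: int,
    num_fused_shared_experts: int,
    renormalize: bool = True,
):
    num_experts = scores.shape[1]
    topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1, sorted=False)

    if num_fused_shared_experts:
        # Point the last slot at one of the shared experts appended after the
        # routed experts, as the diff does with torch.randint.
        topk_ids[:, -1] = torch.randint(
            low=num_experts,
            high=num_experts + num_fused_shared_experts,
            size=(topk_ids.size(0),),
            dtype=topk_ids.dtype,
            device=topk_ids.device,
        )

    if renormalize:
        # Exclude the shared-expert slot from the normalizer when fusion is on.
        topk_weights_sum = (
            topk_weights.sum(dim=-1, keepdim=True)
            if num_fused_shared_experts == 0
            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
        )
        topk_weights = topk_weights / topk_weights_sum
    return topk_weights, topk_ids


if __name__ == "__main__":
    scores = torch.rand(4, 8).softmax(dim=-1)
    w, ids = topk_with_fused_shared_experts(scores, topk=3, num_fused_shared_experts=1)
    print(ids)  # last column holds id 8 when one shared expert is fused
```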
sglang/srt/layers/multimodal.py
CHANGED
@@ -32,8 +32,8 @@ def hash_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements

-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
     hash_val = mixed * PRIME
     hash_val = hash_val ^ (hash_val >> 16)
     hash_val = hash_val * (PRIME ^ XCONST)

@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n, BLOCK_SIZE),)

-    intermediate_hashes = torch.empty(n, dtype=torch.
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)

     hash_kernel[grid](
         tensor,
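The `hash_kernel` change widens the loaded data and offsets to `tl.int64` and allocates the intermediate hash buffer as `torch.int64`, presumably so the multiply-and-xor mixing no longer wraps at 32 bits. A small PyTorch illustration of the wraparound being avoided; the constant below is made up for the example and is not the kernel's actual PRIME.

```python
import torch

PRIME = 1_000_003  # illustrative constant only

vals = torch.tensor([123_456, 789_012], dtype=torch.int32)

wrapped = vals * PRIME                   # int32 math: the product overflows and wraps
widened = vals.to(torch.int64) * PRIME   # int64 math: the full product survives

print(wrapped)  # wrapped 32-bit values
print(widened)  # exact products, analogous to what the updated kernel keeps
```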
sglang/srt/layers/quantization/__init__.py
CHANGED

@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )

     return QUANTIZATION_METHODS[quantization]

@@ -289,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -315,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
sglang/srt/layers/quantization/blockwise_int8.py
CHANGED

@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",

@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,

@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
         )


@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             correction_bias=correction_bias,
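All of the hunks from `monkey_patch_moe_apply` down to `CompressedTensorsWNA16MoEMethod` apply the same mechanical change: add a defaulted `num_fused_shared_experts: int = 0` parameter to each quantized MoE `apply` signature and forward it to the inner expert-selection call. A compact sketch of that pattern, using hypothetical class and helper names rather than sglang's real ones:

```python
from typing import Callable, Optional

import torch


def select_experts_stub(
    router_logits: torch.Tensor,
    top_k: int,
    num_fused_shared_experts: int = 0,
):
    # Stand-in for the real expert selection: when non-zero, the fused shared
    # experts would be appended to every token's routing choice downstream.
    topk = torch.topk(router_logits, k=top_k, dim=-1)
    return topk.indices, topk.values, num_fused_shared_experts


class QuantMoEMethodSketch:
    def apply(
        self,
        router_logits: torch.Tensor,
        top_k: int,
        num_fused_shared_experts: int = 0,  # new defaulted kwarg; old callers unaffected
        custom_routing_function: Optional[Callable] = None,
    ):
        return select_experts_stub(
            router_logits,
            top_k,
            num_fused_shared_experts=num_fused_shared_experts,  # threaded through
        )
```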
@@ -17,10 +17,10 @@ _ENABLE_JIT_DEEPGEMM = False
 try:
     import deep_gemm
     from deep_gemm import get_num_sms
+    from deep_gemm.jit import build
     from deep_gemm.jit.compiler import get_nvcc_compiler
     from deep_gemm.jit_kernels.gemm import get_best_configs
     from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
-    from deep_gemm.jit_kernels.tuner import jit_tuner

     sm_version = get_device_sm()
     if sm_version == 90:
@@ -148,32 +148,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
     block_k = 128
     num_tma_threads = 128
     num_math_threads_per_group = 128
+
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedMasked,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedMasked,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +183,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
     num_tma_threads = 128
     num_math_threads_per_group = 128
     kwargs = {
+        "GEMM_TYPE": GemmType.GroupedContiguous,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
+        "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_GROUPS": num_groups,
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-            "GEMM_TYPE": GemmType.GroupedContiguous,
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 def _compile_gemm_nt_f8f8bf16_one(
@@ -228,28 +219,23 @@ def _compile_gemm_nt_f8f8bf16_one(
         "GEMM_TYPE": GemmType.Normal,
         "NUM_TMA_THREADS": num_tma_threads,
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+        "N": n,
+        "K": k,
         "NUM_GROUPS": 1,
+        "BLOCK_M": block_m,
+        "BLOCK_N": block_n,
         "BLOCK_K": block_k,
+        "SWIZZLE_D_MODE": smem_config[1],
+        "BLOCK_N_PADDING": smem_config[2],
+        "NUM_STAGES": num_stages,
+        "NUM_TMA_MULTICAST": tma_multicast_config[0],
+        "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
         "NUM_SMS": num_sms,
         "SMEM_SIZE": smem_config[0],
     }
-
-
-
-            "N": n,
-            "K": k,
-            "BLOCK_M": block_m,
-            "BLOCK_N": block_n,
-            "SWIZZLE_D_MODE": smem_config[1],
-            "BLOCK_N_PADDING": smem_config[2],
-            "NUM_STAGES": num_stages,
-            "NUM_TMA_MULTICAST": tma_multicast_config[0],
-            "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
-        },
-        space=(),
-        kwargs=kwargs,
-        runtime_cls=FP8GemmRuntime,
-    )
+
+    code = FP8GemmRuntime.generate(kwargs)
+    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)


 _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
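The three `_compile_*` hunks above share one refactor: the per-kernel keys that used to be split between `keys=` and `kwargs=` for `jit_tuner.compile_and_tune` are now folded into a single dict, rendered with `FP8GemmRuntime.generate`, and compiled via `deep_gemm.jit.build`. A condensed sketch of that shared shape (argument order taken from the hunks themselves; `_precompile` is a hypothetical helper name):

```python
# Requires deep_gemm; these two imports are exactly the ones added in the
# `@@ -17,10 +17,10 @@` hunk above.
from deep_gemm.jit import build
from deep_gemm.jit_kernels.runtime import FP8GemmRuntime


def _precompile(name: str, kwargs: dict) -> None:
    # Render the CUDA source for this configuration, then JIT-compile and cache it.
    code = FP8GemmRuntime.generate(kwargs)
    _ = build(name, code, FP8GemmRuntime, kwargs)
```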
@@ -391,3 +377,16 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
     RuntimeCache.get = __patched_func
     yield
     RuntimeCache.get = origin_func
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)