sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -16,7 +16,10 @@
|
|
16
16
|
# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
|
17
17
|
"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
|
18
18
|
|
19
|
-
|
19
|
+
import logging
|
20
|
+
from dataclasses import dataclass
|
21
|
+
from enum import Enum, auto
|
22
|
+
from typing import Any, Dict, Iterable, Optional, Tuple, Union
|
20
23
|
|
21
24
|
import torch
|
22
25
|
import torch.nn.functional as F
|
@@ -24,10 +27,25 @@ from torch import nn
|
|
24
27
|
from transformers import PretrainedConfig
|
25
28
|
|
26
29
|
from sglang.srt.distributed import (
|
30
|
+
get_pp_group,
|
27
31
|
get_tensor_model_parallel_world_size,
|
28
32
|
tensor_model_parallel_all_reduce,
|
29
33
|
)
|
30
34
|
from sglang.srt.layers.activation import SiluAndMul
|
35
|
+
from sglang.srt.layers.communicator import (
|
36
|
+
LayerCommunicator,
|
37
|
+
LayerScatterModes,
|
38
|
+
ScatterMode,
|
39
|
+
)
|
40
|
+
from sglang.srt.layers.dp_attention import (
|
41
|
+
attn_tp_all_gather,
|
42
|
+
attn_tp_reduce_scatter,
|
43
|
+
dp_gather_partial,
|
44
|
+
dp_scatter,
|
45
|
+
get_attention_tp_rank,
|
46
|
+
get_attention_tp_size,
|
47
|
+
get_local_attention_dp_size,
|
48
|
+
)
|
31
49
|
from sglang.srt.layers.layernorm import RMSNorm
|
32
50
|
from sglang.srt.layers.linear import (
|
33
51
|
MergedColumnParallelLinear,
|
@@ -35,23 +53,29 @@ from sglang.srt.layers.linear import (
|
|
35
53
|
ReplicatedLinear,
|
36
54
|
RowParallelLinear,
|
37
55
|
)
|
38
|
-
from sglang.srt.layers.logits_processor import LogitsProcessor
|
39
|
-
from sglang.srt.layers.moe.ep_moe.layer import EPMoE
|
56
|
+
from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
|
57
|
+
from sglang.srt.layers.moe.ep_moe.layer import EPMoE, get_moe_impl_class
|
40
58
|
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
|
41
59
|
from sglang.srt.layers.quantization.base_config import QuantizationConfig
|
42
60
|
from sglang.srt.layers.radix_attention import RadixAttention
|
43
61
|
from sglang.srt.layers.rotary_embedding import get_rope
|
62
|
+
from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
|
44
63
|
from sglang.srt.layers.vocab_parallel_embedding import (
|
45
64
|
ParallelLMHead,
|
46
65
|
VocabParallelEmbedding,
|
47
66
|
)
|
48
|
-
from sglang.srt.managers.expert_distribution import
|
67
|
+
from sglang.srt.managers.expert_distribution import (
|
68
|
+
ExpertDistributionRecorder,
|
69
|
+
get_global_expert_distribution_recorder,
|
70
|
+
)
|
71
|
+
from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
|
49
72
|
from sglang.srt.managers.schedule_batch import global_server_args_dict
|
50
|
-
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
73
|
+
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
51
74
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
75
|
+
from sglang.srt.two_batch_overlap import model_forward_maybe_tbo
|
52
76
|
from sglang.srt.utils import add_prefix, make_layers
|
53
77
|
|
54
|
-
|
78
|
+
logger = logging.getLogger(__name__)
|
55
79
|
|
56
80
|
|
57
81
|
class Qwen2MoeMLP(nn.Module):
|
@@ -82,8 +106,7 @@ class Qwen2MoeMLP(nn.Module):
|
|
82
106
|
)
|
83
107
|
if hidden_act != "silu":
|
84
108
|
raise ValueError(
|
85
|
-
f"Unsupported activation: {hidden_act}. "
|
86
|
-
"Only silu is supported for now."
|
109
|
+
f"Unsupported activation: {hidden_act}. Only silu is supported for now."
|
87
110
|
)
|
88
111
|
self.act_fn = SiluAndMul()
|
89
112
|
|
@@ -97,22 +120,22 @@ class Qwen2MoeMLP(nn.Module):
|
|
97
120
|
class Qwen2MoeSparseMoeBlock(nn.Module):
|
98
121
|
def __init__(
|
99
122
|
self,
|
123
|
+
layer_id: int,
|
100
124
|
config: PretrainedConfig,
|
101
125
|
quant_config: Optional[QuantizationConfig] = None,
|
102
126
|
prefix: str = "",
|
103
127
|
):
|
104
128
|
super().__init__()
|
105
129
|
self.tp_size = get_tensor_model_parallel_world_size()
|
106
|
-
|
130
|
+
self.layer_id = layer_id
|
107
131
|
if self.tp_size > config.num_experts:
|
108
132
|
raise ValueError(
|
109
133
|
f"Tensor parallel size {self.tp_size} is greater than "
|
110
134
|
f"the number of experts {config.num_experts}."
|
111
135
|
)
|
112
136
|
|
113
|
-
|
114
|
-
|
115
|
-
self.experts = MoEImpl(
|
137
|
+
self.experts = get_moe_impl_class()(
|
138
|
+
layer_id=self.layer_id,
|
116
139
|
num_experts=config.num_experts,
|
117
140
|
top_k=config.num_experts_per_tok,
|
118
141
|
hidden_size=config.hidden_size,
|
@@ -142,7 +165,9 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
|
|
142
165
|
self.shared_expert = None
|
143
166
|
self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
|
144
167
|
|
145
|
-
def forward(
|
168
|
+
def forward(
|
169
|
+
self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
|
170
|
+
) -> torch.Tensor:
|
146
171
|
num_tokens, hidden_dim = hidden_states.shape
|
147
172
|
hidden_states = hidden_states.view(-1, hidden_dim)
|
148
173
|
shared_output = None
|
@@ -160,7 +185,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
|
|
160
185
|
)
|
161
186
|
if shared_output is not None:
|
162
187
|
final_hidden_states = final_hidden_states + shared_output
|
163
|
-
if self.tp_size > 1:
|
164
188
|
final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
|
165
189
|
|
166
190
|
return final_hidden_states.view(num_tokens, hidden_dim)
|
@@ -182,20 +206,23 @@ class Qwen2MoeAttention(nn.Module):
|
|
182
206
|
) -> None:
|
183
207
|
super().__init__()
|
184
208
|
self.hidden_size = hidden_size
|
185
|
-
|
209
|
+
|
210
|
+
attn_tp_rank = get_attention_tp_rank()
|
211
|
+
attn_tp_size = get_attention_tp_size()
|
212
|
+
|
186
213
|
self.total_num_heads = num_heads
|
187
|
-
assert self.total_num_heads %
|
188
|
-
self.num_heads = self.total_num_heads //
|
214
|
+
assert self.total_num_heads % attn_tp_size == 0
|
215
|
+
self.num_heads = self.total_num_heads // attn_tp_size
|
189
216
|
self.total_num_kv_heads = num_kv_heads
|
190
|
-
if self.total_num_kv_heads >=
|
217
|
+
if self.total_num_kv_heads >= attn_tp_size:
|
191
218
|
# Number of KV heads is greater than TP size, so we partition
|
192
219
|
# the KV heads across multiple tensor parallel GPUs.
|
193
|
-
assert self.total_num_kv_heads %
|
220
|
+
assert self.total_num_kv_heads % attn_tp_size == 0
|
194
221
|
else:
|
195
222
|
# Number of KV heads is less than TP size, so we replicate
|
196
223
|
# the KV heads across multiple tensor parallel GPUs.
|
197
|
-
assert
|
198
|
-
self.num_kv_heads = max(1, self.total_num_kv_heads //
|
224
|
+
assert attn_tp_size % self.total_num_kv_heads == 0
|
225
|
+
self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
|
199
226
|
self.head_dim = hidden_size // self.total_num_heads
|
200
227
|
self.q_size = self.num_heads * self.head_dim
|
201
228
|
self.kv_size = self.num_kv_heads * self.head_dim
|
@@ -210,6 +237,8 @@ class Qwen2MoeAttention(nn.Module):
|
|
210
237
|
self.total_num_kv_heads,
|
211
238
|
bias=qkv_bias,
|
212
239
|
quant_config=quant_config,
|
240
|
+
tp_rank=attn_tp_rank,
|
241
|
+
tp_size=attn_tp_size,
|
213
242
|
prefix=add_prefix("qkv_proj", prefix),
|
214
243
|
)
|
215
244
|
|
@@ -218,6 +247,9 @@ class Qwen2MoeAttention(nn.Module):
|
|
218
247
|
hidden_size,
|
219
248
|
bias=False,
|
220
249
|
quant_config=quant_config,
|
250
|
+
tp_rank=attn_tp_rank,
|
251
|
+
tp_size=attn_tp_size,
|
252
|
+
reduce_results=False,
|
221
253
|
prefix=add_prefix("o_proj", prefix),
|
222
254
|
)
|
223
255
|
|
@@ -261,6 +293,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
|
|
261
293
|
prefix: str = "",
|
262
294
|
) -> None:
|
263
295
|
super().__init__()
|
296
|
+
self.config = config
|
264
297
|
self.hidden_size = config.hidden_size
|
265
298
|
rope_theta = getattr(config, "rope_theta", 10000)
|
266
299
|
rope_scaling = getattr(config, "rope_scaling", None)
|
@@ -279,15 +312,26 @@ class Qwen2MoeDecoderLayer(nn.Module):
|
|
279
312
|
prefix=add_prefix("self_attn", prefix),
|
280
313
|
)
|
281
314
|
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
315
|
+
self.layer_id = layer_id
|
316
|
+
|
317
|
+
self.attn_tp_size = get_attention_tp_size()
|
318
|
+
self.attn_tp_rank = get_attention_tp_rank()
|
319
|
+
self.local_dp_size = get_local_attention_dp_size()
|
320
|
+
|
321
|
+
# Qwen2MoE all layers are sparse and have no nextn now
|
322
|
+
self.is_layer_sparse = True
|
323
|
+
is_previous_layer_sparse = True
|
324
|
+
|
325
|
+
self.layer_scatter_modes = LayerScatterModes.init_new(
|
326
|
+
layer_id=layer_id,
|
327
|
+
num_layers=config.num_hidden_layers,
|
328
|
+
is_layer_sparse=self.is_layer_sparse,
|
329
|
+
is_previous_layer_sparse=is_previous_layer_sparse,
|
286
330
|
)
|
287
|
-
|
288
|
-
|
289
|
-
):
|
331
|
+
|
332
|
+
if self.is_layer_sparse:
|
290
333
|
self.mlp = Qwen2MoeSparseMoeBlock(
|
334
|
+
layer_id=layer_id,
|
291
335
|
config=config,
|
292
336
|
quant_config=quant_config,
|
293
337
|
prefix=add_prefix("mlp", prefix),
|
@@ -304,6 +348,11 @@ class Qwen2MoeDecoderLayer(nn.Module):
|
|
304
348
|
self.post_attention_layernorm = RMSNorm(
|
305
349
|
config.hidden_size, eps=config.rms_norm_eps
|
306
350
|
)
|
351
|
+
self.layer_communicator = LayerCommunicator(
|
352
|
+
layer_scatter_modes=self.layer_scatter_modes,
|
353
|
+
input_layernorm=self.input_layernorm,
|
354
|
+
post_attention_layernorm=self.post_attention_layernorm,
|
355
|
+
)
|
307
356
|
|
308
357
|
def forward(
|
309
358
|
self,
|
@@ -311,22 +360,29 @@ class Qwen2MoeDecoderLayer(nn.Module):
|
|
311
360
|
hidden_states: torch.Tensor,
|
312
361
|
forward_batch: ForwardBatch,
|
313
362
|
residual: Optional[torch.Tensor],
|
314
|
-
) -> torch.Tensor:
|
315
|
-
|
316
|
-
|
317
|
-
residual
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
363
|
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
364
|
+
|
365
|
+
hidden_states, residual = self.layer_communicator.prepare_attn(
|
366
|
+
hidden_states, residual, forward_batch
|
367
|
+
)
|
368
|
+
|
369
|
+
if hidden_states.shape[0] != 0:
|
370
|
+
hidden_states = self.self_attn(
|
371
|
+
positions=positions,
|
372
|
+
hidden_states=hidden_states,
|
373
|
+
forward_batch=forward_batch,
|
374
|
+
)
|
375
|
+
|
376
|
+
hidden_states, residual = self.layer_communicator.prepare_mlp(
|
377
|
+
hidden_states, residual, forward_batch
|
378
|
+
)
|
379
|
+
|
380
|
+
hidden_states = self.mlp(hidden_states, forward_batch)
|
381
|
+
|
382
|
+
hidden_states, residual = self.layer_communicator.postprocess_layer(
|
383
|
+
hidden_states, residual, forward_batch
|
325
384
|
)
|
326
385
|
|
327
|
-
# Fully Connected
|
328
|
-
hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
|
329
|
-
hidden_states = self.mlp(hidden_states)
|
330
386
|
return hidden_states, residual
|
331
387
|
|
332
388
|
|
@@ -341,15 +397,21 @@ class Qwen2MoeModel(nn.Module):
|
|
341
397
|
super().__init__()
|
342
398
|
self.padding_idx = config.pad_token_id
|
343
399
|
self.vocab_size = config.vocab_size
|
400
|
+
self.pp_group = get_pp_group()
|
401
|
+
|
402
|
+
if self.pp_group.is_first_rank:
|
403
|
+
self.embed_tokens = VocabParallelEmbedding(
|
404
|
+
config.vocab_size,
|
405
|
+
config.hidden_size,
|
406
|
+
enable_tp=not global_server_args_dict["enable_dp_attention"],
|
407
|
+
prefix=add_prefix("embed_tokens", prefix),
|
408
|
+
)
|
409
|
+
else:
|
410
|
+
self.embed_tokens = PPMissingLayer()
|
344
411
|
|
345
|
-
self.embed_tokens = VocabParallelEmbedding(
|
346
|
-
config.vocab_size,
|
347
|
-
config.hidden_size,
|
348
|
-
prefix=add_prefix("embed_tokens", prefix),
|
349
|
-
)
|
350
412
|
# Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
|
351
413
|
decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
|
352
|
-
self.layers = make_layers(
|
414
|
+
self.layers, self.start_layer, self.end_layer = make_layers(
|
353
415
|
config.num_hidden_layers,
|
354
416
|
lambda idx, prefix: decoder_layer_type(
|
355
417
|
layer_id=idx,
|
@@ -357,9 +419,14 @@ class Qwen2MoeModel(nn.Module):
|
|
357
419
|
quant_config=quant_config,
|
358
420
|
prefix=prefix,
|
359
421
|
),
|
422
|
+
pp_rank=self.pp_group.rank_in_group,
|
423
|
+
pp_size=self.pp_group.world_size,
|
360
424
|
prefix=add_prefix("layers", prefix),
|
361
425
|
)
|
362
|
-
self.
|
426
|
+
if self.pp_group.is_last_rank:
|
427
|
+
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
428
|
+
else:
|
429
|
+
self.norm = PPMissingLayer(return_tuple=True)
|
363
430
|
|
364
431
|
def forward(
|
365
432
|
self,
|
@@ -367,24 +434,53 @@ class Qwen2MoeModel(nn.Module):
|
|
367
434
|
positions: torch.Tensor,
|
368
435
|
forward_batch: ForwardBatch,
|
369
436
|
input_embeds: torch.Tensor = None,
|
370
|
-
|
371
|
-
|
372
|
-
|
437
|
+
pp_proxy_tensors: Optional[PPProxyTensors] = None,
|
438
|
+
) -> Union[torch.Tensor, PPProxyTensors]:
|
439
|
+
if self.pp_group.is_first_rank:
|
440
|
+
if input_embeds is None:
|
441
|
+
hidden_states = self.embed_tokens(input_ids)
|
442
|
+
else:
|
443
|
+
hidden_states = input_embeds
|
444
|
+
residual = None
|
373
445
|
else:
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
hidden_states, residual =
|
380
|
-
|
446
|
+
assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        if forward_batch.can_run_tbo:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers,
+                enable_tbo=True,
+                input_data_scatter_mode=ScatterMode.model_input_output(),
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        else:
+            for i in range(self.start_layer, self.end_layer):
+                with get_global_expert_distribution_recorder().with_current_layer(i):
+                    layer = self.layers[i]
+                    hidden_states, residual = layer(
+                        positions, hidden_states, forward_batch, residual
+                    )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
             )
-
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states


 class Qwen2MoeForCausalLM(nn.Module):
-
     fall_back_to_pt_during_load = False

     def __init__(
@@ -394,6 +490,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.pp_group = get_pp_group()
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen2MoeModel(
@@ -404,6 +501,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
         )
         self.logits_processor = LogitsProcessor(config)

@@ -414,11 +512,29 @@ class Qwen2MoeForCausalLM(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(
-
-
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
         )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
@@ -441,6 +557,16 @@ class Qwen2MoeForCausalLM(nn.Module):

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
             if "rotary_emb.inv_freq" in name:
                 continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -489,11 +615,22 @@ class Qwen2MoeForCausalLM(nn.Module):
                     if name not in params_dict:
                         continue

-
-
-
-
-
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )


 EntryClass = Qwen2MoeForCausalLM
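Note on the qwen2_moe.py changes above: with pipeline parallelism, only the first rank embeds tokens and only the last rank runs the logits processor; intermediate ranks hand `hidden_states`/`residual` to the next stage through a `PPProxyTensors` dict. The snippet below is a minimal, self-contained sketch of that hand-off pattern, not sglang's implementation; `ToyStage`, its constructor arguments, and the two-stage wiring are hypothetical stand-ins.

```python
import torch


class PPProxyTensors(dict):
    """Stand-in for sglang's PPProxyTensors: a named bundle of tensors
    handed from one pipeline stage to the next."""


class ToyStage:
    """Runs a contiguous slice [start_layer, end_layer) of a shared layer list."""

    def __init__(self, layers, start_layer, end_layer, is_first, is_last, hidden_size):
        self.layers = layers
        self.start_layer = start_layer
        self.end_layer = end_layer
        self.is_first = is_first
        self.is_last = is_last
        self.norm = torch.nn.LayerNorm(hidden_size)

    def forward(self, input_embeds=None, pp_proxy_tensors=None):
        if self.is_first:
            hidden_states, residual = input_embeds, None
        else:
            # Mirrors the diff: downstream ranks unpack the proxy dict.
            hidden_states = pp_proxy_tensors["hidden_states"]
            residual = pp_proxy_tensors["residual"]
        for i in range(self.start_layer, self.end_layer):
            hidden_states = self.layers[i](hidden_states)
        if not self.is_last:
            # Non-last ranks ship activations onward instead of computing logits.
            return PPProxyTensors({"hidden_states": hidden_states, "residual": residual})
        return self.norm(hidden_states)


layers = torch.nn.ModuleList(torch.nn.Linear(16, 16) for _ in range(4))
stage0 = ToyStage(layers, 0, 2, is_first=True, is_last=False, hidden_size=16)
stage1 = ToyStage(layers, 2, 4, is_first=False, is_last=True, hidden_size=16)

proxy = stage0.forward(input_embeds=torch.randn(3, 16))
out = stage1.forward(pp_proxy_tensors=proxy)
print(type(proxy).__name__, out.shape)
```

Running this in a single process simply chains the two stages; in a real deployment the dict's tensors would be communicated to the next pipeline rank between the two calls.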
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -490,10 +490,10 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
             self.visual.dtype
         )
-
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
         assert pixel_values.dim() == 2, pixel_values.dim()
-        assert
-        image_embeds = self.visual(pixel_values, grid_thw=
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
         return image_embeds

     def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
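The qwen2_vl.py change batches each item's `image_grid_thw` before calling the visual encoder and asserts the expected ranks. Below is a rough sketch of the shapes involved, assuming the usual Qwen2-VL layout of one `(t, h, w)` row per image; the dict-shaped `items` and the 1176-wide patch dimension are illustrative only, not sglang's data structures.

```python
import torch

# Hypothetical per-item inputs: flattened patches plus a (num_images, 3) grid of
# (t, h, w) factors, which is the layout the assertions in the diff expect.
items = [
    {"pixel_values": torch.randn(4, 1176), "image_grid_thw": torch.tensor([[1, 2, 2]])},
    {"pixel_values": torch.randn(8, 1176), "image_grid_thw": torch.tensor([[1, 2, 4]])},
]

pixel_values = torch.cat([item["pixel_values"] for item in items], dim=0)
image_grid_thw = torch.concat([item["image_grid_thw"] for item in items], dim=0)

assert pixel_values.dim() == 2, pixel_values.dim()      # (total_patches, patch_dim)
assert image_grid_thw.dim() == 2, image_grid_thw.dim()  # (num_images, 3)
print(pixel_values.shape, image_grid_thw.shape)
```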
sglang/srt/models/qwen3.py
CHANGED
@@ -1,5 +1,6 @@
 # Adapted from qwen2.py

+import logging
 from functools import partial
 from typing import Any, Dict, Iterable, Optional, Tuple

@@ -7,6 +8,7 @@ import torch
 from torch import nn

 from sglang.srt.distributed import (
+    get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     split_tensor_along_last_dim,
@@ -19,8 +21,9 @@ from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -28,6 +31,8 @@ from sglang.srt.utils import add_prefix

 Qwen3Config = None

+logger = logging.getLogger(__name__)
+

 class Qwen3Attention(nn.Module):
     def __init__(
@@ -238,20 +243,42 @@ class Qwen3ForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.pp_group = get_pp_group()
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen3Model(
             config, quant_config=quant_config, prefix=add_prefix("model", prefix)
         )
-
-
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
         else:
-
-
-
-
-
-
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            else:
+                emb_token_weight = self.pp_group.recv(
+                    size=(config.vocab_size, config.hidden_size),
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
         self.logits_processor = LogitsProcessor(config)
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

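In the constructor hunk above, tied embeddings are propagated across pipeline ranks: the first rank sends `embed_tokens.weight` and the last rank copies it into `lm_head`. The sketch below shows the same send/recv tying pattern with plain `torch.distributed` (gloo backend, two CPU processes); it is not sglang's `pp_group` API, and the vocab/hidden sizes, port, and two-rank layout are made up for illustration.

```python
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

VOCAB, HIDDEN = 32, 8


def run(rank: int, world_size: int) -> None:
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29511"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    if rank == 0:
        # First PP rank owns the input embedding; ship its weight to the last rank.
        embed_weight = torch.randn(VOCAB, HIDDEN)
        dist.send(embed_weight, dst=world_size - 1)
    elif rank == world_size - 1:
        # Last PP rank owns the (tied) lm_head; receive the weight and copy it in.
        lm_head_weight = torch.empty(VOCAB, HIDDEN)
        dist.recv(lm_head_weight, src=0)
        print("lm_head tied, norm =", lm_head_weight.norm().item())

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
```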
@@ -266,14 +293,33 @@ class Qwen3ForCausalLM(nn.Module):
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
         get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(
-
-
-
-
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids, hidden_states, self.lm_head, forward_batch
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
         else:
-            return
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
@@ -287,6 +333,19 @@ class Qwen3ForCausalLM(nn.Module):

         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            if "Embedding" in self.config.name_or_path:
+                name = add_prefix(name, "model")
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
@@ -294,7 +353,15 @@ class Qwen3ForCausalLM(nn.Module):
                 # the checkpoint. Skip them.
                 continue
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
-
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
             if name.startswith("model.vision_tower") and name not in params_dict:
                 continue

@@ -313,9 +380,15 @@ class Qwen3ForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-
-
-
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")

     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight