sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -8,9 +8,10 @@ import torch
 import torch.nn as nn

 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, is_hip

 _is_cuda = is_cuda()
+_is_hip = is_hip()

 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -609,6 +610,10 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
         )

+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
         pos_freqs = self.base ** (
             torch.arange(0, self.rotary_dim, 2, dtype=torch.float, device=self.device)
@@ -650,17 +655,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cache = torch.cat((cos, sin), dim=-1)
         return cache

-    def forward_hip(self, *args, **kwargs):
-        return self.forward_native(*args, **kwargs)
-
-    def forward(self, *args, **kwargs):
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
-
     def forward_native(
         self,
         positions: torch.Tensor,
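Note on the change above: instead of overriding forward()/forward_hip(), the constructor now points the instance's _forward_method at forward_native when running on ROCm. The snippet below is a minimal standalone sketch of that attribute-based dispatch pattern; it does not reproduce sglang's CustomOp, and the class and method bodies are illustrative placeholders only.

import torch


class DispatchingOp:
    """Illustrative only: pick an implementation once, at construction time."""

    def __init__(self, use_native: bool):
        self._forward_method = self.forward_cuda
        if use_native:
            # Re-dispatch: route every later call through the native path.
            self._forward_method = self.forward_native

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self._forward_method(x)

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2  # placeholder math

    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
        return x * 2  # placeholder math


op = DispatchingOp(use_native=True)
print(op.forward(torch.ones(2)))  # tensor([2., 2.])
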
sglang/srt/layers/sampler.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch import nn

-from sglang.srt.distributed import
+from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
-        self.tp_sync_group =
+        self.tp_sync_group = get_tp_group().device_group

         if global_server_args_dict["enable_dp_attention"]:
             self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):

         # Apply the custom logit processors if registered in the sampling info.
         if sampling_info.has_custom_logit_processor:
-
+            apply_custom_logit_processor(logits, sampling_info)

         if self.use_nan_detection and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
         probs = logits
         del logits

-        if
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # Check Nan will throw exception, only check when crash_on_warnings is True
-            check_nan = self.use_nan_detection and crash_on_warnings()
-            batch_next_token_ids = top_k_top_p_sampling_from_probs(
+        if True:  # Keep this redundant check to simplify some internal code sync
+            if global_server_args_dict["sampling_backend"] == "flashinfer":
+                if sampling_info.need_min_p_sampling:
+                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                    batch_next_token_ids = min_p_sampling_from_probs(
+                        probs, sampling_info.min_ps
+                    )
+                else:
+                    batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                        probs,
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        filter_apply_order="joint",
+                        check_nan=self.use_nan_detection,
+                    )
+            elif global_server_args_dict["sampling_backend"] == "pytorch":
+                # A slower fallback implementation with torch native operations.
+                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                     probs,
                     sampling_info.top_ks,
                     sampling_info.top_ps,
-
-
+                    sampling_info.min_ps,
+                    sampling_info.need_min_p_sampling,
+                )
+            else:
+                raise ValueError(
+                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                 )

-
-            #
-
-                probs,
-                sampling_info.top_ks,
-                sampling_info.top_ps,
-                sampling_info.min_ps,
-                sampling_info.need_min_p_sampling,
-            )
-
-            if return_logprob:
-                # clamp to avoid -inf
-                logprobs = torch.log(
-                    top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                ).clamp(min=torch.finfo(probs.dtype).min)
-        else:
-            raise ValueError(
-                f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-            )
+        if return_logprob:
+            # clamp to avoid -inf
+            logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)

         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
@@ -165,39 +150,6 @@ class Sampler(nn.Module):

         return batch_next_token_ids

-    def _apply_custom_logit_processor(
-        self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
-    ):
-        """Apply custom logit processors to the logits.
-        This function will modify the logits in-place."""
-
-        assert logits.shape[0] == len(sampling_batch_info), (
-            f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
-            f"sampling_batch_info ({len(sampling_batch_info)})"
-        )
-
-        for _, (
-            processor,
-            batch_mask,
-        ) in sampling_batch_info.custom_logit_processor.items():
-            # Get the batch indices that need to be processed
-            batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
-            assert batch_mask.shape[0] == len(sampling_batch_info), (
-                f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
-                f"sampling_batch_info ({len(sampling_batch_info)})"
-            )
-
-            # Apply the processor to the logits
-            logits[batch_mask] = processor(
-                logits[batch_mask],
-                [sampling_batch_info.custom_params[i] for i in batch_indices],
-            )
-
-            logger.debug(
-                f"Custom logit processor {processor.__class__.__name__} is applied."
-            )
-

 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids


+def sampling_from_probs_torch(probs: torch.Tensor):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering."""
+    sampled_index = torch.multinomial(probs, num_samples=1)
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
 def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
         output_token_ids_logprobs_idx.append([])

     return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
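The new module-level apply_custom_logit_processor takes a num_tokens_in_batch argument so that, under speculative decoding, a per-request mask can be expanded to cover several draft tokens per request. The snippet below is a standalone sketch of only that mask-expansion step; it does not build a real SamplingBatchInfo, and the toy tensors and "processor" are made up for illustration.

import torch

batch_size, num_tokens_in_batch, vocab = 3, 2, 8
# Per-request flag: only request 1 has a custom logit processor attached.
batch_mask = torch.tensor([False, True, False])
# Logits are laid out as (batch_size * num_tokens_in_batch, vocab).
logits = torch.randn(batch_size * num_tokens_in_batch, vocab)

# Same expansion the diff adds: repeat each request's flag once per draft token.
expanded_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
assert expanded_mask.shape[0] == logits.shape[0]

# Toy "processor": ban token 0 for the selected rows
# (the real processors also receive per-request custom_params).
logits[expanded_mask, 0] = float("-inf")
print(expanded_mask)  # tensor([False, False,  True,  True, False, False])
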
sglang/srt/layers/utils.py
CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
+
+
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
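is_sm100_supported gates code paths on compute capability 10.x hardware plus a CUDA 12.8+ build. A hedged usage sketch follows; pick_gemm_backend and the returned strings are placeholders for illustration, not sglang APIs.

import torch


def is_sm100_supported(device=None) -> bool:
    # Mirrors the helper added above: SM 10.x device and a CUDA >= 12.8 build.
    return (torch.cuda.get_device_capability(device)[0] == 10) and (
        torch.version.cuda >= "12.8"
    )


def pick_gemm_backend() -> str:
    # Placeholder selection logic only: fall back when CUDA or SM100 is unavailable.
    if torch.cuda.is_available() and is_sm100_supported():
        return "sm100-optimized kernel"
    return "generic kernel"


print(pick_gemm_backend())
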
sglang/srt/lora/layers.py
CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         self.A_buffer_gate_up = A_buffer
         if self.lora_backend.fuse_stacked_lora_b:
             # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            if
+            if getattr(self, "B_buffer_gate_up", None) is None:
                 self.B_buffer_gate_up = torch.empty(
                     (
                         B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
            output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]

            # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-           if
+           if getattr(self, "B_buffer_qkv", None) is None:
                self.B_buffer_qkv = torch.empty(
                    (
                        B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
                )

            # Offsets of q/k/v in output dimension
-           if
-               self.output_offset = torch.
-
+           if getattr(self, "output_offset", None) is None:
+               self.output_offset = torch.tensor(
+                   [
+                       0,
+                       output_dim_q,
+                       output_dim_q + output_dim_kv,
+                       output_dim_q + 2 * output_dim_kv,
+                   ],
+                   dtype=torch.int32,
+                   device=B_buffer_q.device,
                )
-           self.output_offset[:4] = torch.tensor(
-               [
-                   0,
-                   output_dim_q,
-                   output_dim_q + output_dim_kv,
-                   output_dim_q + 2 * output_dim_kv,
-               ],
-               dtype=torch.int32,
-               device=B_buffer_q.device,
-           )
            # For computing number of launched blocks
            self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
        else:
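The QKV LoRA change above builds output_offset directly as a four-element int32 tensor of row boundaries [0, q, q + kv, q + 2*kv] instead of writing into a preallocated buffer. A small worked example with made-up head dimensions (the numbers are illustrative, not tied to any particular model):

import torch

# Hypothetical shapes: 32 query heads, 8 KV heads, head_dim 128.
output_dim_q = 32 * 128   # 4096
output_dim_kv = 8 * 128   # 1024

output_offset = torch.tensor(
    [
        0,
        output_dim_q,                      # end of the q rows
        output_dim_q + output_dim_kv,      # end of the k rows
        output_dim_q + 2 * output_dim_kv,  # end of the v rows
    ],
    dtype=torch.int32,
)
print(output_offset)  # tensor([   0, 4096, 5120, 6144], dtype=torch.int32)
max_qkv_out_dim = max(output_dim_q, output_dim_kv)  # 4096, used for block sizing
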
sglang/srt/lora/lora.py
CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
         for i in range(self.base_hf_config.num_hidden_layers):
             layer = self.layers[i]
             weight_names = [name for name, _ in layer.weights.items()]
-            self.
-            self.
-
-    def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+            self.normalize_qkv_proj(weight_names, layer.weights)
+            self.normalize_gate_up_proj(weight_names, layer.weights)

+    def normalize_qkv_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
         # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
         target_module = set()
         for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
                 target_module.add("q_proj")
             if "v_proj" in weight_name:
                 target_module.add("v_proj")
+            if "qkv_proj" in weight_name:
+                target_module.add("qkv_proj")
         if len(target_module) == 0:
             return

@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
                 if "k_proj" in target_module:
                     weights.pop(k_name)
                     weights.pop(v_name)
+            elif "qkv_proj" in weight_name:
+                # If qkv_proj is already stacked, we normalize it following the SGL convention.
+                qkv_name = weight_name
+                q_name = weight_name.replace("qkv_proj", "q_proj")
+                k_name = weight_name.replace("qkv_proj", "k_proj")
+                v_name = weight_name.replace("qkv_proj", "v_proj")
+                kv_name = weight_name.replace("qkv_proj", "kv_proj")
+                if "lora_A" in weight_name:
+                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+                else:
+                    head_size = (
+                        self.base_hf_config.hidden_size
+                        // self.base_hf_config.num_attention_heads
+                    )
+                    weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+                        weights[qkv_name],
+                        [
+                            head_size * self.base_hf_config.num_attention_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                        ],
+                        dim=0,
+                    )
+                    weights[kv_name] = torch.stack(
+                        [k_proj_weight, v_proj_weight],
+                        dim=0,
+                    )

-    def
+    def normalize_gate_up_proj(
         self, weight_names: List[str], weights: Dict[str, torch.Tensor]
     ):
         for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
                     weights.pop(weight_name)
                 if up_name in weights:
                     weights.pop(up_name)
+            elif "gate_up_proj" in weight_name:
+                # If gate_up_proj is already stacked, we normalize it following the SGL convention
+                gate_up_name = weight_name
+                if "lora_A" in weight_name:
+                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+                else:
+                    output_dim = weights[gate_up_name].shape[0] // 2
+                    weights[gate_up_name] = torch.stack(
+                        [
+                            weights[gate_up_name][:output_dim, :],
+                            weights[gate_up_name][output_dim:, :],
+                        ],
+                        dim=0,
+                    )
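normalize_qkv_proj splits an already-stacked qkv_proj LoRA-B weight into a q part and a stacked (k, v) pair, while a stacked lora_A weight is simply tiled three times. The sketch below replays that split on random tensors with made-up dimensions (4 attention heads, 2 KV heads, hidden size 32, rank 4); it is a shape check only, not sglang code.

import torch

hidden_size, num_heads, num_kv_heads, rank = 32, 4, 2, 4
head_size = hidden_size // num_heads  # 8

# lora_A for qkv_proj: tiled so q/k/v each get a copy, as in the diff.
lora_A = torch.randn(rank, hidden_size)
lora_A_qkv = lora_A.repeat(3, 1)  # (3 * rank, hidden_size)

# lora_B for qkv_proj: rows are laid out as [q | k | v].
rows = head_size * (num_heads + 2 * num_kv_heads)  # 64
lora_B_qkv = torch.randn(rows, rank)
q_w, k_w, v_w = torch.split(
    lora_B_qkv,
    [
        head_size * num_heads,     # 32 rows for q
        head_size * num_kv_heads,  # 16 rows for k
        head_size * num_kv_heads,  # 16 rows for v
    ],
    dim=0,
)
kv_w = torch.stack([k_w, v_w], dim=0)  # (2, 16, rank)
print(lora_A_qkv.shape, q_w.shape, kv_w.shape)
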
sglang/srt/lora/lora_manager.py
CHANGED
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
|
|
32
32
|
LoRAType,
|
33
33
|
get_customized_names_from_hf_names,
|
34
34
|
get_layer_id,
|
35
|
-
|
35
|
+
get_normalized_lora_weight_names,
|
36
36
|
get_weight_name,
|
37
37
|
)
|
38
38
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
@@ -101,10 +101,13 @@ class LoRAManager:
|
|
101
101
|
self.hf_target_names.update(self.configs[name].target_modules)
|
102
102
|
|
103
103
|
# Target lora weight names for lora_a and lora_b modules respectively.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
104
|
+
weights_A: List[str] = []
|
105
|
+
weights_B: List[str] = []
|
106
|
+
for module in self.hf_target_names:
|
107
|
+
lora_A, lora_B = get_normalized_lora_weight_names(module)
|
108
|
+
weights_A += lora_A
|
109
|
+
weights_B += lora_B
|
110
|
+
self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)
|
108
111
|
|
109
112
|
# load all weights to cpu
|
110
113
|
self.loras: Dict[str, LoRAAdapter] = {}
|
@@ -170,9 +173,7 @@ class LoRAManager:
|
|
170
173
|
dim=0,
|
171
174
|
out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
|
172
175
|
)
|
173
|
-
self.cuda_graph_batch_info.max_len =
|
174
|
-
torch.max(self.cuda_graph_batch_info.seg_lens[:bs])
|
175
|
-
)
|
176
|
+
self.cuda_graph_batch_info.max_len = 1
|
176
177
|
|
177
178
|
for i, lora_path in enumerate(forward_batch.lora_paths):
|
178
179
|
self.cuda_graph_batch_info.weight_indices[i] = (
|
@@ -265,7 +266,18 @@ class LoRAManager:
         self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
             i: [] for i in range(self.base_hf_config.num_hidden_layers)
        }
+
         for module_name, module in self.base_model.named_modules():
+            # TODO (lifuhuang): in the future, we should consider generalizing the
+            # should_apply_lora function to support mapping by full module name instead
+            # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+            # attention stacks (e.g., multimodal models).
+            # See: https://github.com/sgl-project/sglang/issues/6608
+            if getattr(
+                self.base_model, "should_apply_lora", None
+            ) and not self.base_model.should_apply_lora(module_name):
+                continue
+
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
                 layer_id = get_layer_id(module_name)
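For context, a hypothetical base model illustrating the new opt-out path: the method name should_apply_lora is the hook the loop above probes with getattr; everything else in this sketch is made up and not part of the package.

import torch

class ToyVLModel(torch.nn.Module):
    def should_apply_lora(self, module_name: str) -> bool:
        # Only apply LoRA to the language-model stack, skip the vision tower.
        return not module_name.startswith("vision_tower")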
sglang/srt/lora/mem_pool.py
CHANGED
@@ -91,18 +91,16 @@ class LoRAMemoryPool:
 
     def init_buffers(
         self,
-        lora_weight_names: Set[
+        lora_weight_names: Tuple[Set[str]],
         base_model: torch.nn.Module,
     ):
 
        # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
        # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
-        self.lora_weight_names: Set[
+        self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
         device = next(base_model.parameters()).device
-        lora_module_A_names = set([name[0] for name in lora_weight_names])
-        lora_module_B_names = set([name[1] for name in lora_weight_names])
         # Init A tensor, column_major=False
-        for module_A in
+        for module_A in lora_weight_names[0]:
             lora_A_shape = self.get_lora_A_shape(module_A, base_model)
             self.A_buffer[module_A] = [
                 torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
                     dtype=self.dtype,
                     device=device,
                 )
-                for
+                for _ in range(self.num_layer)
             ]
         # Init B tensor, column_major=True
-        for module_B in
+        for module_B in lora_weight_names[1]:
             lora_B_shape = self.get_lora_B_shape(module_B, base_model)
             self.B_buffer[module_B] = [
                 torch.empty(
@@ -159,6 +157,10 @@ class LoRAMemoryPool:
     def load_lora_weight_to_buffer(
         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
     ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
 
         if uid is None:
             for i in range(self.num_layer):
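The new check_lora_weight_shape helper turns silent slicing mistakes into an explicit error. A standalone illustration of the assertion it performs, with made-up tensor sizes:

import torch

buffer_view = torch.empty(8, 64)  # slice sized for a rank-8 adapter
weight = torch.randn(4, 64)       # adapter actually has rank 4

try:
    assert (
        buffer_view.shape == weight.shape
    ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
except AssertionError as e:
    print(e)
    # LoRA buffer shape torch.Size([8, 64]) does not match weight shape torch.Size([4, 64]).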
@@ -210,21 +212,27 @@ class LoRAMemoryPool:
 
         for name, weights in temp_A_buffer.items():
             c = get_stacked_multiply(name)
-            self.A_buffer[name][layer_id][buffer_id][
-
-
+            buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                : lora_rank * c, :
+            ]
+            check_lora_weight_shape(buffer_view, weights)
+            buffer_view.copy_(weights)
 
         for name, weights in temp_B_buffer.items():
             c = get_stacked_multiply(name)
             if c > 1:
                 for stacked_id in range(c):
-                    self.B_buffer[name][layer_id][stacked_id][
-
-                    ]
+                    buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                        buffer_id
+                    ][:, :lora_rank]
+                    check_lora_weight_shape(buffer_view, weights[stacked_id])
+                    buffer_view.copy_(weights[stacked_id])
             else:
-                self.B_buffer[name][layer_id][0][buffer_id][
-
-
+                buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                    :, :lora_rank
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
sglang/srt/lora/utils.py
CHANGED
@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -106,18 +106,22 @@ def get_hidden_dim(
         raise NotImplementedError()
 
 
-def
+def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to
+    Mapping a target module name to names of the normized LoRA weights.
+    Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
-        "q_proj": ("qkv_proj", "q_proj"),
-        "k_proj": ("qkv_proj", "kv_proj"),
-        "v_proj": ("qkv_proj", "kv_proj"),
-        "gate_proj": ("gate_up_proj", "gate_up_proj"),
-        "up_proj": ("gate_up_proj", "gate_up_proj"),
+        "q_proj": (["qkv_proj"], ["q_proj"]),
+        "k_proj": (["qkv_proj"], ["kv_proj"]),
+        "v_proj": (["qkv_proj"], ["kv_proj"]),
+        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
+        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
     }
-
+    stacked = params_mapping.get(name, ([name], [name]))
+    return stacked
 
 
 def get_stacked_multiply(module_name: str) -> int:
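Expected behavior of the new helper for a few representative names, assuming sglang >= 0.4.7; names missing from params_mapping fall back to ([name], [name]):

from sglang.srt.lora.utils import get_normalized_lora_weight_names

print(get_normalized_lora_weight_names("q_proj"))    # (['qkv_proj'], ['q_proj'])
print(get_normalized_lora_weight_names("qkv_proj"))  # (['qkv_proj'], ['q_proj', 'kv_proj'])
print(get_normalized_lora_weight_names("o_proj"))    # (['o_proj'], ['o_proj'])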
@@ -133,7 +137,7 @@ def get_stacked_multiply(module_name: str) -> int:
 
 
 def get_weight_name(
-    target_name: str, lora_weight_names: Set[
+    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
 ) -> Optional[str]:
     """
     target_name is name of a given module,
@@ -142,9 +146,9 @@ def get_weight_name(
     Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for
-    if
-    return
+    for weight_name in lora_weight_names[idx]:
+        if weight_name in target_name:
+            return weight_name
     raise ValueError(
         f"Cannot find weight name for {target_name} in {lora_weight_names}"
     )
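A usage sketch for the updated get_weight_name signature, fed with the two-set tuple that LoRAManager now builds; it assumes sglang >= 0.4.7 and an illustrative module path:

from sglang.srt.lora.utils import LoRAType, get_weight_name

lora_weight_names = (
    {"qkv_proj", "gate_up_proj"},           # normalized LoRA-A names
    {"q_proj", "kv_proj", "gate_up_proj"},  # normalized LoRA-B names
)
target = "model.layers.0.self_attn.qkv_proj"
print(get_weight_name(target, lora_weight_names, LoRAType.LORA_A))  # qkv_proj
print(get_weight_name(target, lora_weight_names, LoRAType.LORA_B))  # kv_proj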
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -248,12 +248,20 @@ class DataParallelController:
 
     def round_robin_scheduler(self, req: Req):
         if self.server_args.disaggregation_mode == "null":
-
-
-            self.workers
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
         else:
-
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
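A behavior sketch (not package code) of the routing change: an explicit data_parallel_rank on the request short-circuits both the round-robin path and the bootstrap_room hash, which only apply when the rank is absent.

from typing import Optional, Tuple

def pick_worker(
    data_parallel_rank: Optional[int], round_robin_counter: int, num_workers: int
) -> Tuple[int, int]:
    """Return (target worker index, next round-robin counter)."""
    if data_parallel_rank is not None:
        return data_parallel_rank, round_robin_counter  # direct routing
    return round_robin_counter, (round_robin_counter + 1) % num_workers

print(pick_worker(2, 0, 4))     # (2, 0): direct routing, counter untouched
print(pick_worker(None, 0, 4))  # (0, 1): round-robin advances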