sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py
CHANGED
@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch import nn
 
-from sglang.srt.distributed import
+from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
-        self.tp_sync_group =
+        self.tp_sync_group = get_tp_group().device_group
 
         if global_server_args_dict["enable_dp_attention"]:
             self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):
 
         # Apply the custom logit processors if registered in the sampling info.
         if sampling_info.has_custom_logit_processor:
-
+            apply_custom_logit_processor(logits, sampling_info)
 
         if self.use_nan_detection and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
             probs = logits
             del logits
 
-            if
-                if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # Check Nan will throw exception, only check when crash_on_warnings is True
-                check_nan = self.use_nan_detection and crash_on_warnings()
-                batch_next_token_ids = top_k_top_p_sampling_from_probs(
+            if True:  # Keep this redundant check to simplify some internal code sync
+                if global_server_args_dict["sampling_backend"] == "flashinfer":
+                    if sampling_info.need_min_p_sampling:
+                        probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                        probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                        batch_next_token_ids = min_p_sampling_from_probs(
+                            probs, sampling_info.min_ps
+                        )
+                    else:
+                        batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                            probs,
+                            sampling_info.top_ks,
+                            sampling_info.top_ps,
+                            filter_apply_order="joint",
+                            check_nan=self.use_nan_detection,
+                        )
+                elif global_server_args_dict["sampling_backend"] == "pytorch":
+                    # A slower fallback implementation with torch native operations.
+                    batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                         probs,
                         sampling_info.top_ks,
                         sampling_info.top_ps,
-
-
+                        sampling_info.min_ps,
+                        sampling_info.need_min_p_sampling,
+                    )
+                else:
+                    raise ValueError(
+                        f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                     )
 
-
-                #
-
-                    probs,
-                    sampling_info.top_ks,
-                    sampling_info.top_ps,
-                    sampling_info.min_ps,
-                    sampling_info.need_min_p_sampling,
-                )
-
-                if return_logprob:
-                    # clamp to avoid -inf
-                    logprobs = torch.log(
-                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                    ).clamp(min=torch.finfo(probs.dtype).min)
-            else:
-                raise ValueError(
-                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-                )
+                if return_logprob:
+                    # clamp to avoid -inf
+                    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
 
         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
@@ -165,39 +150,6 @@ class Sampler(nn.Module):
 
         return batch_next_token_ids
 
-    def _apply_custom_logit_processor(
-        self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
-    ):
-        """Apply custom logit processors to the logits.
-        This function will modify the logits in-place."""
-
-        assert logits.shape[0] == len(sampling_batch_info), (
-            f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
-            f"sampling_batch_info ({len(sampling_batch_info)})"
-        )
-
-        for _, (
-            processor,
-            batch_mask,
-        ) in sampling_batch_info.custom_logit_processor.items():
-            # Get the batch indices that need to be processed
-            batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
-            assert batch_mask.shape[0] == len(sampling_batch_info), (
-                f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
-                f"sampling_batch_info ({len(sampling_batch_info)})"
-            )
-
-            # Apply the processor to the logits
-            logits[batch_mask] = processor(
-                logits[batch_mask],
-                [sampling_batch_info.custom_params[i] for i in batch_indices],
-            )
-
-            logger.debug(
-                f"Custom logit processor {processor.__class__.__name__} is applied."
-            )
-
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids
 
 
+def sampling_from_probs_torch(probs: torch.Tensor):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering."""
+    sampled_index = torch.multinomial(probs, num_samples=1)
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
 def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
         output_token_ids_logprobs_idx.append([])
 
     return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
sglang/srt/layers/utils.py
CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
+
+
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
sglang/srt/lora/layers.py
CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
        self.A_buffer_gate_up = A_buffer
        if self.lora_backend.fuse_stacked_lora_b:
            # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            if
+            if getattr(self, "B_buffer_gate_up", None) is None:
                self.B_buffer_gate_up = torch.empty(
                    (
                        B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
        output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
 
        # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-        if
+        if getattr(self, "B_buffer_qkv", None) is None:
            self.B_buffer_qkv = torch.empty(
                (
                    B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
            )
 
        # Offsets of q/k/v in output dimension
-        if
-            self.output_offset = torch.
-
+        if getattr(self, "output_offset", None) is None:
+            self.output_offset = torch.tensor(
+                [
+                    0,
+                    output_dim_q,
+                    output_dim_q + output_dim_kv,
+                    output_dim_q + 2 * output_dim_kv,
+                ],
+                dtype=torch.int32,
+                device=B_buffer_q.device,
            )
-        self.output_offset[:4] = torch.tensor(
-            [
-                0,
-                output_dim_q,
-                output_dim_q + output_dim_kv,
-                output_dim_q + 2 * output_dim_kv,
-            ],
-            dtype=torch.int32,
-            device=B_buffer_q.device,
-        )
        # For computing number of launched blocks
        self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
    else:
sglang/srt/lora/lora.py
CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
         for i in range(self.base_hf_config.num_hidden_layers):
             layer = self.layers[i]
             weight_names = [name for name, _ in layer.weights.items()]
-            self.
-            self.
-
-    def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+            self.normalize_qkv_proj(weight_names, layer.weights)
+            self.normalize_gate_up_proj(weight_names, layer.weights)
 
+    def normalize_qkv_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
         # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
         target_module = set()
         for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
                 target_module.add("q_proj")
             if "v_proj" in weight_name:
                 target_module.add("v_proj")
+            if "qkv_proj" in weight_name:
+                target_module.add("qkv_proj")
         if len(target_module) == 0:
             return
 
@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
                 if "k_proj" in target_module:
                     weights.pop(k_name)
                     weights.pop(v_name)
+            elif "qkv_proj" in weight_name:
+                # If qkv_proj is already stacked, we normalize it following the SGL convention.
+                qkv_name = weight_name
+                q_name = weight_name.replace("qkv_proj", "q_proj")
+                k_name = weight_name.replace("qkv_proj", "k_proj")
+                v_name = weight_name.replace("qkv_proj", "v_proj")
+                kv_name = weight_name.replace("qkv_proj", "kv_proj")
+                if "lora_A" in weight_name:
+                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+                else:
+                    head_size = (
+                        self.base_hf_config.hidden_size
+                        // self.base_hf_config.num_attention_heads
+                    )
+                    weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+                        weights[qkv_name],
+                        [
+                            head_size * self.base_hf_config.num_attention_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                        ],
+                        dim=0,
+                    )
+                    weights[kv_name] = torch.stack(
+                        [k_proj_weight, v_proj_weight],
+                        dim=0,
+                    )
 
-    def
+    def normalize_gate_up_proj(
         self, weight_names: List[str], weights: Dict[str, torch.Tensor]
     ):
         for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
                     weights.pop(weight_name)
                 if up_name in weights:
                     weights.pop(up_name)
+            elif "gate_up_proj" in weight_name:
+                # If gate_up_proj is already stacked, we normalize it following the SGL convention
+                gate_up_name = weight_name
+                if "lora_A" in weight_name:
+                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+                else:
+                    output_dim = weights[gate_up_name].shape[0] // 2
+                    weights[gate_up_name] = torch.stack(
+                        [
+                            weights[gate_up_name][:output_dim, :],
+                            weights[gate_up_name][output_dim:, :],
+                        ],
+                        dim=0,
+                    )
sglang/srt/lora/lora_manager.py
CHANGED
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
|
|
32
32
|
LoRAType,
|
33
33
|
get_customized_names_from_hf_names,
|
34
34
|
get_layer_id,
|
35
|
-
|
35
|
+
get_normalized_lora_weight_names,
|
36
36
|
get_weight_name,
|
37
37
|
)
|
38
38
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
|
@@ -81,7 +81,7 @@ class LoRAManager:
|
|
81
81
|
seg_indptr=torch.zeros(
|
82
82
|
self.max_bs_in_cuda_graph + 1, dtype=torch.int32
|
83
83
|
),
|
84
|
-
max_len=
|
84
|
+
max_len=1,
|
85
85
|
weight_indices=torch.zeros(
|
86
86
|
self.max_bs_in_cuda_graph, dtype=torch.int32
|
87
87
|
),
|
@@ -89,6 +89,17 @@ class LoRAManager:
|
|
89
89
|
scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
|
90
90
|
)
|
91
91
|
|
92
|
+
# Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
|
93
|
+
# across batches.
|
94
|
+
self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph].fill_(1)
|
95
|
+
torch.cumsum(
|
96
|
+
self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph],
|
97
|
+
dim=0,
|
98
|
+
out=self.cuda_graph_batch_info.seg_indptr[
|
99
|
+
1 : self.max_bs_in_cuda_graph + 1
|
100
|
+
],
|
101
|
+
)
|
102
|
+
|
92
103
|
def init_loras(self):
|
93
104
|
# Config of each LoRA adapter
|
94
105
|
self.configs: Dict[str, LoRAConfig] = {}
|
@@ -101,10 +112,13 @@ class LoRAManager:
|
|
101
112
|
self.hf_target_names.update(self.configs[name].target_modules)
|
102
113
|
|
103
114
|
# Target lora weight names for lora_a and lora_b modules respectively.
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
115
|
+
weights_A: List[str] = []
|
116
|
+
weights_B: List[str] = []
|
117
|
+
for module in self.hf_target_names:
|
118
|
+
lora_A, lora_B = get_normalized_lora_weight_names(module)
|
119
|
+
weights_A += lora_A
|
120
|
+
weights_B += lora_B
|
121
|
+
self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)
|
108
122
|
|
109
123
|
# load all weights to cpu
|
110
124
|
self.loras: Dict[str, LoRAAdapter] = {}
|
@@ -156,6 +170,45 @@ class LoRAManager:
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size

+        def transfer_adapter_info(
+            weight_indices_out: torch.Tensor,
+            lora_ranks_out: torch.Tensor,
+            scalings_out: torch.Tensor,
+        ):
+            """
+            Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
+            to device (CUDA) asynchronously.
+            """
+            weight_indices = [0] * len(forward_batch.lora_paths)
+            lora_ranks = [0] * self.max_loras_per_batch
+            scalings = [0] * self.max_loras_per_batch
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+
+            # Use pinned memory to avoid synchronizations during host-to-device transfer
+            weight_indices_tensor = torch.tensor(
+                weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            lora_ranks_tensor = torch.tensor(
+                lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            scalings_tensor = torch.tensor(
+                scalings, dtype=torch.float, pin_memory=True, device="cpu"
+            )
+
+            # Copy to device tensors asynchronously
+            weight_indices_out[:bs].copy_(weight_indices_tensor, non_blocking=True)
+            lora_ranks_out[: self.max_loras_per_batch].copy_(
+                lora_ranks_tensor, non_blocking=True
+            )
+            scalings_out[: self.max_loras_per_batch].copy_(
+                scalings_tensor, non_blocking=True
+            )
+
         if (
             hasattr(self, "max_bs_in_cuda_graph")
             and bs <= self.max_bs_in_cuda_graph
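Note: the pinned-memory pattern in transfer_adapter_info is what makes copy_(..., non_blocking=True) worthwhile; a host-to-device copy from pageable memory generally cannot overlap with other GPU work. A minimal sketch of the idea on its own, independent of the surrounding class (function and buffer names are assumed):

import torch

def async_h2d(values, out: torch.Tensor) -> None:
    # Stage the host data in pinned (page-locked) memory so the H2D copy can
    # proceed without forcing a host synchronization.
    staging = torch.tensor(values, dtype=out.dtype, pin_memory=True, device="cpu")
    out[: len(values)].copy_(staging, non_blocking=True)

if torch.cuda.is_available():
    weight_indices = torch.empty(8, dtype=torch.int32, device="cuda")
    async_h2d([0, 1, 1, 0], weight_indices)
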
@@ -163,51 +216,46 @@ class LoRAManager:
         ):
             # Do in-place updates when CUDA graph is enabled and the batch forward mode
             # could use CUDA graph.
-
-
-
-            self.cuda_graph_batch_info.
-
-                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+
+            transfer_adapter_info(
+                self.cuda_graph_batch_info.weight_indices,
+                self.cuda_graph_batch_info.lora_ranks,
+                self.cuda_graph_batch_info.scalings,
             )
-            self.cuda_graph_batch_info.max_len = 1

-
-
-                    self.memory_pool.get_buffer_id(lora_path)
-                )
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    self.cuda_graph_batch_info.lora_ranks[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.config.hf_config["r"]
-                    self.cuda_graph_batch_info.scalings[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.scaling
+            self.cuda_graph_batch_info.bs = bs
+            self.cuda_graph_batch_info.max_len = 1
             batch_info = self.cuda_graph_batch_info
         else:
+            weight_indices = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            lora_ranks = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+            )
+            scalings = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+            )
+            transfer_adapter_info(
+                weight_indices,
+                lora_ranks,
+                scalings,
+            )
+
             seg_lens = (
                 forward_batch.extend_seq_lens
                 if forward_batch.forward_mode.is_extend()
                 else torch.ones(bs, device=self.device)
            )
+
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+
             seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
             seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-            max_len = int(torch.max(seg_lens))
-            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)

-            lora_ranks = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-            )
-            scalings = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-            )
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-                    scalings[weight_indices[i]] = lora.scaling
             batch_info = LoRABatchInfo(
                 bs=bs,
                 seg_lens=seg_lens,
@@ -263,7 +311,18 @@ class LoRAManager:
         self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
             i: [] for i in range(self.base_hf_config.num_hidden_layers)
         }
+
         for module_name, module in self.base_model.named_modules():
+            # TODO (lifuhuang): in the future, we should consider generalizing the
+            # should_apply_lora function to support mapping by full module name instead
+            # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+            # attention stacks (e.g., multimodal models).
+            # See: https://github.com/sgl-project/sglang/issues/6608
+            if getattr(
+                self.base_model, "should_apply_lora", None
+            ) and not self.base_model.should_apply_lora(module_name):
+                continue
+
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
                 layer_id = get_layer_id(module_name)
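Note: the should_apply_lora check above treats the method as an optional hook resolved via getattr; models that define it can opt individual modules out of LoRA injection, and models that do not are unaffected. A small self-contained sketch of that pattern (the classes and module names here are illustrative):

class BaseModelWithFilter:
    # Hypothetical model that excludes its vision tower from LoRA injection.
    def should_apply_lora(self, module_name: str) -> bool:
        return not module_name.startswith("vision_tower.")

class BaseModelWithoutFilter:
    pass

def lora_target_modules(model, module_names):
    targets = []
    for name in module_names:
        # Optional hook: only consulted when the model defines it.
        if getattr(model, "should_apply_lora", None) and not model.should_apply_lora(name):
            continue
        targets.append(name)
    return targets

names = ["model.layers.0.self_attn.qkv_proj", "vision_tower.blocks.0.attn.qkv"]
print(lora_target_modules(BaseModelWithFilter(), names))     # only the LLM module
print(lora_target_modules(BaseModelWithoutFilter(), names))  # both modules
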
sglang/srt/lora/mem_pool.py
CHANGED
@@ -91,18 +91,16 @@ class LoRAMemoryPool:

     def init_buffers(
         self,
-        lora_weight_names: Set[
+        lora_weight_names: Tuple[Set[str]],
         base_model: torch.nn.Module,
     ):

         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
-        self.lora_weight_names: Set[
+        self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
         device = next(base_model.parameters()).device
-        lora_module_A_names = set([name[0] for name in lora_weight_names])
-        lora_module_B_names = set([name[1] for name in lora_weight_names])
         # Init A tensor, column_major=False
-        for module_A in
+        for module_A in lora_weight_names[0]:
             lora_A_shape = self.get_lora_A_shape(module_A, base_model)
             self.A_buffer[module_A] = [
                 torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
                     dtype=self.dtype,
                     device=device,
                 )
-                for
+                for _ in range(self.num_layer)
             ]
         # Init B tensor, column_major=True
-        for module_B in
+        for module_B in lora_weight_names[1]:
             lora_B_shape = self.get_lora_B_shape(module_B, base_model)
             self.B_buffer[module_B] = [
                 torch.empty(
@@ -134,12 +132,13 @@ class LoRAMemoryPool:
            for buffer_id in range(self.max_loras_per_batch):
                # Prioritize empty slots
                if self.buffer_id_to_uid[buffer_id] == "":
-                    return buffer_id
+                    return buffer_id

            for buffer_id in range(self.max_loras_per_batch):
                # Evict unneeded lora
                if self.buffer_id_to_uid[buffer_id] not in cur_uids:
-
+                    self.uid_to_buffer_id.pop(self.buffer_id_to_uid[buffer_id])
+                    return buffer_id

            raise ValueError(
                "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
@@ -147,9 +146,7 @@ class LoRAMemoryPool:

        for uid in cur_uids:
            if uid not in self.uid_to_buffer_id:
-                buffer_id
-                if evicted_lora_uid != "":
-                    self.uid_to_buffer_id.pop(evicted_lora_uid)
+                buffer_id = get_available_buffer_slot()
                 self.load_lora_weight_to_buffer(
                     uid, buffer_id, lora_adapters.get(uid, None)
                 )
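Note: get_available_buffer_slot implements a two-pass policy: prefer a never-used slot, otherwise evict the first loaded adapter the current batch does not need, and fail if every slot is still in use. A minimal standalone sketch of the same policy outside the memory pool class (dictionary names are assumed):

from typing import Dict, Set

def pick_slot(
    buffer_id_to_uid: Dict[int, str],
    uid_to_buffer_id: Dict[str, int],
    cur_uids: Set[str],
) -> int:
    # Pass 1: prefer a slot that has never been filled.
    for buffer_id, uid in buffer_id_to_uid.items():
        if uid == "":
            return buffer_id
    # Pass 2: evict the first adapter the current batch does not need.
    for buffer_id, uid in buffer_id_to_uid.items():
        if uid not in cur_uids:
            uid_to_buffer_id.pop(uid)
            return buffer_id
    raise ValueError("No available buffer slots; all loaded adapters are still in use.")

slots = {0: "adapter-a", 1: "adapter-b", 2: ""}
uids = {"adapter-a": 0, "adapter-b": 1}
print(pick_slot(slots, uids, cur_uids={"adapter-a"}))  # 2 (empty slot wins)
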
@@ -159,6 +156,10 @@ class LoRAMemoryPool:
     def load_lora_weight_to_buffer(
         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
     ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."

         if uid is None:
             for i in range(self.num_layer):
@@ -210,21 +211,27 @@ class LoRAMemoryPool:

             for name, weights in temp_A_buffer.items():
                 c = get_stacked_multiply(name)
-                self.A_buffer[name][layer_id][buffer_id][
-
-
+                buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                    : lora_rank * c, :
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)

             for name, weights in temp_B_buffer.items():
                 c = get_stacked_multiply(name)
                 if c > 1:
                     for stacked_id in range(c):
-                        self.B_buffer[name][layer_id][stacked_id][
-
-                        ]
+                        buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                            buffer_id
+                        ][:, :lora_rank]
+                        check_lora_weight_shape(buffer_view, weights[stacked_id])
+                        buffer_view.copy_(weights[stacked_id])
                 else:
-                    self.B_buffer[name][layer_id][0][buffer_id][
-
-
+                    buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                        :, :lora_rank
+                    ]
+                    check_lora_weight_shape(buffer_view, weights)
+                    buffer_view.copy_(weights)

     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
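Note: the copy path above always materializes a sliced buffer_view first, asserts that its shape matches the incoming weight, and only then writes in place with copy_. A minimal sketch of the same guard outside the class (buffer sizes and rank are assumed):

import torch

def copy_into_buffer(buffer: torch.Tensor, weight: torch.Tensor, lora_rank: int) -> None:
    # Slice the preallocated (max_rank, hidden) buffer down to the adapter's rank,
    # check shapes before writing, then copy in place.
    buffer_view = buffer[:lora_rank, :]
    assert (
        buffer_view.shape == weight.shape
    ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
    buffer_view.copy_(weight)

max_rank, hidden = 64, 128
buffer = torch.zeros(max_rank, hidden)
copy_into_buffer(buffer, torch.randn(16, hidden), lora_rank=16)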