sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,6 @@
|
|
13
13
|
# ==============================================================================
|
14
14
|
"""ModelRunner runs the forward passes of the models."""
|
15
15
|
|
16
|
-
import collections
|
17
16
|
import datetime
|
18
17
|
import gc
|
19
18
|
import inspect
|
@@ -36,8 +35,10 @@ from sglang.srt.distributed import (
|
|
36
35
|
init_distributed_environment,
|
37
36
|
initialize_model_parallel,
|
38
37
|
set_custom_all_reduce,
|
38
|
+
set_mscclpp_all_reduce,
|
39
39
|
)
|
40
40
|
from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
|
41
|
+
from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
|
41
42
|
from sglang.srt.layers.dp_attention import (
|
42
43
|
get_attention_tp_group,
|
43
44
|
get_attention_tp_size,
|
@@ -51,8 +52,24 @@ from sglang.srt.layers.quantization.deep_gemm import (
|
|
51
52
|
)
|
52
53
|
from sglang.srt.layers.sampler import Sampler
|
53
54
|
from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
|
55
|
+
from sglang.srt.layers.utils import is_sm100_supported
|
54
56
|
from sglang.srt.lora.lora_manager import LoRAManager
|
55
|
-
from sglang.srt.managers.
|
57
|
+
from sglang.srt.managers.eplb_manager import EPLBManager
|
58
|
+
from sglang.srt.managers.expert_distribution import (
|
59
|
+
ExpertDistributionRecorder,
|
60
|
+
get_global_expert_distribution_recorder,
|
61
|
+
set_global_expert_distribution_recorder,
|
62
|
+
)
|
63
|
+
from sglang.srt.managers.expert_location import (
|
64
|
+
ExpertLocationMetadata,
|
65
|
+
compute_initial_expert_location_metadata,
|
66
|
+
get_global_expert_location_metadata,
|
67
|
+
set_global_expert_location_metadata,
|
68
|
+
)
|
69
|
+
from sglang.srt.managers.schedule_batch import (
|
70
|
+
GLOBAL_SERVER_ARGS_KEYS,
|
71
|
+
global_server_args_dict,
|
72
|
+
)
|
56
73
|
from sglang.srt.mem_cache.memory_pool import (
|
57
74
|
DoubleSparseTokenToKVPool,
|
58
75
|
MHATokenToKVPool,
|
@@ -62,13 +79,10 @@ from sglang.srt.mem_cache.memory_pool import (
|
|
62
79
|
)
|
63
80
|
from sglang.srt.mem_cache.paged_allocator import PagedTokenToKVPoolAllocator
|
64
81
|
from sglang.srt.model_executor.cuda_graph_runner import CudaGraphRunner
|
82
|
+
from sglang.srt.model_executor.expert_location_updater import ExpertLocationUpdater
|
65
83
|
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
|
66
84
|
from sglang.srt.model_loader import get_model
|
67
|
-
from sglang.srt.model_loader.loader import
|
68
|
-
DefaultModelLoader,
|
69
|
-
device_loading_context,
|
70
|
-
get_model_loader,
|
71
|
-
)
|
85
|
+
from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
|
72
86
|
from sglang.srt.model_loader.utils import set_default_torch_dtype
|
73
87
|
from sglang.srt.model_loader.weight_utils import default_weight_loader
|
74
88
|
from sglang.srt.patch_torch import monkey_patch_torch_reductions
|
@@ -78,6 +92,7 @@ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
|
|
78
92
|
from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
|
79
93
|
from sglang.srt.utils import (
|
80
94
|
MultiprocessingSerializer,
|
95
|
+
cpu_has_amx_support,
|
81
96
|
enable_show_time_cost,
|
82
97
|
get_available_gpu_memory,
|
83
98
|
get_bool_env_var,
|
@@ -94,6 +109,8 @@ from sglang.srt.utils import (
|
|
94
109
|
set_cuda_arch,
|
95
110
|
)
|
96
111
|
|
112
|
+
_is_hip = is_hip()
|
113
|
+
|
97
114
|
# Use a small KV cache pool size for tests in CI
|
98
115
|
SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
|
99
116
|
|
@@ -103,6 +120,19 @@ UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
|
|
103
120
|
logger = logging.getLogger(__name__)
|
104
121
|
|
105
122
|
|
123
|
+
class RankZeroFilter(logging.Filter):
|
124
|
+
"""Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank."""
|
125
|
+
|
126
|
+
def __init__(self, is_rank_zero):
|
127
|
+
super().__init__()
|
128
|
+
self.is_rank_zero = is_rank_zero
|
129
|
+
|
130
|
+
def filter(self, record):
|
131
|
+
if record.levelno == logging.INFO:
|
132
|
+
return self.is_rank_zero
|
133
|
+
return True
|
134
|
+
|
135
|
+
|
106
136
|
class ModelRunner:
|
107
137
|
"""ModelRunner runs the forward passes of the models."""
|
108
138
|
|
@@ -126,6 +156,10 @@ class ModelRunner:
|
|
126
156
|
self.mem_fraction_static = mem_fraction_static
|
127
157
|
self.device = server_args.device
|
128
158
|
self.gpu_id = gpu_id
|
159
|
+
|
160
|
+
# Apply the rank zero filter to logger
|
161
|
+
if not any(isinstance(f, RankZeroFilter) for f in logger.filters):
|
162
|
+
logger.addFilter(RankZeroFilter(tp_rank == 0))
|
129
163
|
self.tp_rank = tp_rank
|
130
164
|
self.tp_size = tp_size
|
131
165
|
self.pp_rank = pp_rank
|
@@ -135,7 +169,9 @@ class ModelRunner:
|
|
135
169
|
self.is_draft_worker = is_draft_worker
|
136
170
|
self.is_generation = model_config.is_generation
|
137
171
|
self.is_multimodal = model_config.is_multimodal
|
138
|
-
self.
|
172
|
+
self.is_multimodal_chunked_prefill_supported = (
|
173
|
+
model_config.is_multimodal_chunked_prefill_supported
|
174
|
+
)
|
139
175
|
self.spec_algorithm = SpeculativeAlgorithm.from_string(
|
140
176
|
server_args.speculative_algorithm
|
141
177
|
)
|
@@ -145,6 +181,8 @@ class ModelRunner:
|
|
145
181
|
self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
|
146
182
|
self.attention_chunk_size = model_config.attention_chunk_size
|
147
183
|
|
184
|
+
self.forward_pass_id = 0
|
185
|
+
|
148
186
|
# Model-specific adjustment
|
149
187
|
self.model_specific_adjustment()
|
150
188
|
|
@@ -153,28 +191,10 @@ class ModelRunner:
|
|
153
191
|
|
154
192
|
# Global vars
|
155
193
|
global_server_args_dict.update(
|
156
|
-
{
|
157
|
-
|
158
|
-
"
|
159
|
-
"debug_tensor_dump_output_folder": server_args.debug_tensor_dump_output_folder,
|
160
|
-
"deepep_mode": server_args.deepep_mode,
|
161
|
-
"device": server_args.device,
|
162
|
-
"disable_chunked_prefix_cache": server_args.disable_chunked_prefix_cache,
|
163
|
-
"disable_radix_cache": server_args.disable_radix_cache,
|
164
|
-
"enable_nan_detection": server_args.enable_nan_detection,
|
165
|
-
"enable_dp_attention": server_args.enable_dp_attention,
|
166
|
-
"enable_ep_moe": server_args.enable_ep_moe,
|
167
|
-
"enable_deepep_moe": server_args.enable_deepep_moe,
|
168
|
-
"flashinfer_mla_disable_ragged": server_args.flashinfer_mla_disable_ragged,
|
169
|
-
"moe_dense_tp_size": server_args.moe_dense_tp_size,
|
170
|
-
"n_share_experts_fusion": server_args.n_share_experts_fusion,
|
171
|
-
"triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
|
172
|
-
"torchao_config": server_args.torchao_config,
|
173
|
-
"sampling_backend": server_args.sampling_backend,
|
174
|
-
"speculative_accept_threshold_single": server_args.speculative_accept_threshold_single,
|
175
|
-
"speculative_accept_threshold_acc": server_args.speculative_accept_threshold_acc,
|
194
|
+
{k: getattr(server_args, k) for k in GLOBAL_SERVER_ARGS_KEYS}
|
195
|
+
| {
|
196
|
+
# TODO it is indeed not a "server args"
|
176
197
|
"use_mla_backend": self.use_mla_backend,
|
177
|
-
"mm_attention_backend": server_args.mm_attention_backend,
|
178
198
|
}
|
179
199
|
)
|
180
200
|
|
@@ -202,6 +222,32 @@ class ModelRunner:
|
|
202
222
|
enable=self.server_args.enable_memory_saver
|
203
223
|
)
|
204
224
|
|
225
|
+
if not self.is_draft_worker:
|
226
|
+
set_global_expert_location_metadata(
|
227
|
+
compute_initial_expert_location_metadata(server_args, self.model_config)
|
228
|
+
)
|
229
|
+
if self.tp_rank == 0 and get_bool_env_var(
|
230
|
+
"SGLANG_LOG_EXPERT_LOCATION_METADATA"
|
231
|
+
):
|
232
|
+
logger.info(
|
233
|
+
f"Initial expert_location_metadata: {get_global_expert_location_metadata().debug_str()}"
|
234
|
+
)
|
235
|
+
|
236
|
+
set_global_expert_distribution_recorder(
|
237
|
+
ExpertDistributionRecorder.init_new(
|
238
|
+
server_args,
|
239
|
+
get_global_expert_location_metadata(),
|
240
|
+
rank=self.tp_rank,
|
241
|
+
)
|
242
|
+
)
|
243
|
+
|
244
|
+
self.eplb_manager = (
|
245
|
+
EPLBManager(self)
|
246
|
+
if self.server_args.enable_eplb and (not self.is_draft_worker)
|
247
|
+
else None
|
248
|
+
)
|
249
|
+
self.expert_location_updater = ExpertLocationUpdater()
|
250
|
+
|
205
251
|
# Load the model
|
206
252
|
self.sampler = Sampler()
|
207
253
|
self.load_model()
|
@@ -250,6 +296,16 @@ class ModelRunner:
|
|
250
296
|
def model_specific_adjustment(self):
|
251
297
|
server_args = self.server_args
|
252
298
|
|
299
|
+
if (
|
300
|
+
server_args.attention_backend == "intel_amx"
|
301
|
+
and server_args.device == "cpu"
|
302
|
+
and not cpu_has_amx_support()
|
303
|
+
):
|
304
|
+
logger.info(
|
305
|
+
"The current platform does not support Intel AMX, will fallback to torch_native backend."
|
306
|
+
)
|
307
|
+
server_args.attention_backend = "torch_native"
|
308
|
+
|
253
309
|
if server_args.attention_backend is None:
|
254
310
|
"""
|
255
311
|
Auto select the fastest attention backend.
|
@@ -259,7 +315,8 @@ class ModelRunner:
|
|
259
315
|
1.2 In other cases, we will use flashinfer if available, otherwise use triton.
|
260
316
|
2. Models with MLA Architecture and using FA3
|
261
317
|
2.1 We will use FA3 backend on hopper.
|
262
|
-
2.2
|
318
|
+
2.2 We will use Flashinfer backend on blackwell.
|
319
|
+
2.3 Otherwise, we will use triton backend.
|
263
320
|
"""
|
264
321
|
|
265
322
|
if not self.use_mla_backend:
|
@@ -270,6 +327,8 @@ class ModelRunner:
|
|
270
327
|
and is_fa3_default_architecture(self.model_config.hf_config)
|
271
328
|
):
|
272
329
|
server_args.attention_backend = "fa3"
|
330
|
+
elif _is_hip:
|
331
|
+
server_args.attention_backend = "aiter"
|
273
332
|
else:
|
274
333
|
server_args.attention_backend = (
|
275
334
|
"flashinfer" if is_flashinfer_available() else "triton"
|
@@ -278,31 +337,44 @@ class ModelRunner:
|
|
278
337
|
# MLA architecture
|
279
338
|
if is_hopper_with_cuda_12_3():
|
280
339
|
server_args.attention_backend = "fa3"
|
340
|
+
elif is_sm100_supported():
|
341
|
+
server_args.attention_backend = "flashinfer"
|
342
|
+
elif _is_hip:
|
343
|
+
head_num = self.model_config.get_num_kv_heads(self.tp_size)
|
344
|
+
# TODO current aiter only support head number 16 or 128 head number
|
345
|
+
if (
|
346
|
+
head_num == 128 or head_num == 16
|
347
|
+
) and self.spec_algorithm.is_none():
|
348
|
+
server_args.attention_backend = "aiter"
|
349
|
+
else:
|
350
|
+
server_args.attention_backend = "triton"
|
281
351
|
else:
|
282
352
|
server_args.attention_backend = "triton"
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
)
|
353
|
+
logger.info(
|
354
|
+
f"Attention backend not set. Use {server_args.attention_backend} backend by default."
|
355
|
+
)
|
287
356
|
elif self.use_mla_backend:
|
288
357
|
if server_args.device != "cpu":
|
289
358
|
if server_args.attention_backend in [
|
359
|
+
"aiter",
|
290
360
|
"flashinfer",
|
291
361
|
"fa3",
|
292
362
|
"triton",
|
293
363
|
"flashmla",
|
294
364
|
"cutlass_mla",
|
295
365
|
]:
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
)
|
366
|
+
logger.info(
|
367
|
+
f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
|
368
|
+
)
|
300
369
|
else:
|
301
370
|
raise ValueError(
|
302
371
|
f"Invalid attention backend for MLA: {server_args.attention_backend}"
|
303
372
|
)
|
304
373
|
else:
|
305
|
-
|
374
|
+
if server_args.attention_backend != "intel_amx":
|
375
|
+
raise ValueError(
|
376
|
+
"MLA optimization not supported on CPU except for intel_amx backend."
|
377
|
+
)
|
306
378
|
|
307
379
|
if (
|
308
380
|
server_args.attention_backend == "fa3"
|
@@ -315,10 +387,9 @@ class ModelRunner:
|
|
315
387
|
server_args.attention_backend = "triton"
|
316
388
|
|
317
389
|
if server_args.enable_double_sparsity:
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
)
|
390
|
+
logger.info(
|
391
|
+
"Double sparsity optimization is turned on. Use triton backend without CUDA graph."
|
392
|
+
)
|
322
393
|
server_args.attention_backend = "triton"
|
323
394
|
server_args.disable_cuda_graph = True
|
324
395
|
if server_args.ds_heavy_channel_type is None:
|
@@ -329,26 +400,29 @@ class ModelRunner:
|
|
329
400
|
|
330
401
|
if self.is_multimodal:
|
331
402
|
self.mem_fraction_static *= 0.90
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
403
|
+
logger.info(
|
404
|
+
f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
|
405
|
+
f"because this is a multimodal model."
|
406
|
+
)
|
407
|
+
if not self.is_multimodal_chunked_prefill_supported:
|
408
|
+
server_args.chunked_prefill_size = -1
|
337
409
|
logger.info(
|
338
|
-
"Automatically turn
|
410
|
+
f"Automatically turn of --chunked-prefill-size as it is not supported for "
|
411
|
+
f"{self.model_config.hf_config.model_type}"
|
339
412
|
)
|
340
|
-
server_args.chunked_prefill_size = -1
|
341
413
|
|
342
414
|
if not self.use_mla_backend:
|
343
415
|
server_args.disable_chunked_prefix_cache = True
|
344
416
|
elif self.page_size > 1:
|
345
|
-
|
346
|
-
logger.info("Disable chunked prefix cache when page size > 1.")
|
417
|
+
logger.info("Disable chunked prefix cache when page size > 1.")
|
347
418
|
server_args.disable_chunked_prefix_cache = True
|
348
419
|
|
349
420
|
if not server_args.disable_chunked_prefix_cache:
|
350
|
-
|
351
|
-
|
421
|
+
logger.info("Chunked prefix cache is turned on.")
|
422
|
+
|
423
|
+
if server_args.attention_backend == "aiter":
|
424
|
+
if self.model_config.context_len > 8192:
|
425
|
+
self.mem_fraction_static *= 0.85
|
352
426
|
|
353
427
|
def init_torch_distributed(self):
|
354
428
|
logger.info("Init torch distributed begin.")
|
@@ -381,6 +455,7 @@ class ModelRunner:
|
|
381
455
|
else:
|
382
456
|
dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
|
383
457
|
set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
|
458
|
+
set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
|
384
459
|
|
385
460
|
if not self.is_draft_worker:
|
386
461
|
# Only initialize the distributed environment on the target model worker.
|
@@ -445,10 +520,9 @@ class ModelRunner:
|
|
445
520
|
torch.set_num_threads(1)
|
446
521
|
if self.device == "cuda":
|
447
522
|
if torch.cuda.get_device_capability()[0] < 8:
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
)
|
523
|
+
logger.info(
|
524
|
+
"Compute capability below sm80. Use float16 due to lack of bfloat16 support."
|
525
|
+
)
|
452
526
|
self.server_args.dtype = "float16"
|
453
527
|
self.model_config.dtype = torch.float16
|
454
528
|
if torch.cuda.get_device_capability()[1] < 5:
|
@@ -484,11 +558,10 @@ class ModelRunner:
|
|
484
558
|
self.model.load_kv_cache_scales(
|
485
559
|
self.server_args.quantization_param_path
|
486
560
|
)
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
)
|
561
|
+
logger.info(
|
562
|
+
"Loaded KV cache scaling factors from %s",
|
563
|
+
self.server_args.quantization_param_path,
|
564
|
+
)
|
492
565
|
else:
|
493
566
|
raise RuntimeError(
|
494
567
|
"Using FP8 KV cache and scaling factors provided but "
|
@@ -531,6 +604,19 @@ class ModelRunner:
|
|
531
604
|
f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
|
532
605
|
) from None
|
533
606
|
|
607
|
+
def update_expert_location(
|
608
|
+
self,
|
609
|
+
new_expert_location_metadata: ExpertLocationMetadata,
|
610
|
+
update_layer_ids: List[int],
|
611
|
+
):
|
612
|
+
self.expert_location_updater.update(
|
613
|
+
self.model.routed_experts_weights_of_layer,
|
614
|
+
new_expert_location_metadata,
|
615
|
+
update_layer_ids=update_layer_ids,
|
616
|
+
nnodes=self.server_args.nnodes,
|
617
|
+
rank=self.tp_rank,
|
618
|
+
)
|
619
|
+
|
534
620
|
def update_weights_from_disk(
|
535
621
|
self, model_path: str, load_format: str
|
536
622
|
) -> tuple[bool, str]:
|
@@ -552,13 +638,7 @@ class ModelRunner:
|
|
552
638
|
|
553
639
|
def get_weight_iter(config):
|
554
640
|
iter = loader._get_weights_iterator(
|
555
|
-
DefaultModelLoader.Source(
|
556
|
-
config.model_path,
|
557
|
-
revision=config.revision,
|
558
|
-
fall_back_to_pt=getattr(
|
559
|
-
self.model, "fall_back_to_pt_during_load", True
|
560
|
-
),
|
561
|
-
)
|
641
|
+
DefaultModelLoader.Source.init_new(config, self.model)
|
562
642
|
)
|
563
643
|
return iter
|
564
644
|
|
@@ -631,7 +711,6 @@ class ModelRunner:
|
|
631
711
|
rank=rank,
|
632
712
|
group_name=group_name,
|
633
713
|
)
|
634
|
-
dist.barrier(group=self._model_update_group, device_ids=[rank])
|
635
714
|
return True, "Succeeded to initialize custom process group."
|
636
715
|
except Exception as e:
|
637
716
|
message = f"Failed to initialize custom process group: {e}."
|
@@ -726,12 +805,15 @@ class ModelRunner:
             distributed=get_world_group().world_size > 1,
             cpu_group=get_world_group().cpu_group,
         )
-        if self.
-            num_layers = (
-                self.model_config.
-
-
+        if self.is_draft_worker:
+            num_layers = getattr(
+                self.model_config.hf_config,
+                "num_nextn_predict_layers",
+                self.num_effective_layers,
             )
+        else:
+            num_layers = self.num_effective_layers
+        if self.use_mla_backend:
             # FIXME: pipeline parallelism is not compatible with mla backend
             assert self.pp_size == 1
             cell_size = (
@@ -743,7 +825,7 @@ class ModelRunner:
             cell_size = (
                 self.model_config.get_num_kv_heads(get_attention_tp_size())
                 * self.model_config.head_dim
-                *
+                * num_layers
                 * 2
                 * torch._utils._element_size(self.kv_cache_dtype)
            )
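The `* num_layers` change makes the per-token KV cache size depend on the number of layers this worker actually hosts: a draft (speculative) worker counts only `num_nextn_predict_layers` when the config provides it, while the target worker counts its effective layers. A rough sketch of the non-MLA cell-size arithmetic, with illustrative numbers rather than values from any specific model:

```python
def kv_cache_bytes_per_token(num_kv_heads: int, head_dim: int, num_layers: int, dtype_bytes: int) -> int:
    # Mirrors the non-MLA cell_size expression in the diff:
    # heads * head_dim * layers * 2 (K and V) * element size.
    return num_kv_heads * head_dim * num_layers * 2 * dtype_bytes

# Illustrative numbers only (not tied to a particular checkpoint):
target = kv_cache_bytes_per_token(num_kv_heads=8, head_dim=128, num_layers=32, dtype_bytes=2)
draft = kv_cache_bytes_per_token(num_kv_heads=8, head_dim=128, num_layers=1, dtype_bytes=2)
print(target, draft)  # 131072 vs 4096 bytes per token
```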
@@ -762,7 +844,7 @@ class ModelRunner:
         if self.server_args.kv_cache_dtype == "auto":
             self.kv_cache_dtype = self.dtype
         elif self.server_args.kv_cache_dtype == "fp8_e5m2":
-            if
+            if _is_hip:  # Using natively supported format
                 self.kv_cache_dtype = torch.float8_e5m2fnuz
             else:
                 self.kv_cache_dtype = torch.float8_e5m2
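For `--kv-cache-dtype fp8_e5m2`, ROCm builds now pick `torch.float8_e5m2fnuz` (the FNUZ variant natively supported on AMD GPUs) while other builds keep `torch.float8_e5m2`. A minimal sketch of that selection, assuming a torch version that exposes both float8 dtypes; it covers only the two flag values shown in this hunk:

```python
import torch

def select_kv_cache_dtype(kv_cache_dtype_flag: str, model_dtype: torch.dtype, is_hip: bool) -> torch.dtype:
    # Sketch of the dtype resolution shown in the diff; not the full set of
    # kv-cache-dtype options sglang accepts.
    if kv_cache_dtype_flag == "auto":
        return model_dtype
    if kv_cache_dtype_flag == "fp8_e5m2":
        # ROCm natively supports the FNUZ float8 format.
        return torch.float8_e5m2fnuz if is_hip else torch.float8_e5m2
    raise ValueError(f"Unsupported kv cache dtype: {kv_cache_dtype_flag}")

print(select_kv_cache_dtype("fp8_e5m2", torch.bfloat16, is_hip=False))  # torch.float8_e5m2
```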
@@ -834,12 +916,26 @@ class ModelRunner:
         )
 
         if self.req_to_token_pool is None:
-            self.
-
-
-
-
-
+            if self.server_args.disaggregation_mode == "decode":
+                from sglang.srt.disaggregation.decode import DecodeReqToTokenPool
+
+                # subscribe memory for pre-allocated requests
+                # if max_num_reqs <= 32, we pre-allocate 2x requests
+                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+                self.req_to_token_pool = DecodeReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len + 4,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    pre_alloc_size=pre_alloc_size,
+                )
+            else:
+                self.req_to_token_pool = ReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len + 4,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                )
         else:
             # Draft worker shares req_to_token_pool with the target worker.
             assert self.is_draft_worker
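In disaggregated decode mode, the request-to-token pool reserves extra slots for requests that arrive pre-allocated from the prefill side, but only for small pools: when `max_num_reqs` is at most 32 the allocation is doubled, otherwise nothing extra is added. A one-line sketch of that rule:

```python
def decode_pre_alloc_size(max_num_reqs: int) -> int:
    # Small pools get a 2x cushion for pre-allocated (transferred-in) requests;
    # large pools rely on their normal capacity.
    return max_num_reqs * 2 if max_num_reqs <= 32 else 0

print(decode_pre_alloc_size(16))   # 32 extra slots
print(decode_pre_alloc_size(256))  # 0 extra slots
```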
@@ -924,6 +1020,13 @@ class ModelRunner:
 
     def init_attention_backend(self):
         """Init attention kernel backend."""
+        if self.server_args.enable_two_batch_overlap:
+            self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
+        else:
+            self.attn_backend = self._get_attention_backend()
+
+    # TODO unify with 6338
+    def _get_attention_backend(self):
         if self.server_args.attention_backend == "flashinfer":
             if not self.use_mla_backend:
                 from sglang.srt.layers.attention.flashinfer_backend import (
@@ -933,18 +1036,18 @@ class ModelRunner:
                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
                     self.plan_stream_for_flashinfer = torch.cuda.Stream()
-
+                return FlashInferAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.flashinfer_mla_backend import (
                     FlashInferMLAAttnBackend,
                 )
 
-
+                return FlashInferMLAAttnBackend(self)
+        elif self.server_args.attention_backend == "aiter":
+            from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+            return AiterAttnBackend(self)
         elif self.server_args.attention_backend == "triton":
-            assert self.sliding_window_size is None, (
-                "Window attention is not supported in the triton attention backend. "
-                "Please use `--attention-backend flashinfer`."
-            )
             assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
@@ -954,21 +1057,21 @@ class ModelRunner:
                     DoubleSparseAttnBackend,
                 )
 
-
+                return DoubleSparseAttnBackend(self)
             else:
                 from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
 
-
+                return TritonAttnBackend(self)
         elif self.server_args.attention_backend == "torch_native":
             from sglang.srt.layers.attention.torch_native_backend import (
                 TorchNativeAttnBackend,
             )
 
-
+            return TorchNativeAttnBackend(self)
         elif self.server_args.attention_backend == "flashmla":
             from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
 
-
+            return FlashMLABackend(self)
         elif self.server_args.attention_backend == "fa3":
             assert (
                 torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
@@ -980,13 +1083,20 @@ class ModelRunner:
                 FlashAttentionBackend,
             )
 
-
+            return FlashAttentionBackend(self)
         elif self.server_args.attention_backend == "cutlass_mla":
             from sglang.srt.layers.attention.cutlass_mla_backend import (
                 CutlassMLABackend,
             )
 
-
+            return CutlassMLABackend(self)
+        elif self.server_args.attention_backend == "intel_amx":
+            from sglang.srt.layers.attention.intel_amx_backend import (
+                IntelAMXAttnBackend,
+            )
+
+            logger.info(f"Intel AMX attention backend is enabled.")
+            return IntelAMXAttnBackend(self)
         else:
             raise ValueError(
                 f"Invalid attention backend: {self.server_args.attention_backend}"
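Note that `TboAttnBackend.init_new` is handed the factory method itself (`self._get_attention_backend`), not an already-built backend, so the two-batch-overlap wrapper can construct however many underlying backends it needs. The sketch below shows that deferred-construction pattern with hypothetical stub classes; it is not the real `TboAttnBackend`, whose internals are not shown in this diff:

```python
from typing import Callable, List

class StubAttnBackend:
    """Stand-in for a concrete attention backend."""
    def __init__(self, name: str):
        self.name = name

class StubTboBackend:
    """Hypothetical two-batch-overlap wrapper: builds one child backend per micro-batch."""
    def __init__(self, children: List[StubAttnBackend]):
        self.children = children

    @classmethod
    def init_new(cls, creator: Callable[[], StubAttnBackend]) -> "StubTboBackend":
        # Calling the creator lazily lets the wrapper own several independent
        # backend instances (e.g., one per overlapped batch).
        return cls(children=[creator() for _ in range(2)])

backend = StubTboBackend.init_new(lambda: StubAttnBackend("triton"))
print([c.name for c in backend.children])  # ['triton', 'triton']
```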
@@ -1020,7 +1130,7 @@ class ModelRunner:
         if self.server_args.disable_cuda_graph:
             return
 
-        tic = time.
+        tic = time.perf_counter()
         before_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
             f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
@@ -1028,13 +1138,12 @@ class ModelRunner:
         self.cuda_graph_runner = CudaGraphRunner(self)
         after_mem = get_available_gpu_memory(self.device, self.gpu_id)
         logger.info(
-            f"Capture cuda graph end. Time elapsed: {time.
+            f"Capture cuda graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
             f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
         )
 
     def apply_torch_tp(self):
-
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
         from sglang.srt.model_parallel import tensor_parallel
 
         device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
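The CUDA-graph capture timing now uses `time.perf_counter()`, a monotonic high-resolution clock intended for measuring elapsed intervals; unlike wall-clock `time.time()`, it cannot jump if the system clock is adjusted. The usual pattern, with a trivial placeholder standing in for the measured work:

```python
import time

tic = time.perf_counter()
# ... the work being measured, e.g. capturing CUDA graphs ...
sum(range(1_000_000))
elapsed = time.perf_counter() - tic
print(f"Time elapsed: {elapsed:.2f} s")
```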
@@ -1093,6 +1202,27 @@ class ModelRunner:
         forward_batch: ForwardBatch,
         skip_attn_backend_init: bool = False,
         pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
+        self.forward_pass_id += 1
+
+        with get_global_expert_distribution_recorder().with_forward_pass(
+            self.forward_pass_id,
+            forward_batch,
+        ):
+            output = self._forward_raw(
+                forward_batch, skip_attn_backend_init, pp_proxy_tensors
+            )
+
+        if self.eplb_manager is not None:
+            self.eplb_manager.on_forward_pass_end()
+
+        return output
+
+    def _forward_raw(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool,
+        pp_proxy_tensors: Optional[PPProxyTensors],
     ) -> Tuple[Union[LogitsProcessorOutput, PPProxyTensors], bool]:
         can_run_cuda_graph = bool(
             forward_batch.forward_mode.is_cuda_graph()
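`forward` is now a thin wrapper: it bumps a per-runner `forward_pass_id`, runs the real work inside the expert-distribution recorder's `with_forward_pass` context, and notifies the EPLB manager (if any) when the pass ends. A minimal sketch of that wrap-the-raw-call structure, using hypothetical stand-ins for the recorder and manager rather than sglang's real classes:

```python
import contextlib

class StubRecorder:
    @contextlib.contextmanager
    def with_forward_pass(self, forward_pass_id, batch):
        print(f"recording forward pass {forward_pass_id}")
        yield

class StubEplbManager:
    def on_forward_pass_end(self):
        print("EPLB manager notified")

class TinyRunner:
    def __init__(self):
        self.forward_pass_id = 0
        self.recorder = StubRecorder()
        self.eplb_manager = StubEplbManager()

    def forward(self, batch):
        self.forward_pass_id += 1
        with self.recorder.with_forward_pass(self.forward_pass_id, batch):
            output = self._forward_raw(batch)
        if self.eplb_manager is not None:
            self.eplb_manager.on_forward_pass_end()
        return output

    def _forward_raw(self, batch):
        return f"logits for {batch}"

print(TinyRunner().forward("batch-0"))
```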
@@ -1171,7 +1301,7 @@ class ModelRunner:
     def model_is_mrope(self) -> bool:
         """Detect if the model has "mrope" rope_scaling type.
         mrope requires keep "rope_deltas" between prompt and decoding phases."""
-        rope_scaling = getattr(self.model_config.
+        rope_scaling = getattr(self.model_config.hf_text_config, "rope_scaling", {})
         if rope_scaling is None:
             return False
         is_mrope_enabled = "mrope_section" in rope_scaling
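The mrope check now reads `rope_scaling` from `hf_text_config` (so multimodal models whose rope settings live on the text sub-config are detected too) and falls back to an empty dict when the attribute is absent. A small sketch of the same detection logic over plain dicts, with made-up example configs:

```python
def model_is_mrope(rope_scaling) -> bool:
    # A missing rope_scaling (None) means no mrope; otherwise look for "mrope_section".
    if rope_scaling is None:
        return False
    return "mrope_section" in rope_scaling

print(model_is_mrope(None))                                              # False
print(model_is_mrope({"rope_type": "default"}))                          # False
print(model_is_mrope({"type": "mrope", "mrope_section": [16, 24, 24]}))  # True
```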
@@ -197,6 +197,15 @@ class DefaultModelLoader(BaseModelLoader):
         fall_back_to_pt: bool = True
         """Whether .pt weights can be used."""
 
+        @classmethod
+        def init_new(cls, model_config: ModelConfig, model):
+            return cls(
+                model_config.model_path,
+                model_config.revision,
+                prefix="",
+                fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
+            )
+
     def __init__(self, load_config: LoadConfig):
         super().__init__(load_config)
         if load_config.model_loader_extra_config:
@@ -341,12 +350,7 @@ class DefaultModelLoader(BaseModelLoader):
         model: nn.Module,
     ) -> Generator[Tuple[str, torch.Tensor], None, None]:
 
-        primary_weights = DefaultModelLoader.Source(
-            model_config.model_path,
-            model_config.revision,
-            prefix="",
-            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
-        )
+        primary_weights = DefaultModelLoader.Source.init_new(model_config, model)
         yield from self._get_weights_iterator(primary_weights)
 
         secondary_weights = cast(
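With `Source.init_new`, the two call sites that previously built a `DefaultModelLoader.Source` by hand (here and in `get_weight_iter` above) now share one factory, so the `fall_back_to_pt_during_load` lookup lives in a single place. A minimal sketch of the pattern with simplified stand-in types, not the real sglang classes:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeModelConfig:
    model_path: str
    revision: Optional[str] = None

@dataclass
class Source:
    """Simplified stand-in for DefaultModelLoader.Source."""
    model_path: str
    revision: Optional[str]
    prefix: str = ""
    fall_back_to_pt: bool = True

    @classmethod
    def init_new(cls, model_config: FakeModelConfig, model) -> "Source":
        # One place to read the model's fall_back_to_pt_during_load attribute.
        return cls(
            model_config.model_path,
            model_config.revision,
            prefix="",
            fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
        )

class FakeModel:
    fall_back_to_pt_during_load = False

print(Source.init_new(FakeModelConfig("my-org/my-model"), FakeModel()))
```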