sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -46,7 +47,6 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
-    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -59,7 +59,9 @@ class ServerArgs:
     chat_template: Optional[str] = None
     completion_template: Optional[str] = None
     is_embedding: bool = False
+    enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
+    impl: str = "auto"

     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -97,8 +99,13 @@ class ServerArgs:
     log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
+    bucket_time_to_first_token: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
+    bucket_inter_token_latency: Optional[List[float]] = None
+    collect_tokens_histogram: bool = False
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
+    kv_events_config: Optional[str] = None

     # API related
     api_key: Optional[str] = None
@@ -120,6 +127,7 @@ class ServerArgs:

     # Model override args in JSON
     json_model_override_args: str = "{}"
+    preferred_sampling_params: Optional[str] = None

     # LoRA
     lora_paths: Optional[List[str]] = None
@@ -154,16 +162,31 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
+    enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-
+    enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
+    enable_two_batch_overlap: bool = False
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
@@ -186,7 +209,7 @@ class ServerArgs:
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
-
+    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     mm_attention_backend: Optional[str] = None
@@ -229,7 +252,7 @@ class ServerArgs:
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             parallel_size = self.tp_size * self.pp_size
-            if gpu_mem <= 81920:
+            if gpu_mem is not None and gpu_mem <= 81920:
                 if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
                 elif parallel_size >= 8:
@@ -242,17 +265,28 @@ class ServerArgs:
                     self.mem_fraction_static = 0.88
                 else:
                     self.mem_fraction_static = 0.88
-            if gpu_mem >
+            if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
+                self.mem_fraction_static = 0.79
+            elif gpu_mem is not None and gpu_mem > 96 * 1024:
                 mem_fraction = self.mem_fraction_static
+                # 15 GB + additional 3GB for cuda graph
+                reserve_mem = 1024 * 18
+                # need reserve more memory for spec cuda graph
+                if self.speculative_algorithm is not None:
+                    reserve_mem = 1024 * 20
                 self.mem_fraction_static = min(
                     mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
-                    (gpu_mem -
-                    / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                    (gpu_mem - reserve_mem) / gpu_mem,
                 )
+            else:
+                if self.speculative_algorithm is not None:
+                    self.mem_fraction_static *= 0.95

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
-            if gpu_mem is not None and gpu_mem
+            if gpu_mem is not None and gpu_mem > 180_000:
+                self.chunked_prefill_size = 16384
+            elif gpu_mem is not None and gpu_mem < 25_000:
                 self.chunked_prefill_size = 2048
             elif self.disaggregation_mode != "null":
                 self.chunked_prefill_size = 16384
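The two hunks above change how `mem_fraction_static` is auto-derived from the detected GPU memory (`gpu_mem`, which the surrounding comparisons against `81920` and `96 * 1024` suggest is measured in MiB). Devices reporting more than 180 * 1000 MiB on CUDA are now pinned to 0.79, and the chunked prefill size also scales up to 16384 on such devices. A minimal standalone sketch of the new large-GPU branch; the names `derive_mem_fraction` and `base` are illustrative and not part of the diff:

```python
def derive_mem_fraction(gpu_mem_mib: float, base: float, speculative: bool) -> float:
    """Sketch of the new >96 GiB branch: keep the old growth formula, but cap it so
    that 18 GiB ("15 GB + additional 3GB for cuda graph" per the in-diff comment),
    or 20 GiB when speculative decoding is enabled, stays reserved."""
    reserve_mib = 1024 * (20 if speculative else 18)
    return min(
        base + 48 * 1024 * (1 - base) / gpu_mem_mib,
        (gpu_mem_mib - reserve_mib) / gpu_mem_mib,
    )

# Example: a hypothetical 141 GiB device (144384 MiB) with base 0.88 and no
# speculative decoding: min(0.88 + 49152 * 0.12 / 144384, 125952 / 144384) ≈ 0.872,
# so the reservation cap is the binding term.
```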
@@ -292,6 +326,11 @@ class ServerArgs:
             self.sampling_backend = "pytorch"

         # Set kernel backends
+        if self.device == "cpu":
+            if self.attention_backend is None:
+                self.attention_backend = "intel_amx"
+            self.sampling_backend = "pytorch"
+
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -307,12 +346,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-        if self.pp_size > 1:
-            self.disable_overlap_schedule = True
-            logger.warning(
-                "Overlap scheduler is disabled because of using pipeline parallelism."
-            )
-
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -354,6 +387,31 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.info(
+                f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+            logger.info(
+                f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+            )
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
+        if self.expert_distribution_recorder_buffer_size is None:
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
+            elif self.expert_distribution_recorder_mode is not None:
+                self.expert_distribution_recorder_buffer_size = 1000
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
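The `__post_init__` hunk above wires the new EPLB options together: enabling EPLB (or supplying an initial expert location) pulls in the expert-distribution recorder and a static dispatch algorithm, and the recorder's buffer size follows the rebalance interval. A condensed sketch of that defaulting chain, written as a hypothetical free function rather than the real dataclass method (the expert-distribution-metrics flag triggers the same "stat" recorder default and is omitted here):

```python
def resolve_eplb_defaults(
    enable_eplb: bool,
    init_expert_location: str | None,
    recorder_mode: str | None,
    dispatch_algorithm: str | None,
    buffer_size: int | None,
    rebalance_num_iterations: int | None = 1000,
):
    # EPLB needs expert statistics, so the recorder defaults to "stat".
    if enable_eplb and recorder_mode is None:
        recorder_mode = "stat"
    # A known expert layout implies static dispatch unless the user chose otherwise.
    if (enable_eplb or init_expert_location is not None) and dispatch_algorithm is None:
        dispatch_algorithm = "static"
    # The recorder buffer tracks the rebalance interval, falling back to 1000.
    if buffer_size is None:
        if rebalance_num_iterations is not None:
            buffer_size = rebalance_num_iterations
        elif recorder_mode is not None:
            buffer_size = 1000
    return recorder_mode, dispatch_algorithm, buffer_size

# resolve_eplb_defaults(True, "trivial", None, None, None) -> ("stat", "static", 1000)
```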
@@ -367,6 +425,12 @@ class ServerArgs:
                     "Overlap scheduler is disabled because of using "
                     "eagle speculative decoding."
                 )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )

             model_arch = get_model_arch(self)

@@ -389,7 +453,7 @@ class ServerArgs:
                     self.speculative_num_steps,
                     self.speculative_eagle_topk,
                     self.speculative_num_draft_tokens,
-                ) = auto_choose_speculative_params(
+                ) = auto_choose_speculative_params(self)

             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
@@ -474,11 +538,6 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
-        parser.add_argument(
-            "--enable-tokenizer-batch-encode",
-            action="store_true",
-            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
-        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -556,6 +615,7 @@ class ServerArgs:
                 "w8a8_int8",
                 "w8a8_fp8",
                 "moe_wna16",
+                "qoq",
             ],
             help="The quantization method.",
         )
@@ -603,6 +663,12 @@ class ServerArgs:
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
         parser.add_argument(
             "--revision",
             type=str,
@@ -669,6 +735,18 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Other runtime options
         parser.add_argument(
@@ -780,6 +858,39 @@ class ServerArgs:
             action="store_true",
             help="Enable log prometheus metrics.",
         )
+        parser.add_argument(
+            "--bucket-time-to-first-token",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_time_to_first_token,
+            help="The buckets of time to first token, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-inter-token-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_inter_token_latency,
+            help="The buckets of inter-token latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-e2e-request-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_e2e_request_latency,
+            help="The buckets of end-to-end request latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--collect-tokens-histogram",
+            action="store_true",
+            default=ServerArgs.collect_tokens_histogram,
+            help="Collect prompt/generation tokens histogram.",
+        )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
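The new metrics flags above map one-to-one onto the dataclass fields added near the top of this diff. A hedged illustration of setting them programmatically: it assumes `ServerArgs` can be constructed directly with `model_path` as the only required argument (not shown in this hunk), and the model name and bucket boundaries are placeholder values, not recommendations from the diff:

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    enable_metrics=True,
    # Histogram bucket boundaries; the diff does not prescribe defaults or units.
    bucket_time_to_first_token=[0.1, 0.25, 0.5, 1.0, 2.0, 5.0],
    bucket_inter_token_latency=[0.01, 0.02, 0.05, 0.1, 0.2],
    bucket_e2e_request_latency=[1.0, 2.0, 5.0, 10.0, 30.0],
    collect_tokens_histogram=True,
)
```

The equivalent CLI flags accept the same values as space-separated floats via `nargs="+"`, e.g. `--bucket-time-to-first-token 0.1 0.25 0.5 1.0 2.0 5.0`.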
@@ -868,6 +979,11 @@ class ServerArgs:
|
|
868
979
|
help="A dictionary in JSON string format used to override default model configurations.",
|
869
980
|
default=ServerArgs.json_model_override_args,
|
870
981
|
)
|
982
|
+
parser.add_argument(
|
983
|
+
"--preferred-sampling-params",
|
984
|
+
type=str,
|
985
|
+
help="json-formatted sampling settings that will be returned in /get_model_info",
|
986
|
+
)
|
871
987
|
|
872
988
|
# LoRA
|
873
989
|
parser.add_argument(
|
@@ -896,12 +1012,14 @@ class ServerArgs:
|
|
896
1012
|
"--attention-backend",
|
897
1013
|
type=str,
|
898
1014
|
choices=[
|
899
|
-
"
|
900
|
-
"
|
901
|
-
"torch_native",
|
1015
|
+
"aiter",
|
1016
|
+
"cutlass_mla",
|
902
1017
|
"fa3",
|
1018
|
+
"flashinfer",
|
903
1019
|
"flashmla",
|
904
|
-
"
|
1020
|
+
"intel_amx",
|
1021
|
+
"torch_native",
|
1022
|
+
"triton",
|
905
1023
|
],
|
906
1024
|
default=ServerArgs.attention_backend,
|
907
1025
|
help="Choose the kernels for attention layers.",
|
@@ -1043,6 +1161,11 @@ class ServerArgs:
|
|
1043
1161
|
action="store_true",
|
1044
1162
|
help="Enable NCCL NVLS for prefill heavy requests when available.",
|
1045
1163
|
)
|
1164
|
+
parser.add_argument(
|
1165
|
+
"--enable-tokenizer-batch-encode",
|
1166
|
+
action="store_true",
|
1167
|
+
help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
|
1168
|
+
)
|
1046
1169
|
parser.add_argument(
|
1047
1170
|
"--disable-outlines-disk-cache",
|
1048
1171
|
action="store_true",
|
@@ -1054,10 +1177,9 @@ class ServerArgs:
|
|
1054
1177
|
help="Disable the custom all-reduce kernel and fall back to NCCL.",
|
1055
1178
|
)
|
1056
1179
|
parser.add_argument(
|
1057
|
-
"--enable-
|
1058
|
-
default=ServerArgs.enable_multimodal,
|
1180
|
+
"--enable-mscclpp",
|
1059
1181
|
action="store_true",
|
1060
|
-
help="Enable
|
1182
|
+
help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
|
1061
1183
|
)
|
1062
1184
|
parser.add_argument(
|
1063
1185
|
"--disable-overlap-schedule",
|
@@ -1072,7 +1194,7 @@ class ServerArgs:
|
|
1072
1194
|
parser.add_argument(
|
1073
1195
|
"--enable-dp-attention",
|
1074
1196
|
action="store_true",
|
1075
|
-
help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently
|
1197
|
+
help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.",
|
1076
1198
|
)
|
1077
1199
|
parser.add_argument(
|
1078
1200
|
"--enable-dp-lm-head",
|
@@ -1084,6 +1206,11 @@ class ServerArgs:
|
|
1084
1206
|
action="store_true",
|
1085
1207
|
help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
|
1086
1208
|
)
|
1209
|
+
parser.add_argument(
|
1210
|
+
"--enable-two-batch-overlap",
|
1211
|
+
action="store_true",
|
1212
|
+
help="Enabling two micro batches to overlap.",
|
1213
|
+
)
|
1087
1214
|
parser.add_argument(
|
1088
1215
|
"--enable-torch-compile",
|
1089
1216
|
action="store_true",
|
@@ -1212,13 +1339,74 @@ class ServerArgs:
             default="auto",
             help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
         )
-
         parser.add_argument(
-            "--
+            "--ep-num-redundant-experts",
             type=int,
-            default=
-            help="
-
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
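Several of these flags concern placing redundant copies of experts across expert-parallel ranks (`--ep-num-redundant-experts`, `--ep-dispatch-algorithm`). The sketch below shows one simple round-robin placement policy just to make the idea concrete; it is illustrative only and not the dispatch algorithm sglang ships:

```python
from typing import Dict, List

def place_redundant_experts(
    num_experts: int, num_ranks: int, num_redundant: int
) -> Dict[int, List[int]]:
    """Assign each logical expert to its home rank, then spread
    `num_redundant` extra replicas round-robin over the other ranks.
    Illustrative policy only."""
    placement: Dict[int, List[int]] = {}
    for expert_id in range(num_experts):
        home = expert_id % num_ranks
        replicas = [home]
        for r in range(1, num_redundant + 1):
            replicas.append((home + r) % num_ranks)
        placement[expert_id] = replicas
    return placement

# 8 experts, 4 EP ranks, 1 redundant copy per expert
print(place_redundant_experts(8, 4, 1))
```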
@@ -1326,8 +1514,6 @@ class ServerArgs:
 
         # FIXME pp constraints
         if self.pp_size > 1:
-            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
-            self.disable_overlap_schedule = True
             assert (
                 self.disable_overlap_schedule
                 and self.speculative_algorithm is None
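With the automatic `disable_overlap_schedule = True` removed, pipeline parallelism now relies on the assertion that follows: the user must pass `--disable-overlap-schedule` (and avoid speculative decoding) explicitly. A compact restatement of that check as a standalone, hypothetical helper (the message wording is mine, not sglang's):

```python
def check_pp_constraints(
    pp_size: int, disable_overlap_schedule: bool, speculative_algorithm
) -> None:
    # After this change the constraint is asserted instead of being
    # silently fixed up for the user.
    if pp_size > 1:
        assert disable_overlap_schedule and speculative_algorithm is None, (
            "Pipeline parallelism requires --disable-overlap-schedule "
            "and is incompatible with speculative decoding."
        )

check_pp_constraints(pp_size=2, disable_overlap_schedule=True, speculative_algorithm=None)   # ok
# check_pp_constraints(pp_size=2, disable_overlap_schedule=False, speculative_algorithm=None)  # AssertionError
```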
@@ -1341,7 +1527,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1475,18 +1661,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]
 
 
-def auto_choose_speculative_params(
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else:
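The refactored `auto_choose_speculative_params` now loads the HF config itself and dispatches on the architecture name, returning a 3-tuple of speculative-decoding defaults. A self-contained sketch of just the dispatch step, using the tuples visible in the diff (the fallback value here is an assumption; the real function handles the unknown-architecture case itself):

```python
from typing import Tuple

def choose_speculative_defaults(arch: str) -> Tuple[int, int, int]:
    """Architecture-based speculative-decoding defaults, mirroring the dispatch
    in the diff; tune with scripts/playground/bench_speculative.py."""
    if arch in ("LlamaForCausalLM",):
        return (5, 4, 8)
    if arch in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"):
        return (3, 1, 4)
    if arch in ("Grok1ForCausalLM", "Grok1VForCausalLM"):
        return (5, 4, 8)
    # Conservative placeholder for architectures not listed above.
    return (5, 4, 8)

print(choose_speculative_defaults("DeepseekV3ForCausalLM"))  # (3, 1, 4)
```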
@@ -4,7 +4,7 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, rank0_print
 
 if is_cuda() or is_hip():
     from sgl_kernel import (
@@ -344,13 +344,13 @@ def test_build_tree_kernel_efficient():
         num_verify_tokens=num_draft_token,
     )
 
-
-    #
-
-
-
-
-
+    rank0_print("=========== build tree kernel efficient ==========")
+    # rank0_print(f"{tree_mask=}", flush=True)
+    rank0_print(f"{position=}", flush=True)
+    rank0_print(f"{retrive_index=}", flush=True)
+    rank0_print(f"{retrive_next_token=}", flush=True)
+    rank0_print(f"{retrive_next_sibling=}", flush=True)
+    rank0_print(f"{draft_tokens=}", flush=True)
     assert position.tolist() == [5, 6, 6, 7, 7, 8, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14]
     assert retrive_index.tolist() == [
         [0, 1, 2, 3, 4, 5, 6, 7],
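The test now routes its debug output through `rank0_print` (imported from `sglang.srt.utils`) so multi-GPU runs don't print the same tensors once per rank. A minimal sketch of what such a helper typically looks like, assuming `torch.distributed`; this is not the actual sglang implementation:

```python
import torch.distributed as dist

def rank0_print(*args, **kwargs) -> None:
    """Print only on rank 0 (or when torch.distributed is not initialized).
    Minimal sketch; the real helper lives in sglang.srt.utils."""
    if not dist.is_available() or not dist.is_initialized() or dist.get_rank() == 0:
        print(*args, **kwargs)

rank0_print("=========== build tree kernel efficient ==========")
```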
@@ -6,9 +6,11 @@ from typing import TYPE_CHECKING, Callable
 import torch
 
 from sglang.srt.model_executor.cuda_graph_runner import (
+    CUDA_GRAPH_CAPTURE_FAILED_MSG,
     CudaGraphRunner,
     get_batch_sizes_to_capture,
     get_global_graph_memory_pool,
+    model_capture_mode,
     set_global_graph_memory_pool,
     set_torch_compile_config,
 )
@@ -73,22 +75,17 @@ class EAGLEDraftCudaGraphRunner:
         self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
         self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
         self.hidden_states = torch.zeros(
-            (self.
+            (self.max_num_token, self.model_runner.model_config.hidden_size),
             dtype=self.model_runner.dtype,
         )
 
         # Capture
         try:
-
+            with model_capture_mode():
+                self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture
-                "Possible solutions:\n"
-                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
-                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
-                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
             )
 
     def can_run(self, forward_batch: ForwardBatch):
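Two things change here: the hidden-state buffer is sized by `max_num_token` rather than a batch-size-derived shape, and capture now runs inside `model_capture_mode()` while the long troubleshooting text moves into the shared `CUDA_GRAPH_CAPTURE_FAILED_MSG` constant. A stripped-down sketch of that error-handling pattern (the constant text and the context-manager body below are placeholders, not the real ones):

```python
from contextlib import contextmanager

# Placeholder for the shared troubleshooting text the real constant carries
# (lower --mem-fraction-static, lower --cuda-graph-max-bs, disable torch compile
# or CUDA graph, per the removed inline message).
CUDA_GRAPH_CAPTURE_FAILED_MSG = (
    "Possible solutions: lower --mem-fraction-static, lower --cuda-graph-max-bs, "
    "or disable torch compile / CUDA graph."
)

@contextmanager
def model_capture_mode():
    # The real context manager toggles model-side flags for capture;
    # this placeholder only marks the capture region.
    yield

def capture_with_shared_message(capture_fn) -> None:
    try:
        with model_capture_mode():
            capture_fn()
    except RuntimeError as e:
        # Same structure as the diff: short dynamic part + shared static advice.
        raise Exception(f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}")

capture_with_shared_message(lambda: None)  # succeeds; a failing capture_fn re-raises with advice
```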
@@ -132,7 +129,7 @@ class EAGLEDraftCudaGraphRunner:
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,
-            seq_lens_sum=seq_lens.sum(),
+            seq_lens_sum=seq_lens.sum().item(),
             return_logprob=False,
             positions=positions,
             spec_algorithm=self.model_runner.spec_algorithm,
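The `seq_lens.sum()` → `seq_lens.sum().item()` change matters because `.sum()` returns a 0-dim tensor, while `.item()` extracts a plain Python int, which is what a scalar metadata field like `seq_lens_sum` is meant to hold. Quick illustration:

```python
import torch

seq_lens = torch.tensor([3, 5, 2])

as_tensor = seq_lens.sum()       # tensor(10), a 0-dim torch.Tensor
as_int = seq_lens.sum().item()   # 10, a plain Python int

print(type(as_tensor), as_tensor)  # <class 'torch.Tensor'> tensor(10)
print(type(as_int), as_int)        # <class 'int'> 10
```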
@@ -214,7 +211,7 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.positions = self.positions[:num_tokens]
 
         # Special handle for seq_len_cpu used when flashinfer mla is used
-        if
+        if forward_batch.seq_lens_cpu is not None and bs != raw_bs:
             self.seq_lens_cpu.fill_(1)
             self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
             forward_batch.seq_lens_cpu = self.seq_lens_cpu[:bs]
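The tightened condition only rewrites `seq_lens_cpu` when it exists and the batch is actually padded to a captured size (`bs != raw_bs`). The padding itself fills the buffer with 1s, copies the real lengths into the front, and hands back a slice of the captured size. A standalone sketch of that padding step (names are illustrative, not sglang's):

```python
import torch

def pad_seq_lens_cpu(real_seq_lens: torch.Tensor, captured_bs: int) -> torch.Tensor:
    """Pad per-request sequence lengths up to a captured CUDA-graph batch size.
    Padded slots get length 1, mirroring the fill_(1) in the diff."""
    raw_bs = real_seq_lens.shape[0]
    assert raw_bs <= captured_bs, "captured batch size must cover the real batch"
    buffer = torch.empty(captured_bs, dtype=real_seq_lens.dtype)
    buffer.fill_(1)
    buffer[:raw_bs].copy_(real_seq_lens)
    return buffer[:captured_bs]

print(pad_seq_lens_cpu(torch.tensor([7, 3, 12]), captured_bs=8))
# tensor([ 7,  3, 12,  1,  1,  1,  1,  1])
```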