sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
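Several of the new files and the reworked `sglang/srt/server_args.py` (detailed below) surface new launch flags such as `--tool-call-parser`, `--sleep-on-idle`, and `--impl` (the latter can fall back to the new Transformers-based model implementation in `sglang/srt/models/transformers.py`). As a rough illustration only — the `sglang.launch_server` entry point and the model path are assumptions here, and the flag semantics are defined by the diff below, not by this sketch — a 0.4.7.post1 launch command might be assembled like this:

```python
# Illustrative only: flag names are taken from the server_args.py diff below;
# the `sglang.launch_server` module entry point and the model path are assumptions.
import shlex

cmd = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "Qwen/Qwen2.5-7B-Instruct",  # hypothetical model
    "--tool-call-parser", "qwen25",  # new: qwen25 / mistral / llama3 / deepseekv3 / pythonic
    "--sleep-on-idle",               # new: reduce CPU usage when the server is idle
    "--impl", "auto",                # new: auto / sglang / transformers
]
print(shlex.join(cmd))
```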
sglang/srt/server_args.py
CHANGED
@@ -60,6 +60,7 @@ class ServerArgs:
|
|
60
60
|
is_embedding: bool = False
|
61
61
|
enable_multimodal: Optional[bool] = None
|
62
62
|
revision: Optional[str] = None
|
63
|
+
impl: str = "auto"
|
63
64
|
|
64
65
|
# Port for the HTTP server
|
65
66
|
host: str = "127.0.0.1"
|
@@ -89,6 +90,7 @@ class ServerArgs:
|
|
89
90
|
download_dir: Optional[str] = None
|
90
91
|
base_gpu_id: int = 0
|
91
92
|
gpu_id_step: int = 1
|
93
|
+
sleep_on_idle: bool = False
|
92
94
|
|
93
95
|
# Logging
|
94
96
|
log_level: str = "info"
|
@@ -110,14 +112,12 @@ class ServerArgs:
|
|
110
112
|
file_storage_path: str = "sglang_storage"
|
111
113
|
enable_cache_report: bool = False
|
112
114
|
reasoning_parser: Optional[str] = None
|
115
|
+
tool_call_parser: Optional[str] = None
|
113
116
|
|
114
117
|
# Data parallelism
|
115
118
|
dp_size: int = 1
|
116
119
|
load_balance_method: str = "round_robin"
|
117
120
|
|
118
|
-
# Expert parallelism
|
119
|
-
ep_size: int = 1
|
120
|
-
|
121
121
|
# Multi-node distributed serving
|
122
122
|
dist_init_addr: Optional[str] = None
|
123
123
|
nnodes: int = 1
|
@@ -136,6 +136,7 @@ class ServerArgs:
|
|
136
136
|
attention_backend: Optional[str] = None
|
137
137
|
sampling_backend: Optional[str] = None
|
138
138
|
grammar_backend: Optional[str] = None
|
139
|
+
mm_attention_backend: Optional[str] = None
|
139
140
|
|
140
141
|
# Speculative decoding
|
141
142
|
speculative_algorithm: Optional[str] = None
|
@@ -147,6 +148,26 @@ class ServerArgs:
|
|
147
148
|
speculative_accept_threshold_acc: float = 1.0
|
148
149
|
speculative_token_map: Optional[str] = None
|
149
150
|
|
151
|
+
# Expert parallelism
|
152
|
+
ep_size: int = 1
|
153
|
+
enable_ep_moe: bool = False
|
154
|
+
enable_deepep_moe: bool = False
|
155
|
+
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
156
|
+
ep_num_redundant_experts: int = 0
|
157
|
+
ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
|
158
|
+
init_expert_location: str = "trivial"
|
159
|
+
enable_eplb: bool = False
|
160
|
+
eplb_algorithm: str = "auto"
|
161
|
+
eplb_rebalance_num_iterations: int = 1000
|
162
|
+
eplb_rebalance_layers_per_chunk: Optional[int] = None
|
163
|
+
expert_distribution_recorder_mode: Optional[
|
164
|
+
Literal["stat", "stat_approx", "per_pass", "per_token"]
|
165
|
+
] = None
|
166
|
+
expert_distribution_recorder_buffer_size: Optional[int] = None
|
167
|
+
enable_expert_distribution_metrics: bool = False
|
168
|
+
deepep_config: Optional[str] = None
|
169
|
+
moe_dense_tp_size: Optional[int] = None
|
170
|
+
|
150
171
|
# Double Sparsity
|
151
172
|
enable_double_sparsity: bool = False
|
152
173
|
ds_channel_config_path: Optional[str] = None
|
@@ -157,34 +178,24 @@ class ServerArgs:
|
|
157
178
|
|
158
179
|
# Optimization/debug options
|
159
180
|
disable_radix_cache: bool = False
|
181
|
+
cuda_graph_max_bs: Optional[int] = None
|
182
|
+
cuda_graph_bs: Optional[List[int]] = None
|
160
183
|
disable_cuda_graph: bool = False
|
161
184
|
disable_cuda_graph_padding: bool = False
|
185
|
+
enable_profile_cuda_graph: bool = False
|
162
186
|
enable_nccl_nvls: bool = False
|
163
187
|
enable_tokenizer_batch_encode: bool = False
|
164
188
|
disable_outlines_disk_cache: bool = False
|
165
189
|
disable_custom_all_reduce: bool = False
|
190
|
+
enable_mscclpp: bool = False
|
166
191
|
disable_overlap_schedule: bool = False
|
192
|
+
disable_overlap_cg_plan: bool = False
|
167
193
|
enable_mixed_chunk: bool = False
|
168
194
|
enable_dp_attention: bool = False
|
169
195
|
enable_dp_lm_head: bool = False
|
170
|
-
|
171
|
-
enable_deepep_moe: bool = False
|
172
|
-
deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
|
173
|
-
ep_num_redundant_experts: int = 0
|
174
|
-
ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
|
175
|
-
init_expert_location: str = "trivial"
|
176
|
-
enable_eplb: bool = False
|
177
|
-
eplb_rebalance_num_iterations: int = 1000
|
178
|
-
expert_distribution_recorder_mode: Optional[
|
179
|
-
Literal["stat", "per_pass", "per_token"]
|
180
|
-
] = None
|
181
|
-
expert_distribution_recorder_buffer_size: Optional[int] = None
|
182
|
-
enable_expert_distribution_metrics: bool = False
|
183
|
-
deepep_config: Optional[str] = None
|
196
|
+
enable_two_batch_overlap: bool = False
|
184
197
|
enable_torch_compile: bool = False
|
185
198
|
torch_compile_max_bs: int = 32
|
186
|
-
cuda_graph_max_bs: Optional[int] = None
|
187
|
-
cuda_graph_bs: Optional[List[int]] = None
|
188
199
|
torchao_config: str = ""
|
189
200
|
enable_nan_detection: bool = False
|
190
201
|
enable_p2p_check: bool = False
|
@@ -195,29 +206,32 @@ class ServerArgs:
|
|
195
206
|
enable_memory_saver: bool = False
|
196
207
|
allow_auto_truncate: bool = False
|
197
208
|
enable_custom_logit_processor: bool = False
|
198
|
-
tool_call_parser: Optional[str] = None
|
199
209
|
enable_hierarchical_cache: bool = False
|
200
210
|
hicache_ratio: float = 2.0
|
201
211
|
hicache_size: int = 0
|
202
212
|
hicache_write_policy: str = "write_through_selective"
|
203
213
|
flashinfer_mla_disable_ragged: bool = False
|
204
|
-
|
205
|
-
moe_dense_tp_size: Optional[int] = None
|
206
|
-
n_share_experts_fusion: int = 0
|
214
|
+
disable_shared_experts_fusion: bool = False
|
207
215
|
disable_chunked_prefix_cache: bool = False
|
208
216
|
disable_fast_image_processor: bool = False
|
209
|
-
|
217
|
+
enable_return_hidden_states: bool = False
|
218
|
+
warmups: Optional[str] = None
|
210
219
|
|
211
220
|
# Debug tensor dumps
|
212
221
|
debug_tensor_dump_output_folder: Optional[str] = None
|
213
222
|
debug_tensor_dump_input_file: Optional[str] = None
|
214
223
|
debug_tensor_dump_inject: bool = False
|
224
|
+
debug_tensor_dump_prefill_only: bool = False
|
215
225
|
|
216
226
|
# For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
|
217
227
|
disaggregation_mode: str = "null"
|
218
|
-
disaggregation_bootstrap_port: int = 8998
|
219
228
|
disaggregation_transfer_backend: str = "mooncake"
|
229
|
+
disaggregation_bootstrap_port: int = 8998
|
230
|
+
disaggregation_decode_tp: Optional[int] = None
|
231
|
+
disaggregation_decode_dp: Optional[int] = None
|
232
|
+
disaggregation_prefill_pp: Optional[int] = 1
|
220
233
|
disaggregation_ib_device: Optional[str] = None
|
234
|
+
num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
|
221
235
|
pdlb_url: Optional[str] = None
|
222
236
|
|
223
237
|
def __post_init__(self):
|
@@ -243,40 +257,72 @@ class ServerArgs:
|
|
243
257
|
|
244
258
|
gpu_mem = get_device_memory_capacity(self.device)
|
245
259
|
|
246
|
-
# Set mem fraction static
|
260
|
+
# Set mem fraction static
|
247
261
|
if self.mem_fraction_static is None:
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
262
|
+
if gpu_mem is not None:
|
263
|
+
# GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
|
264
|
+
# mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
|
265
|
+
|
266
|
+
# We want mem_fraction_static to be as large as possible but still has enough room
|
267
|
+
# for activations and cuda graph buffers. We use the following heuristic to
|
268
|
+
# compute the needed size for activations and cuda graph buffers:
|
269
|
+
# - The size of the activation depends on the chunked_prefill_size and model size.
|
270
|
+
# - The size of cuda graph buffers depends on the cuda graph capture range and model size.
|
271
|
+
# For GPUs with more memory, we use a larger chunked_prefill_size and
|
272
|
+
# capture more cuda graphs, so they need to reserve more memory.
|
273
|
+
parallel_size = self.tp_size * self.pp_size
|
274
|
+
|
275
|
+
if gpu_mem < 20 * 1024:
|
276
|
+
# T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
|
277
|
+
reserved_mem = (2.8 + parallel_size / 10) * 1024
|
278
|
+
elif gpu_mem < 35 * 1024:
|
279
|
+
# A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
|
280
|
+
reserved_mem = (2.8 + parallel_size / 10) * 1024
|
281
|
+
elif gpu_mem < 90 * 1024:
|
282
|
+
# H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
|
283
|
+
reserved_mem = (9.5 + parallel_size / 2) * 1024
|
284
|
+
elif gpu_mem < 100 * 1024:
|
285
|
+
# H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
|
286
|
+
reserved_mem = (12 + parallel_size / 2) * 1024
|
287
|
+
elif gpu_mem < 160 * 1024:
|
288
|
+
# H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
|
289
|
+
reserved_mem = (12 + parallel_size / 2) * 1024
|
258
290
|
else:
|
259
|
-
|
291
|
+
# B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
|
292
|
+
reserved_mem = 32 * 1024
|
293
|
+
|
294
|
+
if self.speculative_algorithm is not None:
|
295
|
+
# draft model and larger cuda graph buffers
|
296
|
+
reserved_mem += 2 * 1024
|
297
|
+
if self.enable_dp_attention:
|
298
|
+
reserved_mem += 4 * 1024
|
299
|
+
|
300
|
+
self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
|
260
301
|
else:
|
261
302
|
self.mem_fraction_static = 0.88
|
262
|
-
if gpu_mem is not None and gpu_mem > 96 * 1024:
|
263
|
-
mem_fraction = self.mem_fraction_static
|
264
|
-
self.mem_fraction_static = min(
|
265
|
-
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
|
266
|
-
(gpu_mem - 1024 * 18)
|
267
|
-
/ gpu_mem, # 15 GB + additional 3GB for cuda graph
|
268
|
-
)
|
269
303
|
|
270
304
|
# Set chunked prefill size, which depends on the gpu memory capacity
|
271
305
|
if self.chunked_prefill_size is None:
|
272
|
-
if gpu_mem is not None
|
273
|
-
|
274
|
-
|
275
|
-
|
306
|
+
if gpu_mem is not None:
|
307
|
+
if gpu_mem < 35 * 1024: # A10, L40, 4090
|
308
|
+
self.chunked_prefill_size = 2048
|
309
|
+
elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
|
310
|
+
self.chunked_prefill_size = 8192
|
311
|
+
else: # B200, MI300
|
312
|
+
self.chunked_prefill_size = 16384
|
276
313
|
else:
|
277
|
-
self.chunked_prefill_size =
|
314
|
+
self.chunked_prefill_size = 4096
|
278
315
|
assert self.chunked_prefill_size % self.page_size == 0
|
279
316
|
|
317
|
+
# Set cuda graph max batch size
|
318
|
+
if self.cuda_graph_max_bs is None:
|
319
|
+
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
320
|
+
if gpu_mem is not None and gpu_mem < 35 * 1024:
|
321
|
+
if self.tp_size < 4:
|
322
|
+
self.cuda_graph_max_bs = 8
|
323
|
+
else:
|
324
|
+
self.cuda_graph_max_bs = 80
|
325
|
+
|
280
326
|
assert self.moe_dense_tp_size in {
|
281
327
|
1,
|
282
328
|
None,
|
@@ -294,21 +340,17 @@ class ServerArgs:
|
|
294
340
|
)
|
295
341
|
self.page_size = 128
|
296
342
|
|
297
|
-
# Set cuda graph max batch size
|
298
|
-
if self.cuda_graph_max_bs is None:
|
299
|
-
# Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
|
300
|
-
if gpu_mem is not None and gpu_mem < 25_000:
|
301
|
-
if self.tp_size < 4:
|
302
|
-
self.cuda_graph_max_bs = 8
|
303
|
-
else:
|
304
|
-
self.cuda_graph_max_bs = 80
|
305
|
-
|
306
343
|
# Set kernel backends for hpu device
|
307
344
|
if self.device == "hpu":
|
308
345
|
self.attention_backend = "torch_native"
|
309
346
|
self.sampling_backend = "pytorch"
|
310
347
|
|
311
348
|
# Set kernel backends
|
349
|
+
if self.device == "cpu":
|
350
|
+
if self.attention_backend is None:
|
351
|
+
self.attention_backend = "intel_amx"
|
352
|
+
self.sampling_backend = "pytorch"
|
353
|
+
|
312
354
|
if self.sampling_backend is None:
|
313
355
|
self.sampling_backend = (
|
314
356
|
"flashinfer" if is_flashinfer_available() else "pytorch"
|
@@ -365,12 +407,28 @@ class ServerArgs:
|
|
365
407
|
"Pipeline parallelism is incompatible with overlap schedule."
|
366
408
|
)
|
367
409
|
|
410
|
+
if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
|
411
|
+
self.expert_distribution_recorder_mode = "stat"
|
412
|
+
logger.info(
|
413
|
+
"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
|
414
|
+
)
|
415
|
+
|
416
|
+
if (self.enable_eplb or (self.init_expert_location is not None)) and (
|
417
|
+
self.ep_dispatch_algorithm is None
|
418
|
+
):
|
419
|
+
self.ep_dispatch_algorithm = "static"
|
420
|
+
logger.info(
|
421
|
+
"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
|
422
|
+
)
|
423
|
+
|
424
|
+
if self.enable_expert_distribution_metrics and (
|
425
|
+
self.expert_distribution_recorder_mode is None
|
426
|
+
):
|
427
|
+
self.expert_distribution_recorder_mode = "stat"
|
428
|
+
|
368
429
|
if self.expert_distribution_recorder_buffer_size is None:
|
369
|
-
|
370
|
-
|
371
|
-
# self.expert_distribution_recorder_buffer_size = x
|
372
|
-
if False:
|
373
|
-
pass
|
430
|
+
if (x := self.eplb_rebalance_num_iterations) is not None:
|
431
|
+
self.expert_distribution_recorder_buffer_size = x
|
374
432
|
elif self.expert_distribution_recorder_mode is not None:
|
375
433
|
self.expert_distribution_recorder_buffer_size = 1000
|
376
434
|
|
@@ -387,6 +445,12 @@ class ServerArgs:
|
|
387
445
|
"Overlap scheduler is disabled because of using "
|
388
446
|
"eagle speculative decoding."
|
389
447
|
)
|
448
|
+
if self.enable_mixed_chunk:
|
449
|
+
self.enable_mixed_chunk = False
|
450
|
+
logger.warning(
|
451
|
+
"Mixed chunked prefill is disabled because of using "
|
452
|
+
"eagle speculative decoding."
|
453
|
+
)
|
390
454
|
|
391
455
|
model_arch = get_model_arch(self)
|
392
456
|
|
@@ -409,7 +473,7 @@ class ServerArgs:
|
|
409
473
|
self.speculative_num_steps,
|
410
474
|
self.speculative_eagle_topk,
|
411
475
|
self.speculative_num_draft_tokens,
|
412
|
-
) = auto_choose_speculative_params(
|
476
|
+
) = auto_choose_speculative_params(self)
|
413
477
|
|
414
478
|
if self.page_size > 1 and self.speculative_eagle_topk > 1:
|
415
479
|
self.speculative_eagle_topk = 1
|
@@ -444,12 +508,27 @@ class ServerArgs:
|
|
444
508
|
self.triton_attention_num_kv_splits = 16
|
445
509
|
|
446
510
|
# PD disaggregation
|
447
|
-
if self.disaggregation_mode == "
|
448
|
-
|
449
|
-
|
450
|
-
|
511
|
+
if self.disaggregation_mode == "decode":
|
512
|
+
assert (
|
513
|
+
self.disaggregation_decode_tp is None
|
514
|
+
), "Cannot set --disaggregation-decode-tp for the decode engine."
|
515
|
+
assert (
|
516
|
+
self.disaggregation_decode_dp is None
|
517
|
+
), "Cannot set --disaggregation-decode-dp for the decode engine."
|
518
|
+
|
451
519
|
self.disable_radix_cache = True
|
452
520
|
logger.warning("KV cache is forced as chunk cache for decode server")
|
521
|
+
elif self.disaggregation_mode == "prefill":
|
522
|
+
if self.disaggregation_decode_tp is None:
|
523
|
+
self.disaggregation_decode_tp = self.tp_size
|
524
|
+
if self.disaggregation_decode_dp is None:
|
525
|
+
self.disaggregation_decode_dp = self.dp_size
|
526
|
+
|
527
|
+
self.disaggregation_prefill_pp = self.pp_size
|
528
|
+
self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
|
529
|
+
|
530
|
+
self.disable_cuda_graph = True
|
531
|
+
logger.warning("Cuda graph is disabled for prefill server")
|
453
532
|
|
454
533
|
os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
|
455
534
|
"1" if self.enable_torch_compile else "0"
|
@@ -459,6 +538,14 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
@@ -475,10 +562,16 @@ class ServerArgs:
             help="The path of the tokenizer.",
         )
         parser.add_argument(
-            "--host",
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
         )
         parser.add_argument(
-            "--port",
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -633,6 +726,18 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )

         # Memory and scheduling
         parser.add_argument(
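The `--impl` help text above describes an auto/sglang/transformers fallback. The function below is a hypothetical restatement of that selection logic for illustration; `resolve_impl` and `has_sglang_impl` are not sglang identifiers.

```python
def resolve_impl(impl: str, has_sglang_impl: bool) -> str:
    # "auto" prefers the SGLang implementation and falls back to Transformers;
    # explicit choices are taken as-is.
    if impl == "auto":
        return "sglang" if has_sglang_impl else "transformers"
    return impl


assert resolve_impl("auto", has_sglang_impl=True) == "sglang"
assert resolve_impl("auto", has_sglang_impl=False) == "transformers"
assert resolve_impl("transformers", has_sglang_impl=True) == "transformers"
```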
@@ -766,6 +871,11 @@ class ServerArgs:
             default=ServerArgs.gpu_id_step,
             help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
         )
+        parser.add_argument(
+            "--sleep-on-idle",
+            action="store_true",
+            help="Reduce CPU usage when sglang is idle.",
+        )

         # Logging
         parser.add_argument(
@@ -873,6 +983,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -893,15 +1010,6 @@ class ServerArgs:
             ],
         )

-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -957,12 +1065,13 @@ class ServerArgs:
             type=str,
             choices=[
                 "aiter",
-                "
-                "triton",
-                "torch_native",
+                "cutlass_mla",
                 "fa3",
+                "flashinfer",
                 "flashmla",
-                "
+                "intel_amx",
+                "torch_native",
+                "triton",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
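To see how the reordered `--attention-backend` choices behave at the CLI, the fragment below builds a throwaway parser with the same choices list. It is not `ServerArgs.add_cli_args`; the default of `None` is a simplification for this sketch.

```python
import argparse

# Minimal parser mirroring only the choices list from the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--attention-backend",
    type=str,
    choices=[
        "aiter",
        "cutlass_mla",
        "fa3",
        "flashinfer",
        "flashmla",
        "intel_amx",
        "torch_native",
        "triton",
    ],
    default=None,
)

args = parser.parse_args(["--attention-backend", "intel_amx"])
print(args.attention_backend)  # -> intel_amx
# Any value outside the list makes parse_args() exit with an "invalid choice" error.
```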
@@ -981,21 +1090,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )

         # Speculative decoding
         parser.add_argument(
@@ -1045,6 +1139,109 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )

         # Double Sparsity
         parser.add_argument(
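The new `--deepep-config` help text says the value may be inline JSON or a file path. One way such a value could be resolved is sketched below; `load_deepep_config` is hypothetical and not the loader sglang actually uses.

```python
import json
import os
from typing import Any, Dict, Optional


def load_deepep_config(value: Optional[str]) -> Optional[Dict[str, Any]]:
    # Treat an existing path as a JSON file, otherwise parse the string itself.
    if value is None:
        return None
    if os.path.isfile(value):
        with open(value) as f:
            return json.load(f)
    return json.loads(value)


print(load_deepep_config('{"example": 1}'))
```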
@@ -1089,6 +1286,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1099,6 +1308,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1119,11 +1333,21 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1140,9 +1364,9 @@ class ServerArgs:
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
         parser.add_argument(
-            "--enable-
+            "--enable-two-batch-overlap",
             action="store_true",
-            help="Enabling
+            help="Enabling two micro batches to overlap.",
         )
         parser.add_argument(
             "--enable-torch-compile",
@@ -1155,18 +1379,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1223,13 +1435,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1255,82 +1460,14 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
         parser.add_argument(
-            "--
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
+            "--flashinfer-mla-disable-ragged",
             action="store_true",
-            help="
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
         )
         parser.add_argument(
-            "--
+            "--disable-shared-experts-fusion",
             action="store_true",
-            help="
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster.",
-        )
-
-        parser.add_argument(
-            "--n-share-experts-fusion",
-            type=int,
-            default=0,
-            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
@@ -1342,8 +1479,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-
+        parser.add_argument(
+            "--enable-return-hidden-states",
+            action="store_true",
+            help="Enable returning hidden states with responses.",
+        )
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1371,6 +1511,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )

         # Disaggregation
         parser.add_argument(
@@ -1380,6 +1525,13 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
+        parser.add_argument(
+            "--disaggregation-transfer-backend",
+            type=str,
+            default=ServerArgs.disaggregation_transfer_backend,
+            choices=["mooncake", "nixl"],
+            help="The backend for disaggregation transfer. Default is mooncake.",
+        )
         parser.add_argument(
             "--disaggregation-bootstrap-port",
             type=int,
@@ -1387,11 +1539,22 @@ class ServerArgs:
             help="Bootstrap server port on the prefill server. Default is 8998.",
         )
         parser.add_argument(
-            "--disaggregation-
-            type=
-            default=ServerArgs.
-
-
+            "--disaggregation-decode-tp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_tp,
+            help="Decode tp size. If not set, it matches the tp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-decode-dp",
+            type=int,
+            default=ServerArgs.disaggregation_decode_dp,
+            help="Decode dp size. If not set, it matches the dp size of the current engine. This is only set on the prefill server.",
+        )
+        parser.add_argument(
+            "--disaggregation-prefill-pp",
+            type=int,
+            default=ServerArgs.disaggregation_prefill_pp,
+            help="Prefill pp size. If not set, it is default to 1. This is only set on the decode server.",
         )
         parser.add_argument(
             "--disaggregation-ib-device",
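The `--disaggregation-decode-tp`, `--disaggregation-decode-dp`, and `--disaggregation-prefill-pp` flags default as described in their help strings and in the prefill branch earlier in this diff. The sketch below restates that defaulting; `DisaggArgs` and `resolve_prefill_side` are illustrative names, not sglang APIs.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DisaggArgs:
    # Hypothetical stand-in for the relevant ServerArgs fields.
    tp_size: int = 8
    dp_size: int = 1
    pp_size: int = 1
    disaggregation_decode_tp: Optional[int] = None
    disaggregation_decode_dp: Optional[int] = None
    disaggregation_prefill_pp: Optional[int] = None


def resolve_prefill_side(args: DisaggArgs) -> DisaggArgs:
    # On the prefill server, unset decode tp/dp fall back to the engine's own
    # sizes, and the prefill pp size is taken from pp_size.
    if args.disaggregation_decode_tp is None:
        args.disaggregation_decode_tp = args.tp_size
    if args.disaggregation_decode_dp is None:
        args.disaggregation_decode_dp = args.dp_size
    args.disaggregation_prefill_pp = args.pp_size
    return args


print(resolve_prefill_side(DisaggArgs(tp_size=4, dp_size=2)))
```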
@@ -1401,6 +1564,12 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
@@ -1408,14 +1577,6 @@ class ServerArgs:
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )

-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1451,7 +1612,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"

@@ -1585,18 +1746,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]


-def auto_choose_speculative_params(
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.

     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else: