sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -19,12 +19,11 @@ import json
 import logging
 import os
 import random
-import sys
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union

+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -36,18 +35,22 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
+from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)

-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -60,6 +63,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]

 QUANTIZATION_CHOICES = [
@@ -86,9 +90,12 @@ ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
@@ -102,8 +109,29 @@ ATTENTION_BACKEND_CHOICES = [
     "ascend",
 ]

+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]

+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+
+MOE_RUNNER_BACKEND_CHOICES = [
+    "auto",
+    "deep_gemm",
+    "triton",
+    "triton_kernel",
+    "flashinfer_trtllm",
+    "flashinfer_cutlass",
+    "flashinfer_mxfp4",
+    "flashinfer_cutedsl",
+]
+

 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +150,22 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)


+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
+def add_moe_runner_backend_choices(choices):
+    MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
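
The hunk above adds module-level choice lists (grammar, MoE runner, deterministic attention, radix eviction) together with matching add_*_choices hooks, extending the pattern already used for load formats and disaggregation transfer backends. A minimal usage sketch, assuming only the names visible in this hunk (the backend name "my_grammar" is a hypothetical plugin):

    # Illustrative only: out-of-tree code registering an extra grammar backend
    # name before server arguments are parsed.
    from sglang.srt.server_args import (
        GRAMMAR_BACKEND_CHOICES,
        add_grammar_backend_choices,
    )

    add_grammar_backend_choices(["my_grammar"])
    assert "my_grammar" in GRAMMAR_BACKEND_CHOICES
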
@@ -133,6 +177,7 @@ class ServerArgs:
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
+    modelopt_quant: Optional[Union[str, Dict]] = None
     context_length: Optional[int] = None
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
@@ -151,26 +196,31 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
+    enable_fp32_lm_head: bool = False

     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] =
+    max_queued_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
+    radix_eviction_policy: str = "lru"

     # Runtime options
     device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
-
+    pp_max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
@@ -191,6 +241,8 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+    tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
@@ -201,6 +253,8 @@ class ServerArgs:
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"

     # API related
     api_key: Optional[str] = None
@@ -213,10 +267,14 @@ class ServerArgs:
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
     tool_server: Optional[str] = None
+    sampling_defaults: str = "model"

     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False

     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -237,6 +295,7 @@ class ServerArgs:
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
+    max_lora_chunk_size: Optional[int] = 16

     # Kernel backend
     attention_backend: Optional[str] = None
@@ -245,28 +304,33 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
+    nsa_prefill: str = "flashmla_prefill"
+    nsa_decode: str = "fa3"

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
+    # For ngram only
+    speculative_ngram_min_match_window_size: int = 1
+    speculative_ngram_max_match_window_size: int = 12
+    speculative_ngram_min_bfs_breadth: int = 1
+    speculative_ngram_max_bfs_breadth: int = 10
+    speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+    speculative_ngram_branch_length: int = 18
+    speculative_ngram_capacity: int = 10 * 1000 * 1000

     # Expert parallelism
     ep_size: int = 1
     moe_a2a_backend: Literal["none", "deepep"] = "none"
-    moe_runner_backend: Literal[
-        "auto",
-        "triton",
-        "triton_kernel",
-        "flashinfer_trtllm",
-        "flashinfer_cutlass",
-        "flashinfer_mxfp4",
-    ] = "auto"
+    moe_runner_backend: str = "auto"
     flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -286,6 +350,10 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -296,6 +364,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -312,6 +382,12 @@ class ServerArgs:
     offload_prefetch_step: int = 1
     offload_mode: str = "cpu"

+    # Scoring configuration
+    # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+    # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+    # This enables efficient batch processing of multiple items against a single query.
+    multi_item_scoring_delimiter: Optional[Union[int]] = None
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -327,11 +403,13 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
@@ -340,17 +418,27 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
    allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -359,66 +447,124 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False

     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode:
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1

-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None

     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3

-    [7 removed lines (old 380-386) whose content is not preserved in this rendering]
+    def get_attention_backends(server_args):
+        prefill_attention_backend_str = (
+            server_args.prefill_attention_backend
+            if server_args.prefill_attention_backend
+            else server_args.attention_backend
+        )
+        decode_attention_backend_str = (
+            server_args.decode_attention_backend
+            if server_args.decode_attention_backend
+            else server_args.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str

     def __post_init__(self):
-        [30 removed lines (old 389-418) whose content is not preserved in this rendering]
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
+
+        # Set missing default values.
+        self._handle_missing_default_values()
+
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_deepep_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()
+
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
+
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+    def _handle_deprecated_args(self):
+        # handle deprecated tool call parsers
+        deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+        if self.tool_call_parser in deprecated_tool_call_parsers:
+            logger.warning(
+                f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
            )
+            self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]

-
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.served_model_name is None:
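
The new get_attention_backends helper added above resolves the prefill and decode attention backends independently: a per-phase override wins, otherwise both fall back to the shared attention_backend. A standalone sketch of that resolution rule (not an import from the package, just the same fallback logic):

    # Minimal sketch of the fallback rule implemented by get_attention_backends().
    def resolve_backend(phase_override, shared):
        return phase_override if phase_override else shared

    assert resolve_backend(None, "flashinfer") == "flashinfer"   # falls back to the shared setting
    assert resolve_backend("fa3", "flashinfer") == "fa3"         # per-phase override wins
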
@@ -428,51 +574,140 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-
+    def _handle_gpu_memory_settings(self, gpu_mem):
+        """
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+        The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40,
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
+                    else:
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            else:
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160

-        # Set
-        if self.
-
-
-
-
-        # We want mem_fraction_static to be as large as possible but still has enough room
-        # for activations and cuda graph buffers. We use the following heuristic to
-        # compute the needed size for activations and cuda graph buffers:
-        # - The size of the activation depends on the chunked_prefill_size and model size.
-        # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
-        # For GPUs with more memory, we use a larger chunked_prefill_size and
-        # capture more cuda graphs, so they need to reserve more memory.
-        parallel_size = self.tp_size * self.pp_size
-
-        if gpu_mem < 20 * 1024:
-            # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-            reserved_mem = (2.8 + parallel_size / 10) * 1024
-        elif gpu_mem < 35 * 1024:
-            # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-            reserved_mem = (2.8 + parallel_size / 10) * 1024
-        elif gpu_mem < 90 * 1024:
-            # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
-            reserved_mem = (9.5 + parallel_size / 2) * 1024
-        elif gpu_mem < 100 * 1024:
-            # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-            reserved_mem = (12 + parallel_size / 2) * 1024
-        elif gpu_mem < 160 * 1024:
-            # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-            reserved_mem = (12 + parallel_size / 2) * 1024
-        else:
-            # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
-            reserved_mem = 32 * 1024
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)

-
-
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 512
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
+            else:
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem is not None and gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalonedraft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
                     reserved_mem += 2 * 1024
-        if self.enable_dp_attention:
-            reserved_mem += 4 * 1024

-
-
-
+            self.mem_fraction_static = (
+                round((gpu_mem - reserved_mem) / gpu_mem, 3)
+                if gpu_mem is not None
+                else 0.88
+            )

         # Lazy init to avoid circular import
         # Multimodal models need more memory for the image processor
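
To make the new reserved-memory heuristic above concrete, here is a small worked example with assumed numbers (gpu_mem = 81920, in the same units returned by get_device_memory_capacity; chunked_prefill_size = 8192, cuda_graph_max_bs = 256, tp_size = pp_size = 1, no DP attention, no speculative decoding); the real values depend on the detected hardware and the branches shown above:

    gpu_mem = 81920                                        # assumed capacity
    chunked_prefill_size = 8192
    cuda_graph_max_bs = 256
    tp_size = pp_size = 1

    reserved_mem = 512                                     # constant metadata
    reserved_mem += max(chunked_prefill_size, 2048) * 1.5  # prefill activations
    reserved_mem += cuda_graph_max_bs * 2                  # cuda graph buffers
    reserved_mem += tp_size * pp_size / 8 * 1024           # parallelism adjustment
    reserved_mem = max(reserved_mem, 10 * 1024)            # floor applied when gpu_mem > 60 * 1024

    mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
    print(reserved_mem, mem_fraction_static)               # 13440.0 0.836
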
@@ -482,53 +717,197 @@ class ServerArgs:
             if model_config.is_multimodal:
                 self.adjust_mem_fraction_for_vlm(model_config)

-    [11 removed lines (old 485-495) whose content is not preserved in this rendering]
+    def _generate_cuda_graph_batch_sizes(self):
+        """
+        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+        This integrates the logic from cuda_graph_runner.py.
+        """
+        # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+        if self.disable_cuda_graph_padding:
+            capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+        elif self.speculative_algorithm is None:
+            # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
+            capture_bs = (
+                [1, 2, 4, 8, 12]
+                + list(range(16, 257, 8))
+                + list(range(272, 512, 16))
+                + list(range(512, self.cuda_graph_max_bs + 1, 32))
+            )
+        else:
+            # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
+            capture_bs = (
+                list(range(1, 9, 1))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 4))
+                + list(range(72, 257, 8))
+                + list(range(272, self.cuda_graph_max_bs + 1, 16))
+            )

-
-
-
-        if gpu_mem is not None and gpu_mem < 35 * 1024:
-            if self.tp_size < 4:
-                self.cuda_graph_max_bs = 8
-            else:
-                self.cuda_graph_max_bs = 80
+        capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
+
+        return capture_bs

-
+    def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

-
-        self.model_specific_adjustments()
-
-        # Set kernel backends
+    def _handle_cpu_backends(self):
         if self.device == "cpu":
             if self.attention_backend is None:
                 self.attention_backend = "intel_amx"
             self.sampling_backend = "pytorch"

+    def _handle_model_specific_adjustments(self):
+        from sglang.srt.configs.model_config import is_deepseek_nsa
+
+        if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+            return
+
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+
+            supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
+            prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
+            assert (
+                prefill_attn_backend in supported_backends
+                and decode_attn_backend in supported_backends
+            ), (
+                f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
+                f"- Prefill: {prefill_attn_backend}\n"
+                f"- Decode: {decode_attn_backend}\n"
+            )
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        elif "Llama4" in model_arch and self.device != "cpu":
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+        if is_deepseek_nsa(hf_config):
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                self.attention_backend = "nsa"
+                logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+            if not is_npu():
+                self.enable_dp_attention = True
+                self.dp_size = self.tp_size
+                logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+            self.page_size = 64
+            logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+            # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+            import torch
+
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 10:
+                self.kv_cache_dtype = "fp8_e4m3"
+                logger.warning("Setting KV cache dtype to fp8.")
+
+            if self.kv_cache_dtype == "fp8_e4m3":
+                self.nsa_prefill = "flashmla_decode"
+                self.nsa_decode = "flashmla_decode"
+                logger.warning(
+                    "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+                )
+
+            # Logging env vars for NSA
+            from sglang.srt.layers.attention.nsa.utils import (
+                print_nsa_bool_env_vars,
+            )
+
+            print_nsa_bool_env_vars()
+
+    def _handle_sampling_backend(self):
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
             )

+    def _handle_attention_backend_compatibility(self):
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True

-        if self.attention_backend == "
+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+        if is_npu() and self.attention_backend in ["ascend"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -590,30 +969,30 @@ class ServerArgs:

         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
-            self.disable_cuda_graph = True
             self.disable_radix_cache = True

-
+    def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1

-
+    def _handle_amd_specifics(self):
         if is_hip():
             self.triton_attention_num_kv_splits = 16

-
+    def _handle_grammar_backend(self):
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-
+    def _handle_data_parallelism(self):
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -625,7 +1004,7 @@ class ServerArgs:
                 self.enable_dp_attention
             ), "Please enable dp attention when setting enable_dp_lm_head. "

-
+    def _handle_moe_kernel_config(self):
         if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
@@ -636,13 +1015,15 @@ class ServerArgs:
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )

-
+    def _handle_deepep_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -652,6 +1033,7 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )

+    def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.warning(
@@ -666,6 +1048,7 @@ class ServerArgs:
         if self.enable_eplb:
             assert self.ep_size > 1

+    def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
         ):
@@ -677,25 +1060,42 @@ class ServerArgs:
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000

-
+    def _handle_pipeline_parallelism(self):
         if self.pp_size > 1:
             self.disable_overlap_schedule = True
             logger.warning(
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

-
+    def _handle_hicache(self):
         if self.hicache_storage_backend == "mooncake":
-
-
-
+            if self.hicache_mem_layout == "layer_first":
+                if self.hicache_io_backend == "direct":
+                    self.hicache_mem_layout = "page_first_direct"
+                elif self.hicache_io_backend == "kernel":
+                    self.hicache_mem_layout = "page_first"
+                logger.warning(
+                    f"Mooncake storage backend does not support layer_first layout, "
+                    f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
+                )
+
+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )

-
+    def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
-            # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
+                # TODO: support dp attention for standalone speculative decoding
+                raise ValueError(
+                    "Currently standalone speculative decoding does not support dp attention."
+                )
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -711,8 +1111,13 @@ class ServerArgs:
             )

             model_arch = self.get_hf_config().architectures[0]
-            if model_arch in [
-
+            if model_arch in [
+                "DeepseekV32ForCausalLM",
+                "DeepseekV3ForCausalLM",
+                "Glm4MoeForCausalLM",
+                "BailingMoeForCausalLM",
+                "BailingMoeV2ForCausalLM",
+            ]:
                 if self.speculative_draft_model_path is None:
                     self.speculative_draft_model_path = self.model_path
                 else:
@@ -720,7 +1125,6 @@ class ServerArgs:
                         "DeepSeek MTP does not require setting speculative_draft_model_path."
                     )

-            # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
                     self.speculative_eagle_topk is None
@@ -760,23 +1164,63 @@ class ServerArgs:
                     "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
                 )

-
-
-
+        if self.speculative_algorithm == "NGRAM":
+            if not self.device.startswith("cuda"):
+                raise ValueError(
+                    "Ngram speculative decoding only supports CUDA device."
+                )
+            if self.max_running_requests is None:
+                self.max_running_requests = 48
+            self.disable_overlap_schedule = True
+            self.enable_mixed_chunk = False
+            self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
+            if self.speculative_num_draft_tokens is None:
+                self.speculative_num_draft_tokens = (
+                    self.speculative_ngram_max_match_window_size
+                )
+            logger.warning(
+                "The overlap scheduler and mixed chunked prefill are disabled because of "
+                "using ngram speculative decoding."
+            )

-
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
+                    f"with page_size({self.page_size}) > 1 is unstable "
+                    "and produces incorrect results for paged attention backends. "
+                    "This combination is only supported for the 'flashinfer' backend."
+                )
+            if self.enable_dp_attention:
+                # TODO: support dp attention for ngram speculative decoding
+                raise ValueError(
+                    "Currently ngram speculative decoding does not support dp attention."
+                )
+
+    def _handle_load_format(self):
         if (
             self.load_format == "auto" or self.load_format == "gguf"
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
+
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []

-
+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
+    def _handle_disaggregation(self):
         if self.disaggregation_mode == "decode":
             assert (
                 self.disaggregation_decode_tp is None
@@ -787,6 +1231,13 @@ class ServerArgs:

             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -795,25 +1246,84 @@ class ServerArgs:

             self.disaggregation_prefill_pp = self.pp_size
             self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
-
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

-
+    def _handle_tokenizer_batching(self):
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
+    def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
-
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
         )
+        os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
+            "1" if self.enable_deterministic_inference else "0"
+        )

+    def _handle_cache_compatibility(self):
         if self.enable_hierarchical_cache and self.disable_radix_cache:
             raise ValueError(
                 "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
                 "and cannot be used at the same time. Please use only one of them."
             )

+        if (
+            self.disaggregation_decode_enable_offload_kvcache
+            and self.disaggregation_mode != "decode"
+        ):
+            raise ValueError(
+                "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
+            )
+
+    def _handle_metrics_labels(self):
+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_custom_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
+            )
+
+    def _handle_deterministic_inference(self):
+        if self.enable_deterministic_inference:
+            # Check sampling backend
+            self.sampling_backend = "pytorch"
+            logger.warning(
+                "Sampling backend is set to pytorch for deterministic inference."
+            )
+
+            # Check attention backend
+            if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+                raise ValueError(
+                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                )
+
+            # Currently, only FA3 supports radix cache. Support for other backends is in progress
+            if self.attention_backend != "fa3":
+                self.disable_radix_cache = True
+                logger.warning(
+                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
+                )
+
+            # Check TP size
+            if self.tp_size > 1:
+                os.environ["NCCL_ALGO"] = "allreduce:tree"
+                self.disable_custom_all_reduce = True
+                logger.warning(
+                    "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
+                )
+
+    def _handle_other_validations(self):
+        pass
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -830,12 +1340,6 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument(
-            "--tokenizer-worker-num",
-            type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
-        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -845,6 +1349,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -985,6 +1495,14 @@ class ServerArgs:
             "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
             "default to 1.0, which may cause accuracy issues. ",
         )
+        parser.add_argument(
+            "--modelopt-quant",
+            type=str,
+            default=ServerArgs.modelopt_quant,
+            help="The ModelOpt quantization configuration. "
+            "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
+            "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
+        )
         parser.add_argument(
             "--kv-cache-dtype",
             type=str,
@@ -992,6 +1510,11 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
+        )

         # Memory and scheduling
         parser.add_argument(
@@ -1035,9 +1558,27 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
+        parser.add_argument(
+            "--enable-priority-scheduling",
+            action="store_true",
+            default=ServerArgs.enable_priority_scheduling,
+            help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
+        )
+        parser.add_argument(
+            "--schedule-low-priority-values-first",
+            action="store_true",
+            default=ServerArgs.schedule_low_priority_values_first,
+            help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
+        )
+        parser.add_argument(
+            "--priority-scheduling-preemption-threshold",
+            type=int,
+            default=ServerArgs.priority_scheduling_preemption_threshold,
+            help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
+        )
         parser.add_argument(
             "--schedule-conservativeness",
             type=float,
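Note (illustrative, not part of the package diff): the three priority-scheduling flags added above are parsed by the same ServerArgs.add_cli_args entry point as every other option, and a later hunk in check_server_args restricts --schedule-policy to 'fcfs' or 'lof' when priority scheduling is enabled. A minimal sketch, assuming sglang 0.5.3 is installed; the model path is a placeholder:

    import argparse
    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args([
        "--model-path", "dummy/model",  # placeholder value, not taken from the diff
        "--enable-priority-scheduling",
        "--schedule-policy", "fcfs",  # must be 'fcfs' or 'lof' per check_server_args
        "--priority-scheduling-preemption-threshold", "10",
    ])
    print(args.enable_priority_scheduling, args.priority_scheduling_preemption_threshold)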
@@ -1097,9 +1638,9 @@ class ServerArgs:
             help="The pipeline parallelism size.",
         )
         parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
             type=int,
-            default=ServerArgs.
+            default=ServerArgs.pp_max_micro_batch_size,
             help="The maximum micro batch size in pipeline parallelism.",
         )
         parser.add_argument(
@@ -1209,6 +1750,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-custom-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
+            help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
@@ -1239,8 +1795,8 @@ class ServerArgs:
         bucket_rule = (
             "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
             "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
-            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); '
-            "<value2> ...' uses custom bucket values (e.g., '
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
         )
         parser.add_argument(
             "--prompt-tokens-buckets",
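Note (illustrative, not part of the package diff): the 'tse <middle> <base> <count>' rule described in the help text above places count/2 exponentially growing offsets on each side of the middle value. A standalone sketch that reproduces the example buckets from the help string (the exact generator inside sglang may differ):

    def tse_buckets(middle: float, base: float, count: int) -> list:
        # Two-sided exponential spacing: middle +/- base**i for i = 1..count//2.
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted([middle - o for o in offsets] + [middle] + [middle + o for o in offsets])

    print(tse_buckets(1000.0, 2.0, 8))
    # [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]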
@@ -1280,6 +1836,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )

         # API related
         parser.add_argument(
@@ -1338,6 +1905,16 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
+        parser.add_argument(
+            "--sampling-defaults",
+            type=str,
+            choices=["openai", "model"],
+            default=ServerArgs.sampling_defaults,
+            help="Where to get default sampling parameters. "
+            "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+            "'model' uses the model's generation_config.json to get the recommended "
+            "sampling parameters if available. Default is 'model'.",
+        )
         parser.add_argument(
             "--tool-server",
             type=str,
@@ -1364,6 +1941,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )

         # Multi-node distributed serving
         parser.add_argument(
@@ -1438,9 +2027,17 @@ class ServerArgs:
         parser.add_argument(
             "--lora-backend",
             type=str,
-
+            choices=LORA_BACKEND_CHOICES,
+            default=ServerArgs.lora_backend,
             help="Choose the kernel backend for multi-LoRA serving.",
         )
+        parser.add_argument(
+            "--max-lora-chunk-size",
+            type=int,
+            default=ServerArgs.max_lora_chunk_size,
+            choices=[16, 32, 64, 128],
+            help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
+        )

         # Kernel backend
         parser.add_argument(
@@ -1474,30 +2071,51 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
         parser.add_argument(
             "--mm-attention-backend",
             type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
+            choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
             default=ServerArgs.mm_attention_backend,
             help="Set multimodal attention backend.",
         )
+        parser.add_argument(
+            "--nsa-prefill",
+            default=ServerArgs.nsa_prefill,
+            type=str,
+            choices=NSA_CHOICES,
+        )
+        parser.add_argument(
+            "--nsa-decode",
+            default=ServerArgs.nsa_decode,
+            type=str,
+            choices=NSA_CHOICES,
+        )

         # Speculative decoding
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1534,6 +2152,57 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
+        # Ngram speculative decoding
+        parser.add_argument(
+            "--speculative-ngram-min-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_match_window_size,
+            help="The minimum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_match_window_size,
+            help="The maximum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-min-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_bfs_breadth,
+            help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_bfs_breadth,
+            help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-match-type",
+            type=str,
+            choices=["BFS", "PROB"],
+            default=ServerArgs.speculative_ngram_match_type,
+            help="The match type for cache tree.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-branch-length",
+            type=int,
+            default=ServerArgs.speculative_ngram_branch_length,
+            help="The branch length for ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-capacity",
+            type=int,
+            default=ServerArgs.speculative_ngram_capacity,
+            help="The cache capacity for ngram speculative decoding.",
+        )

         # Expert parallelism
         parser.add_argument(
@@ -1554,21 +2223,14 @@ class ServerArgs:
         parser.add_argument(
             "--moe-runner-backend",
             type=str,
-            choices=
-                "auto",
-                "triton",
-                "triton_kernel",
-                "flashinfer_trtllm",
-                "flashinfer_cutlass",
-                "flashinfer_mxfp4",
-            ],
+            choices=MOE_RUNNER_BACKEND_CHOICES,
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
         )
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1661,6 +2323,27 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+        # Args for multi-item-scoring
+        parser.add_argument(
+            "--multi-item-scoring-delimiter",
+            type=int,
+            default=ServerArgs.multi_item_scoring_delimiter,
+            help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+        )
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1686,6 +2369,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
+        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -1696,16 +2386,19 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs", "nixl"],
+            choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
             default=ServerArgs.hicache_storage_backend,
-            help="The storage backend for hierarchical KV cache."
+            help="The storage backend for hierarchical KV cache. "
+            "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
+            "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
+            "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
         )
         parser.add_argument(
             "--hicache-storage-prefetch-policy",
@@ -1720,6 +2413,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1863,6 +2562,11 @@ class ServerArgs:
             action="store_true",
             help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-torch-symm-mem",
+            action="store_true",
+            help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1888,6 +2592,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--enable-single-batch-overlap",
+            action="store_true",
+            help="Let computation and communication overlap within one micro batch.",
+        )
         parser.add_argument(
             "--tbo-token-distribution-threshold",
             type=float,
@@ -1933,6 +2642,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1951,6 +2666,11 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--enable-weights-cpu-backup",
+            action="store_true",
+            help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
+        )
         parser.add_argument(
             "--allow-auto-truncate",
             action="store_true",
@@ -1981,6 +2701,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
         parser.add_argument(
             "--enable-return-hidden-states",
             action="store_true",
@@ -1992,6 +2717,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -2017,12 +2748,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )

         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default=
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
@@ -2065,6 +2813,11 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--disaggregation-decode-enable-offload-kvcache",
+            action="store_true",
+            help="Enable async KV cache offloading on decode server (PD mode).",
+        )
         parser.add_argument(
             "--num-reserved-decode-tokens",
             type=int,
@@ -2072,10 +2825,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--
-            type=
-            default=
-            help="The
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )

         # Custom weight loader
@@ -2091,6 +2844,24 @@ class ServerArgs:
             action="store_true",
             help="Disable mmap while loading weight using safetensors.",
         )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-ip",
+            type=str,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-service-port",
+            type=int,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )

         # For PD-Multiplexing
         parser.add_argument(
@@ -2106,36 +2877,55 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )

+        # For deterministic inference
+        parser.add_argument(
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
+        )
+
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
         )
         parser.add_argument(
             "--enable-deepep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
         )
         parser.add_argument(
             "--enable-triton-kernel-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-mxfp4-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
+        )
+
+        # Configuration file support
+        parser.add_argument(
+            "--config",
+            type=str,
+            help="Read CLI options from a config file. Must be a YAML file with configuration options.",
         )

     @classmethod
@@ -2144,6 +2934,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})

@@ -2200,7 +2991,8 @@ class ServerArgs:

         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"
@@ -2214,6 +3006,24 @@ class ServerArgs:
                 "--generation-tokens-buckets", self.generation_tokens_buckets
             )

+        # Check scheduling policy
+        if self.enable_priority_scheduling:
+            assert self.schedule_policy in [
+                "fcfs",
+                "lof",
+            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
+
+        # Check multi-item scoring
+        if self.multi_item_scoring_delimiter is not None:
+            assert self.disable_radix_cache, (
+                "Multi-item scoring requires radix cache to be disabled. "
+                "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
+            )
+            assert self.chunked_prefill_size == -1, (
+                "Multi-item scoring requires chunked prefill to be disabled. "
+                "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
+            )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

@@ -2297,6 +3107,12 @@ class ServerArgs:
                 f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
             )

+        if self.max_lora_chunk_size is not None:
+            assert (
+                16 <= self.max_lora_chunk_size <= 128
+                and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
+            ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
@@ -2314,8 +3130,8 @@ class ServerArgs:
         assert rule in [
             "tse",
             "default",
-            "
-        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', '
+            "custom",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"

         if rule == "tse":
             assert (
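Note (illustrative, not part of the package diff): the --max-lora-chunk-size assertion added in check_lora_server_args above uses the standard bit trick that a positive integer x is a power of two exactly when x & (x - 1) == 0. A tiny self-contained check:

    def is_valid_lora_chunk_size(x: int) -> bool:
        # Mirrors the assertion: within [16, 128] and a power of two.
        return 16 <= x <= 128 and (x & (x - 1)) == 0

    assert [v for v in range(1, 200) if is_valid_lora_chunk_size(v)] == [16, 32, 64, 128]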
@@ -2338,95 +3154,20 @@ class ServerArgs:
                 len(buckets_rule) == 1
             ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"

-        elif rule == "
+        elif rule == "custom":
             assert (
                 len(buckets_rule) >= 2
-            ), f"{arg_name}
+            ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
             try:
                 bucket_values = [float(x) for x in buckets_rule[1:]]
             except ValueError:
-                assert False, f"{arg_name}
+                assert False, f"{arg_name} custom rule bucket values must be numeric"
             assert len(set(bucket_values)) == len(
                 bucket_values
-            ), f"{arg_name}
+            ), f"{arg_name} custom rule bucket values should not contain duplicates"
             assert all(
                 val >= 0 for val in bucket_values
-            ), f"{arg_name}
-
-    def model_specific_adjustments(self):
-        hf_config = self.get_hf_config()
-        model_arch = hf_config.architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                if is_cuda() and is_sm100_supported():
-                    self.attention_backend = "trtllm_mha"
-                elif is_cuda() and is_sm90_supported():
-                    self.attention_backend = "fa3"
-                else:
-                    self.attention_backend = "triton"
-            supported_backends = ["triton", "trtllm_mha", "fa3"]
-            logger.info(
-                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
-            )
-            assert (
-                self.attention_backend in supported_backends
-            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
-
-            if is_sm100_supported():
-                if not self.enable_dp_attention:
-                    self.enable_flashinfer_allreduce_fusion = True
-                    logger.info(
-                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
-                    )
-            quantization_config = getattr(hf_config, "quantization_config", None)
-            is_mxfp4_quant_format = (
-                quantization_config is not None
-                and quantization_config.get("quant_method") == "mxfp4"
-            )
-
-            if is_sm100_supported() and is_mxfp4_quant_format:
-                self.moe_runner_backend = "flashinfer_mxfp4"
-                logger.warning(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.moe_runner_backend == "triton_kernel":
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if (
-                    self.moe_runner_backend == "auto"
-                    and self.ep_size == 1
-                    and is_triton_kernels_available()
-                ):
-                    self.moe_runner_backend = "triton_kernel"
-                    logger.warning(
-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-                    )
-            self.disable_hybrid_swa_memory = True
-            if is_mxfp4_quant_format:
-                # use bf16 for mxfp4 triton kernels
-                self.dtype = "bfloat16"
-
-        elif "Llama4" in model_arch:
-            assert self.attention_backend in {
-                "fa3",
-                "aiter",
-                "triton",
-            }, "fa3, aiter, or triton is required for Llama4 model"
-        elif model_arch in [
-            "Gemma2ForCausalLM",
-            "Gemma3ForCausalLM",
-            "Gemma3ForConditionalGeneration",
-            "Gemma3nForCausalLM",
-            "Gemma3nForConditionalGeneration",
-        ]:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning(
-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
-            )
-            self.disable_hybrid_swa_memory = True
+            ), f"{arg_name} custom rule bucket values should be non-negative"

     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2478,6 +3219,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     Returns:
         The server arguments.
     """
+    # Import here to avoid circular imports
+    from sglang.srt.server_args_config_parser import ConfigArgumentMerger
+
+    # Check for config file and merge arguments if present
+    if "--config" in argv:
+        # Extract boolean actions from the parser to handle them correctly
+        parser = argparse.ArgumentParser()
+        ServerArgs.add_cli_args(parser)
+
+        # Get boolean action destinations
+        boolean_actions = []
+        for action in parser._actions:
+            if hasattr(action, "dest") and hasattr(action, "action"):
+                if action.action in ["store_true", "store_false"]:
+                    boolean_actions.append(action.dest)
+
+        # Merge config file arguments with CLI arguments
+        config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
+        argv = config_merger.merge_config_with_args(argv)
+
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     raw_args = parser.parse_args(argv)
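Note (illustrative, not part of the package diff): the new --config path above first collects the destinations of all store_true/store_false options so that boolean flags coming from the YAML file can be expanded correctly before re-parsing. A generic argparse sketch of that collection step, independent of sglang (the two flags below are stand-ins):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--enable-torch-compile", action="store_true")
    parser.add_argument("--tp-size", type=int, default=1)

    boolean_actions = [
        action.dest
        for action in parser._actions
        if isinstance(action, (argparse._StoreTrueAction, argparse._StoreFalseAction))
    ]
    print(boolean_actions)  # ['enable_torch_compile']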
@@ -2612,14 +3373,19 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in [
+        "DeepseekV32ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)