sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
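The most heavily modified file in this release is sglang/srt/server_args.py; its rendered diff follows. The headline change is that the monolithic `__post_init__` has been split into focused `_handle_*` helper methods, alongside new choice constants and configuration fields.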
sglang/srt/server_args.py
CHANGED
@@ -19,14 +19,13 @@ import json
 import logging
 import os
 import random
-import sys
 import tempfile
 from typing import List, Literal, Optional, Union

+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -36,18 +35,22 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
+from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)

-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -60,6 +63,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]

 QUANTIZATION_CHOICES = [
@@ -86,9 +90,12 @@ ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
@@ -102,8 +109,18 @@ ATTENTION_BACKEND_CHOICES = [
     "ascend",
 ]

+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]

+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+

 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +139,18 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)


+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
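As a brief editorial illustration (not part of the diff): the new `add_*_choices` hooks above let external integrations register extra accepted values before argument parsing, mirroring the pre-existing `add_load_format_choices`. The registered names below are made-up placeholders.

```python
from sglang.srt.server_args import (
    add_grammar_backend_choices,
    add_radix_eviction_policy_choices,
)

# Extend the accepted choice lists from plugin code, e.g. for a
# custom grammar backend or radix-cache eviction policy.
add_grammar_backend_choices(["my_grammar_backend"])
add_radix_eviction_policy_choices(["my_eviction_policy"])
```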
@@ -151,20 +180,25 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
+    enable_fp32_lm_head: bool = False

     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] = sys.maxsize
+    max_queued_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
+    radix_eviction_policy: str = "lru"

     # Runtime options
     device: Optional[str] = None
@@ -191,14 +225,20 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+    tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"

     # API related
     api_key: Optional[str] = None
@@ -215,6 +255,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False

     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -235,6 +278,7 @@ class ServerArgs:
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
+    max_lora_chunk_size: Optional[int] = 16

     # Kernel backend
     attention_backend: Optional[str] = None
@@ -243,16 +287,28 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
+    nsa_prefill: str = "flashmla_prefill"
+    nsa_decode: str = "fa3"

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
+    # For ngram only
+    speculative_ngram_min_match_window_size: int = 1
+    speculative_ngram_max_match_window_size: int = 12
+    speculative_ngram_min_bfs_breadth: int = 1
+    speculative_ngram_max_bfs_breadth: int = 10
+    speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+    speculative_ngram_branch_length: int = 18
+    speculative_ngram_capacity: int = 10 * 1000 * 1000

     # Expert parallelism
     ep_size: int = 1
@@ -284,6 +340,10 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -294,6 +354,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -325,11 +387,13 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
@@ -338,17 +402,27 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -357,66 +431,105 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False

     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1

-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None

     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3

-    # Deprecated arguments
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
-
     def __post_init__(self):
-        [31 lines removed; their content is not preserved in this diff view]
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
+
+        # Set missing default values.
+        self._handle_missing_default_values()
+
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_deepep_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()

-        #
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
+
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+    def _handle_deprecated_args(self):
+        pass
+
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.served_model_name is None:
@@ -426,51 +539,140 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-
+    def _handle_gpu_memory_settings(self, gpu_mem):
+        """
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers,
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+        The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
+                    else:
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            else:
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160

-        # Set
-        if self.
-        [4 lines removed; content not preserved in this diff view]
-        # We want mem_fraction_static to be as large as possible but still has enough room
-        # for activations and cuda graph buffers. We use the following heuristic to
-        # compute the needed size for activations and cuda graph buffers:
-        # - The size of the activation depends on the chunked_prefill_size and model size.
-        # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
-        # For GPUs with more memory, we use a larger chunked_prefill_size and
-        # capture more cuda graphs, so they need to reserve more memory.
-        parallel_size = self.tp_size * self.pp_size
-
-        if gpu_mem < 20 * 1024:
-            # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-            reserved_mem = (2.8 + parallel_size / 10) * 1024
-        elif gpu_mem < 35 * 1024:
-            # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-            reserved_mem = (2.8 + parallel_size / 10) * 1024
-        elif gpu_mem < 90 * 1024:
-            # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
-            reserved_mem = (9.5 + parallel_size / 2) * 1024
-        elif gpu_mem < 100 * 1024:
-            # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-            reserved_mem = (12 + parallel_size / 2) * 1024
-        elif gpu_mem < 160 * 1024:
-            # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
-            reserved_mem = (12 + parallel_size / 2) * 1024
-        else:
-            # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
-            reserved_mem = 32 * 1024
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)

-        [2 lines removed; content not preserved in this diff view]
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 512
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
+            else:
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem is not None and gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalone draft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
                     reserved_mem += 2 * 1024
-        if self.enable_dp_attention:
-            reserved_mem += 4 * 1024

-        [3 lines removed; content not preserved in this diff view]
+            self.mem_fraction_static = (
+                round((gpu_mem - reserved_mem) / gpu_mem, 3)
+                if gpu_mem is not None
+                else 0.88
+            )

         # Lazy init to avoid circular import
         # Multimodal models need more memory for the image processor
@@ -480,53 +682,192 @@ class ServerArgs:
         if model_config.is_multimodal:
             self.adjust_mem_fraction_for_vlm(model_config)

-        [11 lines removed; content not preserved in this diff view]
+    def _generate_cuda_graph_batch_sizes(self):
+        """
+        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+        This integrates the logic from cuda_graph_runner.py.
+        """
+        # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+        if self.disable_cuda_graph_padding:
+            capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+        elif self.speculative_algorithm is None:
+            # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
+            capture_bs = (
+                [1, 2, 4, 8, 12]
+                + list(range(16, 257, 8))
+                + list(range(272, 512, 16))
+                + list(range(512, self.cuda_graph_max_bs + 1, 32))
+            )
+        else:
+            # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
+            capture_bs = (
+                list(range(1, 9, 1))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 4))
+                + list(range(72, 257, 8))
+                + list(range(272, self.cuda_graph_max_bs + 1, 16))
+            )

-        [3 lines removed; content not preserved in this diff view]
-        if gpu_mem is not None and gpu_mem < 35 * 1024:
-            if self.tp_size < 4:
-                self.cuda_graph_max_bs = 8
-            else:
-                self.cuda_graph_max_bs = 80
+        capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
+
+        return capture_bs

-
+    def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

-
-        self.model_specific_adjustments()
-
-        # Set kernel backends
+    def _handle_cpu_backends(self):
         if self.device == "cpu":
             if self.attention_backend is None:
                 self.attention_backend = "intel_amx"
             self.sampling_backend = "pytorch"

+    def _handle_model_specific_adjustments(self):
+        from sglang.srt.configs.model_config import is_deepseek_nsa
+
+        if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+            return
+
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            logger.info(
+                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+            )
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        elif "Llama4" in model_arch and self.device != "cpu":
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+        if is_deepseek_nsa(hf_config):
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                self.attention_backend = "nsa"
+                logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+            if not is_npu():
+                self.enable_dp_attention = True
+                self.dp_size = self.tp_size
+                logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+            self.page_size = 64
+            logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+            self.mem_fraction_static = 0.8
+            logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
+
+            # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+            import torch
+
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 10:
+                self.kv_cache_dtype = "fp8_e4m3"
+                logger.warning("Setting KV cache dtype to fp8.")
+
+            if self.kv_cache_dtype == "fp8_e4m3":
+                self.nsa_prefill = "flashmla_decode"
+                self.nsa_decode = "flashmla_decode"
+                logger.warning(
+                    "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+                )
+
+            # Logging env vars for NSA
+            from sglang.srt.layers.attention.nsa.utils import (
+                print_nsa_bool_env_vars,
+            )
+
+            print_nsa_bool_env_vars()
+
+    def _handle_sampling_backend(self):
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
             )

+    def _handle_attention_backend_compatibility(self):
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True

-        if self.attention_backend == "ascend":
+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+        if is_npu() and self.attention_backend in ["ascend"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -588,30 +929,30 @@ class ServerArgs:

         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
-            self.disable_cuda_graph = True
             self.disable_radix_cache = True

-
+    def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1

-
+    def _handle_amd_specifics(self):
         if is_hip():
             self.triton_attention_num_kv_splits = 16

-
+    def _handle_grammar_backend(self):
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

-
+    def _handle_data_parallelism(self):
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -623,7 +964,7 @@ class ServerArgs:
                 self.enable_dp_attention
             ), "Please enable dp attention when setting enable_dp_lm_head. "

-
+    def _handle_moe_kernel_config(self):
         if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
@@ -634,13 +975,15 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )

-
+    def _handle_deepep_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -650,6 +993,7 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )

+    def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.warning(
@@ -664,6 +1008,7 @@ class ServerArgs:
         if self.enable_eplb:
             assert self.ep_size > 1

+    def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
         ):
@@ -675,25 +1020,42 @@ class ServerArgs:
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000

-
+    def _handle_pipeline_parallelism(self):
         if self.pp_size > 1:
             self.disable_overlap_schedule = True
             logger.warning(
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

-
+    def _handle_hicache(self):
         if self.hicache_storage_backend == "mooncake":
-
-
-
+            if self.hicache_mem_layout == "layer_first":
+                if self.hicache_io_backend == "direct":
+                    self.hicache_mem_layout = "page_first_direct"
+                elif self.hicache_io_backend == "kernel":
+                    self.hicache_mem_layout = "page_first"
+                logger.warning(
+                    f"Mooncake storage backend does not support layer_first layout, "
+                    f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
+                )

-
+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )
+
+    def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
-            # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
+                # TODO: support dp attention for standalone speculative decoding
+                raise ValueError(
+                    "Currently standalone speculative decoding does not support dp attention."
+                )
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
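
The Mooncake fallback in _handle_hicache above amounts to a small mapping from the requested memory layout and IO backend to the layout the backend can actually serve. A standalone restatement of that mapping (illustrative only; the real logic mutates ServerArgs in place and logs a warning):

    def mooncake_mem_layout(mem_layout: str, io_backend: str) -> str:
        # Mooncake cannot serve layer_first; remap it according to the IO
        # backend, as _handle_hicache does above.
        if mem_layout == "layer_first":
            if io_backend == "direct":
                return "page_first_direct"
            if io_backend == "kernel":
                return "page_first"
        return mem_layout

    assert mooncake_mem_layout("layer_first", "direct") == "page_first_direct"
    assert mooncake_mem_layout("layer_first", "kernel") == "page_first"
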
@@ -709,8 +1071,13 @@ class ServerArgs:
             )

         model_arch = self.get_hf_config().architectures[0]
-        if model_arch in [
-
+        if model_arch in [
+            "DeepseekV32ForCausalLM",
+            "DeepseekV3ForCausalLM",
+            "Glm4MoeForCausalLM",
+            "BailingMoeForCausalLM",
+            "BailingMoeV2ForCausalLM",
+        ]:
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
             else:
@@ -718,7 +1085,6 @@ class ServerArgs:
                 "DeepSeek MTP does not require setting speculative_draft_model_path."
             )

-        # Auto choose parameters
         if self.speculative_num_steps is None:
             assert (
                 self.speculative_eagle_topk is None
@@ -758,23 +1124,63 @@ class ServerArgs:
                 "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
             )

-
-
-
+        if self.speculative_algorithm == "NGRAM":
+            if not self.device.startswith("cuda"):
+                raise ValueError(
+                    "Ngram speculative decoding only supports CUDA device."
+                )
+            if self.max_running_requests is None:
+                self.max_running_requests = 48
+            self.disable_overlap_schedule = True
+            self.enable_mixed_chunk = False
+            self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
+            if self.speculative_num_draft_tokens is None:
+                self.speculative_num_draft_tokens = (
+                    self.speculative_ngram_max_match_window_size
+                )
+            logger.warning(
+                "The overlap scheduler and mixed chunked prefill are disabled because of "
+                "using ngram speculative decoding."
+            )
+
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
+                    f"with page_size({self.page_size}) > 1 is unstable "
+                    "and produces incorrect results for paged attention backends. "
+                    "This combination is only supported for the 'flashinfer' backend."
+                )
+            if self.enable_dp_attention:
+                # TODO: support dp attention for ngram speculative decoding
+                raise ValueError(
+                    "Currently ngram speculative decoding does not support dp attention."
+                )

-
+    def _handle_load_format(self):
         if (
             self.load_format == "auto" or self.load_format == "gguf"
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
+
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []

-
+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
+    def _handle_disaggregation(self):
         if self.disaggregation_mode == "decode":
             assert (
                 self.disaggregation_decode_tp is None
@@ -785,6 +1191,13 @@ class ServerArgs:

             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -793,25 +1206,84 @@ class ServerArgs:

             self.disaggregation_prefill_pp = self.pp_size
             self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
-
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

-
+    def _handle_tokenizer_batching(self):
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
+    def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
-
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
         )
+        os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
+            "1" if self.enable_deterministic_inference else "0"
+        )

+    def _handle_cache_compatibility(self):
         if self.enable_hierarchical_cache and self.disable_radix_cache:
             raise ValueError(
                 "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
                 "and cannot be used at the same time. Please use only one of them."
             )

+        if (
+            self.disaggregation_decode_enable_offload_kvcache
+            and self.disaggregation_mode != "decode"
+        ):
+            raise ValueError(
+                "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
+            )
+
+    def _handle_metrics_labels(self):
+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_custom_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
+            )
+
+    def _handle_deterministic_inference(self):
+        if self.enable_deterministic_inference:
+            # Check sampling backend
+            self.sampling_backend = "pytorch"
+            logger.warning(
+                "Sampling backend is set to pytorch for deterministic inference."
+            )
+
+            # Check attention backend
+            if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+                raise ValueError(
+                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                )
+
+            # Currently, only FA3 supports radix cache. Support for other backends is in progress
+            if self.attention_backend != "fa3":
+                self.disable_radix_cache = True
+                logger.warning(
+                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
+                )
+
+            # Check TP size
+            if self.tp_size > 1:
+                os.environ["NCCL_ALGO"] = "allreduce:tree"
+                self.disable_custom_all_reduce = True
+                logger.warning(
+                    "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
+                )
+
+    def _handle_other_validations(self):
+        pass
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
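
Taken together, _handle_environment_variables and _handle_deterministic_inference above reduce deterministic mode to a handful of process-level overrides. A condensed, hypothetical view of the resulting environment (variable names are copied from the diff; the helper itself is not sglang API):

    def deterministic_env(enable: bool, tp_size: int) -> dict:
        env = {"SGLANG_ENABLE_DETERMINISTIC_INFERENCE": "1" if enable else "0"}
        if enable and tp_size > 1:
            # Presumably chosen so the all-reduce order is reproducible across runs.
            env["NCCL_ALGO"] = "allreduce:tree"
        return env

    assert deterministic_env(enable=True, tp_size=8)["NCCL_ALGO"] == "allreduce:tree"
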
@@ -828,12 +1300,6 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument(
-            "--tokenizer-worker-num",
-            type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
-        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -843,6 +1309,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -990,6 +1462,11 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
+        )

         # Memory and scheduling
         parser.add_argument(
@@ -1033,9 +1510,27 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
+        parser.add_argument(
+            "--enable-priority-scheduling",
+            action="store_true",
+            default=ServerArgs.enable_priority_scheduling,
+            help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
+        )
+        parser.add_argument(
+            "--schedule-low-priority-values-first",
+            action="store_true",
+            default=ServerArgs.schedule_low_priority_values_first,
+            help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
+        )
+        parser.add_argument(
+            "--priority-scheduling-preemption-threshold",
+            type=int,
+            default=ServerArgs.priority_scheduling_preemption_threshold,
+            help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
+        )
         parser.add_argument(
             "--schedule-conservativeness",
             type=float,
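
--priority-scheduling-preemption-threshold is documented above as the minimum priority gap an incoming request needs before it may preempt running requests, with --schedule-low-priority-values-first flipping which direction "higher priority" points. One hedged reading of that rule as a predicate (our sketch of the documented semantics, not sglang's scheduler code):

    def may_preempt(incoming: int, running: int, threshold: int,
                    low_values_first: bool = False) -> bool:
        # Higher integer wins by default; the flag flips the comparison.
        gap = running - incoming if low_values_first else incoming - running
        return gap >= threshold

    assert may_preempt(incoming=10, running=3, threshold=5)      # gap 7 >= 5
    assert not may_preempt(incoming=10, running=8, threshold=5)  # gap 2 < 5
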
@@ -1207,6 +1702,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-custom-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
+            help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
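
Per the help text above, clients send a JSON dict in the header named by --tokenizer-metrics-custom-labels-header, and only keys on the --tokenizer-metrics-allowed-custom-labels allow-list are kept. A hypothetical sketch of that filtering contract (the header name below is an example, not a default):

    import json

    def extract_custom_labels(headers: dict, header_name: str, allowed: set) -> dict:
        # Keep only the allow-listed label keys; illustrative only.
        raw = headers.get(header_name)
        if not raw:
            return {}
        return {k: v for k, v in json.loads(raw).items() if k in allowed}

    headers = {"x-custom-labels": json.dumps({"label1": "value1", "team": "x"})}
    print(extract_custom_labels(headers, "x-custom-labels", {"label1", "label2"}))
    # {'label1': 'value1'}
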
@@ -1234,6 +1744,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
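
The 'tse' rule in the bucket help string above comes with a worked example: 'tse 1000 2 8' yields [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]. That output is consistent with placing count/2 exponentially spaced edges on each side of the middle, i.e. middle ± base^i for i = 1..count/2. A reconstruction that reproduces the documented example (the actual generator lives elsewhere in sglang's metrics code; this derivation is ours):

    def tse_buckets(middle: float, base: float, count: int) -> list:
        # Two-sides-exponential buckets: count // 2 edges on each side of
        # `middle`, spaced by powers of `base`.
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted([middle - o for o in offsets] + [middle]
                      + [middle + o for o in offsets])

    print(tse_buckets(1000.0, 2.0, 8))
    # [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]
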
@@ -1258,6 +1788,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )

         # API related
         parser.add_argument(
@@ -1342,6 +1883,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )

         # Multi-node distributed serving
         parser.add_argument(
@@ -1416,9 +1969,17 @@ class ServerArgs:
         parser.add_argument(
             "--lora-backend",
             type=str,
-
+            choices=LORA_BACKEND_CHOICES,
+            default=ServerArgs.lora_backend,
             help="Choose the kernel backend for multi-LoRA serving.",
         )
+        parser.add_argument(
+            "--max-lora-chunk-size",
+            type=int,
+            default=ServerArgs.max_lora_chunk_size,
+            choices=[16, 32, 64, 128],
+            help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
+        )

         # Kernel backend
         parser.add_argument(
@@ -1452,30 +2013,51 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
         parser.add_argument(
             "--mm-attention-backend",
             type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
+            choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
             default=ServerArgs.mm_attention_backend,
             help="Set multimodal attention backend.",
         )
+        parser.add_argument(
+            "--nsa-prefill",
+            default=ServerArgs.nsa_prefill,
+            type=str,
+            choices=NSA_CHOICES,
+        )
+        parser.add_argument(
+            "--nsa-decode",
+            default=ServerArgs.nsa_decode,
+            type=str,
+            choices=NSA_CHOICES,
+        )

         # Speculative decoding
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1512,6 +2094,57 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
+        # Ngram speculative decoding
+        parser.add_argument(
+            "--speculative-ngram-min-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_match_window_size,
+            help="The minimum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_match_window_size,
+            help="The maximum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-min-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_bfs_breadth,
+            help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_bfs_breadth,
+            help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-match-type",
+            type=str,
+            choices=["BFS", "PROB"],
+            default=ServerArgs.speculative_ngram_match_type,
+            help="The match type for cache tree.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-branch-length",
+            type=int,
+            default=ServerArgs.speculative_ngram_branch_length,
+            help="The branch length for ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-capacity",
+            type=int,
+            default=ServerArgs.speculative_ngram_capacity,
+            help="The cache capacity for ngram speculative decoding.",
+        )

         # Expert parallelism
         parser.add_argument(
@@ -1539,6 +2172,7 @@ class ServerArgs:
                 "flashinfer_trtllm",
                 "flashinfer_cutlass",
                 "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
             ],
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
@@ -1546,7 +2180,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1639,6 +2273,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1664,6 +2313,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
+        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -1674,16 +2330,19 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs", "nixl"],
+            choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
             default=ServerArgs.hicache_storage_backend,
-            help="The storage backend for hierarchical KV cache."
+            help="The storage backend for hierarchical KV cache. "
+            "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
+            "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
+            "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
         )
         parser.add_argument(
             "--hicache-storage-prefetch-policy",
@@ -1698,6 +2357,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1841,6 +2506,11 @@ class ServerArgs:
             action="store_true",
             help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-torch-symm-mem",
+            action="store_true",
+            help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1866,6 +2536,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--enable-single-batch-overlap",
+            action="store_true",
+            help="Let computation and communication overlap within one micro batch.",
+        )
         parser.add_argument(
             "--tbo-token-distribution-threshold",
             type=float,
@@ -1911,6 +2586,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1929,6 +2610,11 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--enable-weights-cpu-backup",
+            action="store_true",
+            help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
+        )
         parser.add_argument(
             "--allow-auto-truncate",
             action="store_true",
@@ -1959,6 +2645,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
         parser.add_argument(
             "--enable-return-hidden-states",
             action="store_true",
@@ -1970,6 +2661,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1995,12 +2692,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )

         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default=
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
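
--dynamic-batch-tokenizer-batch-size and --dynamic-batch-tokenizer-batch-timeout above tune a classic collect-until-full-or-timeout loop. A generic asyncio sketch of that policy (sglang's actual tokenizer manager is more involved; this shows only the batching rule the two flags control):

    import asyncio

    async def collect_batch(queue: asyncio.Queue, batch_size: int, timeout: float) -> list:
        # Block for the first request, then keep filling the batch until it
        # is full or `timeout` seconds have passed since the first request.
        batch = [await queue.get()]
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        while len(batch) < batch_size:
            remaining = deadline - loop.time()
            if remaining <= 0:
                break
            try:
                batch.append(await asyncio.wait_for(queue.get(), remaining))
            except asyncio.TimeoutError:
                break
        return batch
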
@@ -2043,6 +2757,11 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--disaggregation-decode-enable-offload-kvcache",
+            action="store_true",
+            help="Enable async KV cache offloading on decode server (PD mode).",
+        )
         parser.add_argument(
             "--num-reserved-decode-tokens",
             type=int,
@@ -2050,10 +2769,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--
-            type=
-            default=
-            help="The
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )

         # Custom weight loader
@@ -2069,6 +2788,24 @@ class ServerArgs:
             action="store_true",
             help="Disable mmap while loading weight using safetensors.",
         )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-ip",
+            type=str,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-service-port",
+            type=int,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )

         # For PD-Multiplexing
         parser.add_argument(
@@ -2084,36 +2821,55 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )

+        # For deterministic inference
+        parser.add_argument(
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
+        )
+
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
         )
         parser.add_argument(
             "--enable-deepep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
         )
         parser.add_argument(
             "--enable-triton-kernel-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-mxfp4-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
+        )
+
+        # Configuration file support
+        parser.add_argument(
+            "--config",
+            type=str,
+            help="Read CLI options from a config file. Must be a YAML file with configuration options.",
         )

     @classmethod
@@ -2122,6 +2878,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
         attrs = [attr.name for attr in dataclasses.fields(cls)]
         return cls(**{attr: getattr(args, attr) for attr in attrs})

@@ -2178,13 +2935,27 @@ class ServerArgs:

         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"

         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
+        # Check scheduling policy
+        if self.enable_priority_scheduling:
+            assert self.schedule_policy in [
+                "fcfs",
+                "lof",
+            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."

     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -2269,6 +3040,12 @@ class ServerArgs:
             f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
         )

+        if self.max_lora_chunk_size is not None:
+            assert (
+                16 <= self.max_lora_chunk_size <= 128
+                and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
+            ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
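
The new --max-lora-chunk-size validation above uses the standard bit trick: n & (n - 1) clears the lowest set bit of n, so the expression is zero exactly when n is a power of two. A quick standalone check of the same predicate:

    def is_valid_lora_chunk_size(n: int) -> bool:
        # Same predicate as the assertion above.
        return 16 <= n <= 128 and (n & (n - 1)) == 0

    assert [n for n in range(1, 200) if is_valid_lora_chunk_size(n)] == [16, 32, 64, 128]
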
@@ -2277,79 +3054,53 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

-    def
-
-
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                if is_cuda() and is_sm100_supported():
-                    self.attention_backend = "trtllm_mha"
-                elif is_cuda() and is_sm90_supported():
-                    self.attention_backend = "fa3"
-                else:
-                    self.attention_backend = "triton"
-            supported_backends = ["triton", "trtllm_mha", "fa3"]
-            logger.info(
-                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
-            )
-            assert (
-                self.attention_backend in supported_backends
-            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return

-
-
-
-
-
-
-
-        is_mxfp4_quant_format = (
-            quantization_config is not None
-            and quantization_config.get("quant_method") == "mxfp4"
-        )
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "custom",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self.moe_runner_backend = "triton_kernel"
-        logger.warning(
-            "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-        )
-        self.disable_hybrid_swa_memory = True
-        if is_mxfp4_quant_format:
-            # use bf16 for mxfp4 triton kernels
-            self.dtype = "bfloat16"
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"

-        elif "
-            assert
-
-
-
-        elif
-
-
-            "
-
-
-
-
-
-
-
-
-
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "custom":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} custom rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} custom rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} custom rule bucket values should be non-negative"

     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2401,6 +3152,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     Returns:
         The server arguments.
     """
+    # Import here to avoid circular imports
+    from sglang.srt.server_args_config_parser import ConfigArgumentMerger
+
+    # Check for config file and merge arguments if present
+    if "--config" in argv:
+        # Extract boolean actions from the parser to handle them correctly
+        parser = argparse.ArgumentParser()
+        ServerArgs.add_cli_args(parser)
+
+        # Get boolean action destinations
+        boolean_actions = []
+        for action in parser._actions:
+            if hasattr(action, "dest") and hasattr(action, "action"):
+                if action.action in ["store_true", "store_false"]:
+                    boolean_actions.append(action.dest)
+
+        # Merge config file arguments with CLI arguments
+        config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
+        argv = config_merger.merge_config_with_args(argv)
+
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     raw_args = parser.parse_args(argv)
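
prepare_server_args above collects every store_true/store_false destination before merging, because boolean flags take no value on the command line and so cannot be flattened as "--key value" pairs. A minimal standalone sketch of the flattening a config merger must perform (ConfigArgumentMerger is sglang's internal class; this helper is hypothetical):

    def config_to_argv(cfg: dict, boolean_actions: set) -> list:
        # Booleans become bare flags; everything else becomes "--key value".
        argv = []
        for key, value in cfg.items():
            flag = "--" + key.replace("_", "-")
            if key in boolean_actions:
                if value:
                    argv.append(flag)
            else:
                argv.extend([flag, str(value)])
        return argv

    print(config_to_argv({"model_path": "my/model", "enable_trace": True},
                         {"enable_trace"}))
    # ['--model-path', 'my/model', '--enable-trace']
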
@@ -2535,14 +3306,19 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in [
+        "DeepseekV32ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)