sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only.
 - sglang/bench_one_batch.py +54 -37
 - sglang/bench_one_batch_server.py +340 -34
 - sglang/bench_serving.py +340 -159
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/backend/runtime_endpoint.py +1 -1
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +9 -2
 - sglang/profiler.py +20 -3
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/__init__.py +27 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +8 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_ocr.py +64 -0
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +309 -0
 - sglang/srt/configs/load_config.py +33 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +284 -118
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +576 -0
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +6 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/outlines_jump_forward.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +26 -15
 - sglang/srt/debug_utils/dumper.py +10 -3
 - sglang/srt/disaggregation/ascend/conn.py +2 -2
 - sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +268 -98
 - sglang/srt/disaggregation/decode.py +172 -39
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
 - sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +203 -555
 - sglang/srt/disaggregation/nixl/conn.py +217 -63
 - sglang/srt/disaggregation/prefill.py +113 -270
 - sglang/srt/disaggregation/utils.py +36 -5
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +203 -97
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +85 -65
 - sglang/srt/entrypoints/grpc_server.py +632 -305
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +169 -17
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +327 -34
 - sglang/srt/entrypoints/openai/serving_base.py +74 -8
 - sglang/srt/entrypoints/openai/serving_chat.py +202 -118
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +20 -4
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/entrypoints/openai/serving_responses.py +47 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +323 -0
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +3 -4
 - sglang/srt/eplb/expert_location.py +30 -5
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +21 -16
 - sglang/srt/function_call/glm4_moe_detector.py +4 -8
 - sglang/srt/function_call/gpt_oss_detector.py +24 -1
 - sglang/srt/function_call/json_array_parser.py +61 -0
 - sglang/srt/function_call/kimik2_detector.py +17 -4
 - sglang/srt/function_call/utils.py +98 -7
 - sglang/srt/grpc/compile_proto.py +245 -0
 - sglang/srt/grpc/grpc_request_manager.py +915 -0
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
 - sglang/srt/layers/activation.py +11 -7
 - sglang/srt/layers/attention/aiter_backend.py +17 -18
 - sglang/srt/layers/attention/ascend_backend.py +125 -10
 - sglang/srt/layers/attention/attention_registry.py +226 -0
 - sglang/srt/layers/attention/base_attn_backend.py +32 -4
 - sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
 - sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
 - sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +52 -15
 - sglang/srt/layers/attention/flashinfer_backend.py +357 -212
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
 - sglang/srt/layers/attention/flashmla_backend.py +9 -7
 - sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
 - sglang/srt/layers/attention/mamba/mamba.py +514 -1
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
 - sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
 - sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
 - sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
 - sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
 - sglang/srt/layers/attention/nsa/transform_index.py +144 -0
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +23 -0
 - sglang/srt/layers/attention/nsa_backend.py +1201 -0
 - sglang/srt/layers/attention/tbo_backend.py +6 -6
 - sglang/srt/layers/attention/torch_flex_backend.py +325 -0
 - sglang/srt/layers/attention/triton_backend.py +249 -42
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
 - sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
 - sglang/srt/layers/attention/utils.py +11 -7
 - sglang/srt/layers/attention/vision.py +61 -3
 - sglang/srt/layers/attention/wave_backend.py +4 -4
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +19 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
 - sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +28 -1
 - sglang/srt/layers/elementwise.py +3 -1
 - sglang/srt/layers/layernorm.py +47 -15
 - sglang/srt/layers/linear.py +30 -5
 - sglang/srt/layers/logits_processor.py +161 -18
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
 - sglang/srt/layers/moe/ep_moe/layer.py +243 -448
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
 - sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +3 -2
 - sglang/srt/layers/moe/utils.py +27 -1
 - sglang/srt/layers/parameter.py +23 -6
 - sglang/srt/layers/quantization/__init__.py +2 -53
 - sglang/srt/layers/quantization/awq.py +183 -6
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +20 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +86 -20
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +43 -15
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +141 -81
 - sglang/srt/layers/quantization/mxfp4.py +17 -34
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +1 -4
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +51 -24
 - sglang/srt/layers/quantization/w8a8_int8.py +45 -27
 - sglang/srt/layers/radix_attention.py +59 -9
 - sglang/srt/layers/rotary_embedding.py +750 -46
 - sglang/srt/layers/sampler.py +84 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +23 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/base_backend.py +3 -3
 - sglang/srt/lora/backend/chunked_backend.py +348 -0
 - sglang/srt/lora/backend/triton_backend.py +9 -4
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora.py +7 -5
 - sglang/srt/lora/lora_manager.py +33 -7
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +41 -17
 - sglang/srt/lora/triton_ops/__init__.py +4 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
 - sglang/srt/lora/utils.py +7 -5
 - sglang/srt/managers/cache_controller.py +83 -152
 - sglang/srt/managers/data_parallel_controller.py +156 -87
 - sglang/srt/managers/detokenizer_manager.py +51 -24
 - sglang/srt/managers/io_struct.py +223 -129
 - sglang/srt/managers/mm_utils.py +49 -10
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
 - sglang/srt/managers/multimodal_processor.py +1 -2
 - sglang/srt/managers/overlap_utils.py +130 -0
 - sglang/srt/managers/schedule_batch.py +340 -529
 - sglang/srt/managers/schedule_policy.py +158 -18
 - sglang/srt/managers/scheduler.py +665 -620
 - sglang/srt/managers/scheduler_input_blocker.py +1 -1
 - sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
 - sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
 - sglang/srt/managers/tokenizer_manager.py +462 -226
 - sglang/srt/managers/tp_worker.py +217 -156
 - sglang/srt/managers/utils.py +79 -47
 - sglang/srt/mem_cache/allocator.py +21 -22
 - sglang/srt/mem_cache/allocator_ascend.py +42 -28
 - sglang/srt/mem_cache/base_prefix_cache.py +3 -3
 - sglang/srt/mem_cache/chunk_cache.py +20 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +38 -0
 - sglang/srt/mem_cache/hicache_storage.py +44 -2
 - sglang/srt/mem_cache/hiradix_cache.py +134 -34
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +602 -208
 - sglang/srt/mem_cache/memory_pool_host.py +134 -183
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +263 -78
 - sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
 - sglang/srt/mem_cache/storage/__init__.py +10 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
 - sglang/srt/mem_cache/storage/backend_factory.py +223 -0
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
 - sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +115 -58
 - sglang/srt/metrics/collector.py +113 -120
 - sglang/srt/metrics/func_timer.py +3 -8
 - sglang/srt/metrics/utils.py +8 -1
 - sglang/srt/model_executor/cpu_graph_runner.py +2 -2
 - sglang/srt/model_executor/cuda_graph_runner.py +81 -36
 - sglang/srt/model_executor/forward_batch_info.py +40 -50
 - sglang/srt/model_executor/model_runner.py +507 -319
 - sglang/srt/model_executor/npu_graph_runner.py +11 -5
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +438 -37
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +200 -27
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +40 -56
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +25 -4
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +793 -235
 - sglang/srt/models/dots_ocr.py +171 -0
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +570 -0
 - sglang/srt/models/gemma3_causal.py +0 -2
 - sglang/srt/models/gemma3_mm.py +17 -1
 - sglang/srt/models/gemma3n_mm.py +2 -3
 - sglang/srt/models/glm4_moe.py +17 -40
 - sglang/srt/models/glm4_moe_nextn.py +4 -4
 - sglang/srt/models/glm4v.py +3 -2
 - sglang/srt/models/glm4v_moe.py +6 -6
 - sglang/srt/models/gpt_oss.py +12 -35
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +4 -2
 - sglang/srt/models/llama.py +6 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +6 -23
 - sglang/srt/models/longcat_flash_nextn.py +4 -15
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +27 -6
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +0 -7
 - sglang/srt/models/qwen2_5_vl.py +5 -5
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +70 -4
 - sglang/srt/models/qwen2_vl.py +6 -3
 - sglang/srt/models/qwen3.py +18 -3
 - sglang/srt/models/qwen3_moe.py +50 -38
 - sglang/srt/models/qwen3_next.py +43 -21
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +791 -0
 - sglang/srt/models/qwen3_vl_moe.py +343 -0
 - sglang/srt/models/registry.py +15 -3
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +268 -0
 - sglang/srt/models/solar.py +505 -0
 - sglang/srt/models/starcoder2.py +357 -0
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/torch_native_llama.py +9 -2
 - sglang/srt/models/utils.py +61 -0
 - sglang/srt/multimodal/processors/base_processor.py +21 -9
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +2 -4
 - sglang/srt/multimodal/processors/glm4v.py +1 -5
 - sglang/srt/multimodal/processors/internvl.py +20 -10
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +83 -17
 - sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/jinja_template_utils.py +6 -0
 - sglang/srt/parser/reasoning_parser.py +0 -1
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +36 -23
 - sglang/srt/sampling/sampling_params.py +75 -0
 - sglang/srt/server_args.py +1300 -338
 - sglang/srt/server_args_config_parser.py +146 -0
 - sglang/srt/single_batch_overlap.py +161 -0
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
 - sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
 - sglang/srt/speculative/cpp_ngram/param.h +125 -0
 - sglang/srt/speculative/cpp_ngram/queue.h +71 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
 - sglang/srt/speculative/eagle_info.py +786 -0
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +113 -1270
 - sglang/srt/speculative/eagle_worker.py +120 -285
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/ngram_info.py +433 -0
 - sglang/srt/speculative/ngram_worker.py +246 -0
 - sglang/srt/speculative/spec_info.py +49 -0
 - sglang/srt/speculative/spec_utils.py +641 -0
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/tracing/trace.py +32 -6
 - sglang/srt/two_batch_overlap.py +35 -18
 - sglang/srt/utils/__init__.py +2 -0
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/{utils.py → utils/common.py} +583 -113
 - sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/srt/utils/rpd_utils.py +452 -0
 - sglang/srt/utils/slow_rank_detector.py +71 -0
 - sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
 - sglang/srt/warmup.py +8 -4
 - sglang/srt/weight_sync/utils.py +1 -1
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/get_logits_ut.py +57 -0
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +120 -11
 - sglang/test/runners.py +3 -1
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +8 -2
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/simple_eval_mmmu_vlm.py +441 -0
 - sglang/test/test_block_fp8.py +3 -4
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +430 -0
 - sglang/test/test_deterministic_utils.py +73 -0
 - sglang/test/test_disaggregation_utils.py +93 -1
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_programs.py +1 -1
 - sglang/test/test_utils.py +432 -16
 - sglang/utils.py +10 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
 - sglang/srt/entrypoints/grpc_request_manager.py +0 -580
 - sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
 - sglang/srt/mem_cache/lora_radix_cache.py +0 -421
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
 
    
sglang/srt/server_args.py CHANGED

```diff
@@ -13,20 +13,21 @@
 # ==============================================================================
 """The arguments of the server."""
 
+from __future__ import annotations
+
 import argparse
 import dataclasses
 import json
 import logging
 import os
 import random
-import socket
-import sys
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
+
+import orjson
 
 from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -35,6 +36,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    get_device_sm,
     is_cuda,
     is_flashinfer_available,
     is_hip,
@@ -43,12 +45,14 @@ from sglang.srt.utils import (
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     json_list_type,
     nullable_str,
     parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
@@ -79,6 +83,7 @@ QUANTIZATION_CHOICES = [
     "bitsandbytes",
     "gguf",
     "modelopt",
+    "modelopt_fp8",
     "modelopt_fp4",
     "petit_nvfp4",
     "w8a8_int8",
@@ -87,33 +92,59 @@ QUANTIZATION_CHOICES = [
     "qoq",
     "w4afp8",
     "mxfp4",
+    "compressed-tensors",  # for Ktransformers
 ]
 
 ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
-    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
     # Other platforms
     "intel_amx",
     "ascend",
+    "intel_xpu",
 ]
 
+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
 GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
 
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+DEFAULT_LORA_EVICTION_POLICY = "lru"
+
+NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+
+MOE_RUNNER_BACKEND_CHOICES = [
+    "auto",
+    "deep_gemm",
+    "triton",
+    "triton_kernel",
+    "flashinfer_trtllm",
+    "flashinfer_cutlass",
+    "flashinfer_mxfp4",
+    "flashinfer_cutedsl",
+    "cutlass",
+]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
```
```diff
@@ -136,6 +167,18 @@ def add_grammar_backend_choices(choices):
     GRAMMAR_BACKEND_CHOICES.extend(choices)
 
 
+def add_moe_runner_backend_choices(choices):
+    MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
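```

The three new hooks follow the same pattern as the existing `add_load_format_choices` and `add_grammar_backend_choices`: they let external code append values to the module-level choice lists before the CLI parser is built. A minimal sketch of that usage, assuming a plugin that runs before server launch (the custom values below are made up):

```python
# Sketch only: register extra choices from external code before the
# ServerArgs argparse parser is constructed. The add_* hooks exist in
# this release; "my_moe_backend" and "fifo" are placeholder values.
from sglang.srt import server_args

server_args.add_moe_runner_backend_choices(["my_moe_backend"])
server_args.add_radix_eviction_policy_choices(["fifo"])

assert "my_moe_backend" in server_args.MOE_RUNNER_BACKEND_CHOICES
```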
         @@ -147,6 +190,11 @@ class ServerArgs: 
     | 
|
| 
       147 
190 
     | 
    
         
             
                load_format: str = "auto"
         
     | 
| 
       148 
191 
     | 
    
         
             
                model_loader_extra_config: str = "{}"
         
     | 
| 
       149 
192 
     | 
    
         
             
                trust_remote_code: bool = False
         
     | 
| 
      
 193 
     | 
    
         
            +
                modelopt_quant: Optional[Union[str, Dict]] = None
         
     | 
| 
      
 194 
     | 
    
         
            +
                modelopt_checkpoint_restore_path: Optional[str] = None
         
     | 
| 
      
 195 
     | 
    
         
            +
                modelopt_checkpoint_save_path: Optional[str] = None
         
     | 
| 
      
 196 
     | 
    
         
            +
                modelopt_export_path: Optional[str] = None
         
     | 
| 
      
 197 
     | 
    
         
            +
                quantize_and_serve: bool = False
         
     | 
| 
       150 
198 
     | 
    
         
             
                context_length: Optional[int] = None
         
     | 
| 
       151 
199 
     | 
    
         
             
                is_embedding: bool = False
         
     | 
| 
       152 
200 
     | 
    
         
             
                enable_multimodal: Optional[bool] = None
         
     | 
| 
         @@ -156,39 +204,50 @@ class ServerArgs: 
     | 
|
| 
       156 
204 
     | 
    
         
             
                # HTTP server
         
     | 
| 
       157 
205 
     | 
    
         
             
                host: str = "127.0.0.1"
         
     | 
| 
       158 
206 
     | 
    
         
             
                port: int = 30000
         
     | 
| 
      
 207 
     | 
    
         
            +
                grpc_mode: bool = False
         
     | 
| 
       159 
208 
     | 
    
         
             
                skip_server_warmup: bool = False
         
     | 
| 
       160 
209 
     | 
    
         
             
                warmups: Optional[str] = None
         
     | 
| 
       161 
210 
     | 
    
         
             
                nccl_port: Optional[int] = None
         
     | 
| 
      
 211 
     | 
    
         
            +
                checkpoint_engine_wait_weights_before_ready: bool = False
         
     | 
| 
       162 
212 
     | 
    
         | 
| 
       163 
213 
     | 
    
         
             
                # Quantization and data type
         
     | 
| 
       164 
214 
     | 
    
         
             
                dtype: str = "auto"
         
     | 
| 
       165 
215 
     | 
    
         
             
                quantization: Optional[str] = None
         
     | 
| 
       166 
216 
     | 
    
         
             
                quantization_param_path: Optional[str] = None
         
     | 
| 
       167 
217 
     | 
    
         
             
                kv_cache_dtype: str = "auto"
         
     | 
| 
      
 218 
     | 
    
         
            +
                enable_fp32_lm_head: bool = False
         
     | 
| 
       168 
219 
     | 
    
         | 
| 
       169 
220 
     | 
    
         
             
                # Memory and scheduling
         
     | 
| 
       170 
221 
     | 
    
         
             
                mem_fraction_static: Optional[float] = None
         
     | 
| 
       171 
222 
     | 
    
         
             
                max_running_requests: Optional[int] = None
         
     | 
| 
       172 
     | 
    
         
            -
                max_queued_requests: Optional[int] =  
     | 
| 
      
 223 
     | 
    
         
            +
                max_queued_requests: Optional[int] = None
         
     | 
| 
       173 
224 
     | 
    
         
             
                max_total_tokens: Optional[int] = None
         
     | 
| 
       174 
225 
     | 
    
         
             
                chunked_prefill_size: Optional[int] = None
         
     | 
| 
       175 
226 
     | 
    
         
             
                max_prefill_tokens: int = 16384
         
     | 
| 
       176 
227 
     | 
    
         
             
                schedule_policy: str = "fcfs"
         
     | 
| 
      
 228 
     | 
    
         
            +
                enable_priority_scheduling: bool = False
         
     | 
| 
      
 229 
     | 
    
         
            +
                abort_on_priority_when_disabled: bool = False
         
     | 
| 
      
 230 
     | 
    
         
            +
                schedule_low_priority_values_first: bool = False
         
     | 
| 
      
 231 
     | 
    
         
            +
                priority_scheduling_preemption_threshold: int = 10
         
     | 
| 
       177 
232 
     | 
    
         
             
                schedule_conservativeness: float = 1.0
         
     | 
| 
       178 
233 
     | 
    
         
             
                page_size: Optional[int] = None
         
     | 
| 
       179 
234 
     | 
    
         
             
                hybrid_kvcache_ratio: Optional[float] = None
         
     | 
| 
       180 
235 
     | 
    
         
             
                swa_full_tokens_ratio: float = 0.8
         
     | 
| 
       181 
236 
     | 
    
         
             
                disable_hybrid_swa_memory: bool = False
         
     | 
| 
      
 237 
     | 
    
         
            +
                radix_eviction_policy: str = "lru"
         
     | 
| 
       182 
238 
     | 
    
         | 
| 
       183 
239 
     | 
    
         
             
                # Runtime options
         
     | 
| 
       184 
240 
     | 
    
         
             
                device: Optional[str] = None
         
     | 
| 
      
 241 
     | 
    
         
            +
                elastic_ep_backend: Literal[None, "mooncake"] = None
         
     | 
| 
      
 242 
     | 
    
         
            +
                mooncake_ib_device: Optional[str] = None
         
     | 
| 
       185 
243 
     | 
    
         
             
                tp_size: int = 1
         
     | 
| 
       186 
244 
     | 
    
         
             
                pp_size: int = 1
         
     | 
| 
       187 
     | 
    
         
            -
                 
     | 
| 
      
 245 
     | 
    
         
            +
                pp_max_micro_batch_size: Optional[int] = None
         
     | 
| 
       188 
246 
     | 
    
         
             
                stream_interval: int = 1
         
     | 
| 
       189 
247 
     | 
    
         
             
                stream_output: bool = False
         
     | 
| 
       190 
248 
     | 
    
         
             
                random_seed: Optional[int] = None
         
     | 
| 
       191 
249 
     | 
    
         
             
                constrained_json_whitespace_pattern: Optional[str] = None
         
     | 
| 
      
 250 
     | 
    
         
            +
                constrained_json_disable_any_whitespace: bool = False
         
     | 
| 
       192 
251 
     | 
    
         
             
                watchdog_timeout: float = 300
         
     | 
| 
       193 
252 
     | 
    
         
             
                dist_timeout: Optional[int] = None  # timeout for torch.distributed
         
     | 
| 
       194 
253 
     | 
    
         
             
                download_dir: Optional[str] = None
         
     | 
| 
         @@ -205,8 +264,8 @@ class ServerArgs: 
     | 
|
| 
       205 
264 
     | 
    
         
             
                show_time_cost: bool = False
         
     | 
| 
       206 
265 
     | 
    
         
             
                enable_metrics: bool = False
         
     | 
| 
       207 
266 
     | 
    
         
             
                enable_metrics_for_all_schedulers: bool = False
         
     | 
| 
       208 
     | 
    
         
            -
                tokenizer_metrics_custom_labels_header: str = "x- 
     | 
| 
       209 
     | 
    
         
            -
                 
     | 
| 
      
 267 
     | 
    
         
            +
                tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
         
     | 
| 
      
 268 
     | 
    
         
            +
                tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
         
     | 
| 
       210 
269 
     | 
    
         
             
                bucket_time_to_first_token: Optional[List[float]] = None
         
     | 
| 
       211 
270 
     | 
    
         
             
                bucket_inter_token_latency: Optional[List[float]] = None
         
     | 
| 
       212 
271 
     | 
    
         
             
                bucket_e2e_request_latency: Optional[List[float]] = None
         
     | 
| 
         @@ -231,6 +290,7 @@ class ServerArgs: 
     | 
|
| 
       231 
290 
     | 
    
         
             
                reasoning_parser: Optional[str] = None
         
     | 
| 
       232 
291 
     | 
    
         
             
                tool_call_parser: Optional[str] = None
         
     | 
| 
       233 
292 
     | 
    
         
             
                tool_server: Optional[str] = None
         
     | 
| 
      
 293 
     | 
    
         
            +
                sampling_defaults: str = "model"
         
     | 
| 
       234 
294 
     | 
    
         | 
| 
       235 
295 
     | 
    
         
             
                # Data parallelism
         
     | 
| 
       236 
296 
     | 
    
         
             
                dp_size: int = 1
         
     | 
| 
         @@ -257,7 +317,9 @@ class ServerArgs: 
     | 
|
| 
       257 
317 
     | 
    
         
             
                ] = None
         
     | 
| 
       258 
318 
     | 
    
         
             
                max_loaded_loras: Optional[int] = None
         
     | 
| 
       259 
319 
     | 
    
         
             
                max_loras_per_batch: int = 8
         
     | 
| 
      
 320 
     | 
    
         
            +
                lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
         
     | 
| 
       260 
321 
     | 
    
         
             
                lora_backend: str = "triton"
         
     | 
| 
      
 322 
     | 
    
         
            +
                max_lora_chunk_size: Optional[int] = 16
         
     | 
| 
       261 
323 
     | 
    
         | 
| 
       262 
324 
     | 
    
         
             
                # Kernel backend
         
     | 
| 
       263 
325 
     | 
    
         
             
                attention_backend: Optional[str] = None
         
     | 
| 
         @@ -266,11 +328,15 @@ class ServerArgs: 
     | 
|
| 
       266 
328 
     | 
    
         
             
                sampling_backend: Optional[str] = None
         
     | 
| 
       267 
329 
     | 
    
         
             
                grammar_backend: Optional[str] = None
         
     | 
| 
       268 
330 
     | 
    
         
             
                mm_attention_backend: Optional[str] = None
         
     | 
| 
      
 331 
     | 
    
         
            +
                nsa_prefill_backend: str = "flashmla_sparse"
         
     | 
| 
      
 332 
     | 
    
         
            +
                nsa_decode_backend: str = "fa3"
         
     | 
| 
       269 
333 
     | 
    
         | 
| 
       270 
334 
     | 
    
         
             
                # Speculative decoding
         
     | 
| 
      
 335 
     | 
    
         
            +
                enable_beta_spec: bool = False
         
     | 
| 
       271 
336 
     | 
    
         
             
                speculative_algorithm: Optional[str] = None
         
     | 
| 
       272 
337 
     | 
    
         
             
                speculative_draft_model_path: Optional[str] = None
         
     | 
| 
       273 
338 
     | 
    
         
             
                speculative_draft_model_revision: Optional[str] = None
         
     | 
| 
      
 339 
     | 
    
         
            +
                speculative_draft_load_format: Optional[str] = None
         
     | 
| 
       274 
340 
     | 
    
         
             
                speculative_num_steps: Optional[int] = None
         
     | 
| 
       275 
341 
     | 
    
         
             
                speculative_eagle_topk: Optional[int] = None
         
     | 
| 
       276 
342 
     | 
    
         
             
                speculative_num_draft_tokens: Optional[int] = None
         
     | 
| 
         @@ -278,18 +344,19 @@ class ServerArgs: 
     | 
|
| 
       278 
344 
     | 
    
         
             
                speculative_accept_threshold_acc: float = 1.0
         
     | 
| 
       279 
345 
     | 
    
         
             
                speculative_token_map: Optional[str] = None
         
     | 
| 
       280 
346 
     | 
    
         
             
                speculative_attention_mode: str = "prefill"
         
     | 
| 
      
 347 
     | 
    
         
            +
                # For ngram only
         
     | 
| 
      
 348 
     | 
    
         
            +
                speculative_ngram_min_match_window_size: int = 1
         
     | 
| 
      
 349 
     | 
    
         
            +
                speculative_ngram_max_match_window_size: int = 12
         
     | 
| 
      
 350 
     | 
    
         
            +
                speculative_ngram_min_bfs_breadth: int = 1
         
     | 
| 
      
 351 
     | 
    
         
            +
                speculative_ngram_max_bfs_breadth: int = 10
         
     | 
| 
      
 352 
     | 
    
         
            +
                speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
         
     | 
| 
      
 353 
     | 
    
         
            +
                speculative_ngram_branch_length: int = 18
         
     | 
| 
      
 354 
     | 
    
         
            +
                speculative_ngram_capacity: int = 10 * 1000 * 1000
         
     | 
| 
       281 
355 
     | 
    
         | 
| 
       282 
356 
     | 
    
         
             
                # Expert parallelism
         
     | 
| 
       283 
357 
     | 
    
         
             
                ep_size: int = 1
         
     | 
| 
       284 
     | 
    
         
            -
                moe_a2a_backend: Literal["none", "deepep"] = "none"
         
     | 
| 
       285 
     | 
    
         
            -
                moe_runner_backend:  
     | 
| 
       286 
     | 
    
         
            -
                    "auto",
         
     | 
| 
       287 
     | 
    
         
            -
                    "triton",
         
     | 
| 
       288 
     | 
    
         
            -
                    "triton_kernel",
         
     | 
| 
       289 
     | 
    
         
            -
                    "flashinfer_trtllm",
         
     | 
| 
       290 
     | 
    
         
            -
                    "flashinfer_cutlass",
         
     | 
| 
       291 
     | 
    
         
            -
                    "flashinfer_mxfp4",
         
     | 
| 
       292 
     | 
    
         
            -
                ] = "auto"
         
     | 
| 
      
 358 
     | 
    
         
            +
                moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
         
     | 
| 
      
 359 
     | 
    
         
            +
                moe_runner_backend: str = "auto"
         
     | 
| 
       293 
360 
     | 
    
         
             
                flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
         
     | 
| 
       294 
361 
     | 
    
         
             
                enable_flashinfer_allreduce_fusion: bool = False
         
     | 
| 
       295 
362 
     | 
    
         
             
                deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
         
@@ -309,6 +376,11 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+    mamba_full_memory_ratio: float = 0.9
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -322,6 +394,13 @@ class ServerArgs:
     # LMCache
     enable_lmcache: bool = False
 
+    # Ktransformers
+    kt_amx_weight_path: Optional[str] = None
+    kt_amx_method: Optional[str] = None
+    kt_cpuinfer: Optional[int] = None
+    kt_threadpool_count: Optional[int] = None
+    kt_num_gpu_experts: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -337,6 +416,12 @@ class ServerArgs:
     offload_prefetch_step: int = 1
     offload_mode: str = "cpu"
 
+    # Scoring configuration
+    # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+    # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+    # This enables efficient batch processing of multiple items against a single query.
+    multi_item_scoring_delimiter: Optional[Union[int]] = None
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
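The delimiter comment describes a simple wire format. As a plain-Python illustration (not sglang API; the helper name is hypothetical), a query and several items collapse into one token sequence like so:

```python
def build_multi_item_sequence(query_ids, item_ids_list, delimiter_id):
    """Lay out Query<delimiter>Item1<delimiter>Item2<delimiter>... as token IDs."""
    seq = list(query_ids)
    for item_ids in item_ids_list:
        seq.append(delimiter_id)  # the multi_item_scoring_delimiter token
        seq.extend(item_ids)
    return seq

# Query [11, 12] scored against items [21] and [22, 23], delimiter token 0:
assert build_multi_item_sequence([11, 12], [[21], [22, 23]], 0) == [11, 12, 0, 21, 0, 22, 23]
```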
         
@@ -349,17 +434,24 @@ class ServerArgs:
     enable_symm_mem: bool = False
     disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
+    disable_tokenizer_batch_decode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
+    enable_piecewise_cuda_graph: bool = False
     torch_compile_max_bs: int = 32
+    piecewise_cuda_graph_max_tokens: int = 4096
+    piecewise_cuda_graph_tokens: Optional[List[int]] = None
+    piecewise_cuda_graph_compiler: str = "eager"
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
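The piecewise-CUDA-graph fields added here work together: `piecewise_cuda_graph_tokens` holds the token-count buckets to capture, and is derived via `_generate_piecewise_cuda_graph_tokens()` later in `__post_init__` when left as `None`. Illustration only (not sglang's actual runtime code): a dispatcher would round a batch's token count up to the nearest captured bucket, falling back to eager beyond the largest one:

```python
import bisect

piecewise_cuda_graph_tokens = [256, 512, 1024, 2048, 4096]  # hypothetical buckets

def pick_bucket(num_tokens, buckets):
    """Smallest captured bucket that fits num_tokens, or None (run eager)."""
    i = bisect.bisect_left(buckets, num_tokens)
    return buckets[i] if i < len(buckets) else None

assert pick_bucket(300, piecewise_cuda_graph_tokens) == 512
assert pick_bucket(5000, piecewise_cuda_graph_tokens) is None
```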
         
@@ -369,15 +461,18 @@ class ServerArgs:
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
     numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
 
     # Dynamic batch tokenizer
     enable_dynamic_batch_tokenizer: bool = False
@@ -388,7 +483,6 @@ class ServerArgs:
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
-    debug_tensor_dump_prefill_only: bool = False
 
     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -398,79 +492,147 @@ class ServerArgs:
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
     # FIXME: hack to reduce ITL when decode bs is small
     disaggregation_decode_polling_interval: int = 1
 
-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
-
-    # Remote instance weight loading
     remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
     remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
     remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
 
     # For PD-Multiplexing
     enable_pdmux: bool = False
-
-
-    # Mamba cache
-    max_mamba_cache_size: Optional[int] = None
-    mamba_ssm_dtype: str = "float32"
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8
 
-
-
-
-
-
-
-
-
+    def get_attention_backends(server_args):
+        prefill_attention_backend_str = (
+            server_args.prefill_attention_backend
+            if server_args.prefill_attention_backend
+            else server_args.attention_backend
+        )
+        decode_attention_backend_str = (
+            server_args.decode_attention_backend
+            if server_args.decode_attention_backend
+            else server_args.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
 
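`get_attention_backends` resolves the effective per-phase backends, falling back to the global `attention_backend` whenever a phase-specific override is unset. A hedged usage sketch (backend names are illustrative; the `"dummy"` path takes the early return in `__post_init__`, defined just below):

```python
args = ServerArgs(model_path="dummy", attention_backend="flashinfer")
# No prefill/decode-specific override set, so both phases fall back:
assert args.get_attention_backends() == ("flashinfer", "flashinfer")

args = ServerArgs(
    model_path="dummy",
    attention_backend="flashinfer",
    decode_attention_backend="triton",  # decode-only override
)
assert args.get_attention_backends() == ("flashinfer", "triton")
```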
     def __post_init__(self):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+
+        if self.model_path.lower() in ["none", "dummy"]:
+            # Skip for dummy models
+            return
+
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
+
+        # Set missing default values.
+        self._handle_missing_default_values()
+
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle Ktransformers specific configs
+        self._handle_ktransformers_configs()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_a2a_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()
+
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
+
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+        # Handle elastic expert parallelism.
+        self._handle_elastic_ep()
+
+    def _handle_deprecated_args(self):
+        # handle deprecated tool call parsers
+        deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+        if self.tool_call_parser in deprecated_tool_call_parsers:
+            logger.warning(
+                f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
             )
+            self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
 
-
+    def _handle_ktransformers_configs(self):
+        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            override_config,
+        )
+
+        override_config(
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            self.kt_num_gpu_experts,
+            self.kt_cpuinfer,
+            self.kt_threadpool_count,
+            self.kt_amx_weight_path,
+            self.kt_amx_method,
+            self.chunked_prefill_size,
+        )
+
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
@@ -478,56 +640,165 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if self.
-            if self.
-
-
-
+    def _handle_gpu_memory_settings(self, gpu_mem):
+        """
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+          GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+          The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+          or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+          In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+          The activation memory is proportional to the chunked_prefill_size.
+          The cuda graph memory is proportional to the cuda_graph_max_bs.
+          We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in MB,
+          and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+          The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif is_npu() and gpu_mem < 32 * 1024:
+                # Atlas A2B4
+                # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 32768
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 64
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40,
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
                     else:
-
-
-
+                        self.cuda_graph_max_bs = 160
+            elif is_npu() and gpu_mem < 64 * 1024:
+                # Atlas A2 and Atlas A3
+                # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 32768
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 64
+                    else:
+                        self.cuda_graph_max_bs = 128
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            else:
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160
+
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)
 
-
+        if self.piecewise_cuda_graph_tokens is None:
+            self.piecewise_cuda_graph_tokens = (
+                self._generate_piecewise_cuda_graph_tokens()
+            )
+
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 512
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
             else:
-                self.
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem is not None and gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalone draft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
+                    reserved_mem += 2 * 1024
+
+            self.mem_fraction_static = (
+                round((gpu_mem - reserved_mem) / gpu_mem, 3)
+                if gpu_mem is not None
+                else 0.88
+            )
| 
       531 
802 
     | 
    
         | 
| 
       532 
803 
     | 
    
         
             
                        # Lazy init to avoid circular import
         
     | 
| 
       533 
804 
     | 
    
         
             
                        # Multimodal models need more memory for the image processor
         
     | 
| 
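The hunk above replaces a hard-coded `mem_fraction_static` with an explicit reserved-memory budget. For orientation, here is a self-contained sketch of that arithmetic with the `ServerArgs` fields flattened into plain arguments; all sizes are in MiB (matching the `60 * 1024` and `10 * 1024` checks in the diff), and the sample values in the usage line are hypothetical rather than sglang defaults.

```python
# A minimal sketch of the reserved-memory arithmetic above; values are MiB.
def estimate_mem_fraction_static(
    gpu_mem,                # total GPU memory in MiB, or None if unknown
    base_reserved,          # reserved_mem accumulated before this hunk
    chunked_prefill_size,   # <= 0 means chunked prefill is disabled
    max_prefill_tokens,
    cuda_graph_max_bs,
    tp_size,
    pp_size,
    dp_size,
    enable_dp_attention,
    speculative_algorithm,  # None, "STANDALONE", "NGRAM", "EAGLE", ...
):
    reserved_mem = base_reserved
    if chunked_prefill_size > 0:
        reserved_mem += max(chunked_prefill_size, 2048) * 1.5
    else:
        reserved_mem += max(max_prefill_tokens, 2048) * 1.5
    reserved_mem += cuda_graph_max_bs * 2          # cuda graphs
    reserved_mem += tp_size * pp_size / 8 * 1024   # large parallel sizes

    if enable_dp_attention:
        reserved_mem += cuda_graph_max_bs * dp_size * 3
        if cuda_graph_max_bs > 300:
            reserved_mem += cuda_graph_max_bs * dp_size * 1.5

    if gpu_mem is not None and gpu_mem > 60 * 1024:
        reserved_mem = max(reserved_mem, 10 * 1024)  # floor of 10 GiB

    if speculative_algorithm == "STANDALONE":
        reserved_mem += 6 * 1024
    elif speculative_algorithm not in (None, "NGRAM"):
        reserved_mem += 2 * 1024

    return round((gpu_mem - reserved_mem) / gpu_mem, 3) if gpu_mem is not None else 0.88


# e.g. an 80 GiB GPU (81920 MiB) with hypothetical settings:
print(estimate_mem_fraction_static(
    81920, 0, 8192, 16384, 256, 8, 1, 1, False, None))  # -> 0.831
```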
@@ -537,54 +808,266 @@ class ServerArgs:
         if model_config.is_multimodal:
             self.adjust_mem_fraction_for_vlm(model_config)
 
-
-
-
-
-
-
-
-
-
-
-
+    def _generate_cuda_graph_batch_sizes(self):
+        """
+        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+        This integrates the logic from cuda_graph_runner.py.
+        """
+        # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+        if self.disable_cuda_graph_padding:
+            capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+        elif self.speculative_algorithm is None:
+            # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1, 32))
+            capture_bs = (
+                [1, 2, 4, 8, 12]
+                + list(range(16, 257, 8))
+                + list(range(272, 512, 16))
+                + list(range(512, self.cuda_graph_max_bs + 1, 32))
+            )
+        else:
+            # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8)) + list(range(272, cuda_graph_max_bs + 1, 16))
+            capture_bs = (
+                list(range(1, 9, 1))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 4))
+                + list(range(72, 257, 8))
+                + list(range(272, self.cuda_graph_max_bs + 1, 16))
+            )
 
-
-
-
-
-
-
-
-
+        capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
+
+        return capture_bs
+
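The capture list uses graduated spacing: dense coverage for small batches, coarser buckets as the batch size grows, so padding waste stays bounded. A minimal sketch of the non-spec path (a standalone function for intuition, not the sglang API):

```python
# Standalone reproduction of the non-spec capture list above, assuming
# cuda graph padding is enabled.
def capture_batch_sizes(cuda_graph_max_bs: int) -> list[int]:
    capture_bs = (
        [1, 2, 4, 8, 12]
        + list(range(16, 257, 8))      # every 8 up to 256
        + list(range(272, 512, 16))    # every 16 up to 496
        + list(range(512, cuda_graph_max_bs + 1, 32))  # every 32 beyond
    )
    return [bs for bs in capture_bs if bs <= cuda_graph_max_bs]

sizes = capture_batch_sizes(160)
print(sizes[:8])   # [1, 2, 4, 8, 12, 16, 24, 32]
print(sizes[-1])   # 160
```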
+    def _generate_piecewise_cuda_graph_tokens(self):
+        """
+        Generate the list of token counts for piecewise CUDA graph capture
+        based on piecewise_cuda_graph_max_tokens.
+        """
+        capture_sizes = (
+            list(range(4, 33, 4))
+            + list(range(48, 257, 16))
+            + list(range(288, 513, 32))
+            + list(range(640, 4096 + 1, 128))
+            + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
+        )
+
+        capture_sizes = [
+            s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
+        ]
 
-
+        return capture_sizes
+
         
     | 
| 
       562 
861 
     | 
    
         
             
                    if self.device == "hpu":
         
     | 
| 
       563 
862 
     | 
    
         
             
                        self.attention_backend = "torch_native"
         
     | 
| 
       564 
863 
     | 
    
         
             
                        self.sampling_backend = "pytorch"
         
     | 
| 
       565 
864 
     | 
    
         | 
| 
       566 
     | 
    
         
            -
             
     | 
| 
       567 
     | 
    
         
            -
                    if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
         
     | 
| 
       568 
     | 
    
         
            -
                        self.model_specific_adjustments()
         
     | 
| 
       569 
     | 
    
         
            -
             
     | 
| 
       570 
     | 
    
         
            -
                    # Set kernel backends
         
     | 
| 
      
 865 
     | 
    
         
            +
                def _handle_cpu_backends(self):
         
     | 
| 
       571 
866 
     | 
    
         
             
                    if self.device == "cpu":
         
     | 
| 
       572 
867 
     | 
    
         
             
                        if self.attention_backend is None:
         
     | 
| 
       573 
868 
     | 
    
         
             
                            self.attention_backend = "intel_amx"
         
     | 
| 
       574 
869 
     | 
    
         
             
                        self.sampling_backend = "pytorch"
         
     | 
| 
       575 
870 
     | 
    
         | 
| 
      
 871 
     | 
    
         
            +
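This hunk continues the refactor visible throughout the diff: inline blocks become small `_handle_*` methods. Presumably a top-level setup path invokes them in a fixed order so later handlers can rely on fields set by earlier ones; the driver below is a hypothetical sketch of that pattern, not the sglang call site.

```python
# Hypothetical sketch of the refactoring pattern in this diff: each concern
# gets a `_handle_*` method, and a driver calls them in a fixed order.
class Args:
    device = "cpu"
    attention_backend = None
    sampling_backend = None

    def _handle_cpu_backends(self):
        if self.device == "cpu":
            if self.attention_backend is None:
                self.attention_backend = "intel_amx"
            self.sampling_backend = "pytorch"

    def _handle_sampling_backend(self):
        if self.sampling_backend is None:
            self.sampling_backend = "pytorch"

    def apply_defaults(self):
        # Later handlers may depend on fields set by earlier ones.
        for handler in (self._handle_cpu_backends, self._handle_sampling_backend):
            handler()

args = Args()
args.apply_defaults()
print(args.attention_backend, args.sampling_backend)  # intel_amx pytorch
```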
+    def _handle_model_specific_adjustments(self):
+        from sglang.srt.configs.model_config import is_deepseek_nsa
+
+        if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+            return
+
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
+            if is_cuda() and is_sm100_supported():
+                if (
+                    self.attention_backend is None
+                    and self.prefill_attention_backend is None
+                    and self.decode_attention_backend is None
+                ):
+                    self.attention_backend = "trtllm_mla"
+                    logger.info(
+                        "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if (
+                    self.quantization == "modelopt_fp4"
+                    and self.moe_runner_backend == "auto"
+                ):
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+
+        elif model_arch in ["GptOssForCausalLM"]:
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+
+            supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
+            prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
+            assert (
+                prefill_attn_backend in supported_backends
+                and decode_attn_backend in supported_backends
+            ), (
+                f"GptOssForCausalLM requires one of the {supported_backends} attention backends, but got the following backends\n"
+                f"- Prefill: {prefill_attn_backend}\n"
+                f"- Decode: {decode_attn_backend}\n"
+            )
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
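The MXFP4 branch keys off the `quantization_config` attached to the HF config. A minimal reproduction of that detection against a plain dict (the diff reads the field via `getattr` on a config object; the sample payload here is made up):

```python
# Minimal reproduction of the mxfp4 detection above, over a plain dict.
def is_mxfp4(hf_config: dict) -> bool:
    quantization_config = hf_config.get("quantization_config")
    return (
        quantization_config is not None
        and quantization_config.get("quant_method") == "mxfp4"
    )

print(is_mxfp4({"quantization_config": {"quant_method": "mxfp4"}}))  # True
print(is_mxfp4({}))                                                  # False
```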
                    elif "Llama4" in model_arch and self.device != "cpu":
         
     | 
| 
      
 965 
     | 
    
         
            +
                        assert self.attention_backend in {
         
     | 
| 
      
 966 
     | 
    
         
            +
                            "fa3",
         
     | 
| 
      
 967 
     | 
    
         
            +
                            "aiter",
         
     | 
| 
      
 968 
     | 
    
         
            +
                            "triton",
         
     | 
| 
      
 969 
     | 
    
         
            +
                        }, "fa3, aiter, or triton is required for Llama4 model"
         
     | 
| 
      
 970 
     | 
    
         
            +
                    elif model_arch in [
         
     | 
| 
      
 971 
     | 
    
         
            +
                        "Gemma2ForCausalLM",
         
     | 
| 
      
 972 
     | 
    
         
            +
                        "Gemma3ForCausalLM",
         
     | 
| 
      
 973 
     | 
    
         
            +
                        "Gemma3ForConditionalGeneration",
         
     | 
| 
      
 974 
     | 
    
         
            +
                        "Gemma3nForCausalLM",
         
     | 
| 
      
 975 
     | 
    
         
            +
                        "Gemma3nForConditionalGeneration",
         
     | 
| 
      
 976 
     | 
    
         
            +
                    ]:
         
     | 
| 
      
 977 
     | 
    
         
            +
                        # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
         
     | 
| 
      
 978 
     | 
    
         
            +
                        # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
         
     | 
| 
      
 979 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 980 
     | 
    
         
            +
                            f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
         
     | 
| 
      
 981 
     | 
    
         
            +
                        )
         
     | 
| 
      
 982 
     | 
    
         
            +
                        self.disable_hybrid_swa_memory = True
         
     | 
| 
      
 983 
     | 
    
         
            +
                    elif model_arch in ["Olmo2ForCausalLM"]:
         
     | 
| 
      
 984 
     | 
    
         
            +
                        # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
         
     | 
| 
      
 985 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 986 
     | 
    
         
            +
                            f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
         
     | 
| 
      
 987 
     | 
    
         
            +
                        )
         
     | 
| 
      
 988 
     | 
    
         
            +
                        self.disable_hybrid_swa_memory = True
         
     | 
| 
      
 989 
     | 
    
         
            +
             
     | 
| 
      
 990 
     | 
    
         
            +
                        if self.attention_backend is None:
         
     | 
| 
      
 991 
     | 
    
         
            +
                            if is_cuda() and is_sm100_supported():
         
     | 
| 
      
 992 
     | 
    
         
            +
                                self.attention_backend = "trtllm_mha"
         
     | 
| 
      
 993 
     | 
    
         
            +
                            elif is_cuda() and get_device_sm() >= 80:
         
     | 
| 
      
 994 
     | 
    
         
            +
                                self.attention_backend = "fa3"
         
     | 
| 
      
 995 
     | 
    
         
            +
                            else:
         
     | 
| 
      
 996 
     | 
    
         
            +
                                self.attention_backend = "triton"
         
     | 
| 
      
 997 
     | 
    
         
            +
             
     | 
| 
      
 998 
     | 
    
         
            +
                        # Flashinfer appears to degrade performance when sliding window attention
         
     | 
| 
      
 999 
     | 
    
         
            +
                        # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
         
     | 
| 
      
 1000 
     | 
    
         
            +
                        # but Olmo3 does.
         
     | 
| 
      
 1001 
     | 
    
         
            +
                        assert (
         
     | 
| 
      
 1002 
     | 
    
         
            +
                            self.attention_backend != "flashinfer"
         
     | 
| 
      
 1003 
     | 
    
         
            +
                        ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
         
     | 
| 
      
 1004 
     | 
    
         
            +
             
     | 
| 
      
 1005 
     | 
    
         
            +
                        logger.info(
         
     | 
| 
      
 1006 
     | 
    
         
            +
                            f"Using {self.attention_backend} as attention backend for {model_arch}."
         
     | 
| 
      
 1007 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1008 
     | 
    
         
            +
             
     | 
| 
      
 1009 
     | 
    
         
            +
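Several of these branches share a tiered default: trtllm_mha on SM100, fa3 on SM80+, triton otherwise. The same ladder as a pure function over compute capability (a hypothetical helper; the diff queries `is_sm100_supported()` and `get_device_sm()` instead):

```python
# Sketch of the tiered attention-backend default used above.
def pick_attention_backend(is_cuda: bool, sm: int) -> str:
    if is_cuda and sm >= 100:
        return "trtllm_mha"   # Blackwell
    if is_cuda and sm >= 80:
        return "fa3"          # Ampere/Hopper
    return "triton"           # portable fallback

print(pick_attention_backend(True, 100))  # trtllm_mha
print(pick_attention_backend(True, 90))   # fa3
print(pick_attention_backend(False, 0))   # triton
```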
+        if is_deepseek_nsa(hf_config):
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                self.attention_backend = "nsa"
+                logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+            if not is_npu():
+                self.enable_dp_attention = True
+                self.dp_size = self.tp_size
+                logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+                self.page_size = 64
+                logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+                # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+                import torch
+
+                major, _ = torch.cuda.get_device_capability()
+                if major >= 10:
+                    self.kv_cache_dtype = "fp8_e4m3"
+                    logger.warning("Setting KV cache dtype to fp8.")
+
+                if self.kv_cache_dtype == "fp8_e4m3":
+                    self.nsa_prefill_backend = "flashmla_kv"
+                    self.nsa_decode_backend = "flashmla_kv"
+                    logger.warning(
+                        "Setting NSA backend to flashmla_kv for FP8 KV Cache."
+                    )
+
+                # Logging env vars for NSA
+                from sglang.srt.layers.attention.nsa.utils import (
+                    print_nsa_bool_env_vars,
+                )
+
+                print_nsa_bool_env_vars()
+
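The NSA block is a cascade: Blackwell (compute capability 10+) forces an fp8 KV cache, and an fp8 KV cache in turn forces the flashmla_kv NSA backends. A sketch of that cascade; the dataclass and its starting values are illustrative assumptions, not sglang defaults.

```python
from dataclasses import dataclass

@dataclass
class NsaDefaults:
    kv_cache_dtype: str = "auto"
    nsa_prefill_backend: str = "flashmla_sparse"  # assumed starting value
    nsa_decode_backend: str = "fa3"               # assumed starting value

def apply_nsa_defaults(cfg: NsaDefaults, cc_major: int) -> NsaDefaults:
    # Blackwell and newer: force fp8 KV cache.
    if cc_major >= 10:
        cfg.kv_cache_dtype = "fp8_e4m3"
    # fp8 KV cache (forced or user-set): force the flashmla_kv backends.
    if cfg.kv_cache_dtype == "fp8_e4m3":
        cfg.nsa_prefill_backend = "flashmla_kv"
        cfg.nsa_decode_backend = "flashmla_kv"
    return cfg

print(apply_nsa_defaults(NsaDefaults(), cc_major=10).nsa_decode_backend)  # flashmla_kv
print(apply_nsa_defaults(NsaDefaults(), cc_major=9).nsa_decode_backend)   # fa3
```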
+    def _handle_sampling_backend(self):
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
             )
 
+    def _handle_attention_backend_compatibility(self):
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
 
-        if
+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+        if is_npu() and self.attention_backend in ["ascend"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
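Both branches above follow one shape: backends that cannot run under CUDA graphs force the flag off. A table-driven sketch of that guard (the set lists only the two backends handled in this hunk):

```python
# Backends that disable CUDA graph capture in this hunk.
CUDA_GRAPH_INCOMPATIBLE = {"torch_native", "flex_attention"}

def maybe_disable_cuda_graph(attention_backend: str, disable_cuda_graph: bool) -> bool:
    # Force the flag on for incompatible backends; otherwise keep it as set.
    if attention_backend in CUDA_GRAPH_INCOMPATIBLE:
        return True
    return disable_cuda_graph

print(maybe_disable_cuda_graph("flex_attention", False))  # True
print(maybe_disable_cuda_graph("fa3", False))             # False
```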
@@ -646,29 +1129,44 @@ class ServerArgs:
 
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
-                "Mixed chunk
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
             )
             self.enable_mixed_chunk = False
-            self.disable_cuda_graph = True
             self.disable_radix_cache = True
 
-
+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
+        if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
+            raise ValueError(
+                "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
+            )
+        if self.prefill_attention_backend == "fa4":
+            logger.warning(
+                f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
+            )
+            self.page_size = 128
+
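The intel_xpu and fa4 branches both clamp `page_size` to a backend-specific allowed set, falling back to 128. A sketch of the shared pattern; the allowed sets are copied from the diff, the helper itself is illustrative:

```python
# Allowed page sizes per backend, as handled in this hunk.
ALLOWED_PAGE_SIZES = {
    "intel_xpu": {32, 64, 128},
    "fa4_prefill": {128},
}

def clamp_page_size(backend: str, page_size: int) -> int:
    allowed = ALLOWED_PAGE_SIZES.get(backend)
    if allowed is not None and page_size not in allowed:
        return 128  # both backends fall back to 128
    return page_size

print(clamp_page_size("intel_xpu", 16))  # 128
print(clamp_page_size("intel_xpu", 64))  # 64
```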
+    def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1
 
-
+    def _handle_amd_specifics(self):
         if is_hip():
             self.triton_attention_num_kv_splits = 16
 
-
+    def _handle_grammar_backend(self):
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+    def _handle_data_parallelism(self):
         if self.dp_size == 1:
             self.enable_dp_attention = False
             self.enable_dp_lm_head = False
 
-        # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
             assert self.tp_size % self.dp_size == 0
@@ -682,7 +1180,7 @@ class ServerArgs:
             self.enable_dp_attention
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
-
+    def _handle_moe_kernel_config(self):
         if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
@@ -695,13 +1193,13 @@ class ServerArgs:
         if self.moe_runner_backend == "flashinfer_trtllm":
             assert (
                 self.quantization == "modelopt_fp4" or self.quantization == "fp8"
-            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
             self.disable_shared_experts_fusion = True
             logger.warning(
                 "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
             )
 
-
+    def _handle_a2a_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -711,6 +1209,13 @@ class ServerArgs:
             f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
         )
 
+        if self.moe_a2a_backend == "mooncake":
+            self.ep_size = self.tp_size
+            logger.warning(
+                f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
+    def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.warning(
@@ -725,6 +1230,16 @@ class ServerArgs:
         if self.enable_eplb:
             assert self.ep_size > 1
 
+    def _handle_elastic_ep(self):
+        if self.elastic_ep_backend is not None:
+            if self.enable_eplb:
+                if self.eplb_algorithm == "auto":
+                    self.eplb_algorithm = "elasticity_aware"
+                assert (
+                    self.eplb_algorithm == "elasticity_aware"
+                ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
+
+    def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
         ):
@@ -736,18 +1251,24 @@ class ServerArgs:
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000
 
-
+    def _handle_pipeline_parallelism(self):
         if self.pp_size > 1:
             self.disable_overlap_schedule = True
             logger.warning(
                 "Pipeline parallelism is incompatible with overlap schedule."
             )
 
-
+    def _handle_hicache(self):
         if self.hicache_storage_backend == "mooncake":
-
-
-
+            if self.hicache_mem_layout == "layer_first":
+                if self.hicache_io_backend == "direct":
+                    self.hicache_mem_layout = "page_first_direct"
+                elif self.hicache_io_backend == "kernel":
+                    self.hicache_mem_layout = "page_first"
+                logger.warning(
+                    f"Mooncake storage backend does not support layer_first layout, "
+                    f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
+                )
 
         if self.hicache_mem_layout == "page_first_direct":
             if self.hicache_io_backend != "direct":
@@ -756,24 +1277,34 @@ class ServerArgs:
                 "Page first direct layout only support direct io backend"
             )
 
-
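The Mooncake fallback rewrites `layer_first` according to the io backend. As a pure function (names match the diff; standalone sketch only):

```python
# Sketch of the Mooncake memory-layout fallback above.
def mooncake_mem_layout(mem_layout: str, io_backend: str) -> str:
    if mem_layout != "layer_first":
        return mem_layout
    if io_backend == "direct":
        return "page_first_direct"
    if io_backend == "kernel":
        return "page_first"
    return mem_layout  # unknown io backend: leave unchanged

print(mooncake_mem_layout("layer_first", "direct"))  # page_first_direct
print(mooncake_mem_layout("layer_first", "kernel"))  # page_first
```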
+    def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
-            # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
         if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
-            if self.speculative_algorithm == "STANDALONE":
+            if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
                 # TODO: support dp attention for standalone speculative decoding
-
-
-                )
+                raise ValueError(
+                    "Currently standalone speculative decoding does not support dp attention."
+                )
             if self.max_running_requests is None:
                 self.max_running_requests = 48
-
-
-
-
-
+                logger.warning(
+                    "Max running requests is reset to 48 for speculative decoding."
+                )
+
+            if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
+                self.disable_overlap_schedule = False
+                logger.warning(
+                    "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
+                )
+
+            if not self.enable_beta_spec:
+                self.disable_overlap_schedule = True
+                logger.warning(
+                    "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
+                )
+
                        if self.enable_mixed_chunk:
         
     | 
| 
       778 
1309 
     | 
    
         
             
                            self.enable_mixed_chunk = False
         
     | 
| 
       779 
1310 
     | 
    
         
             
                            logger.warning(
         
     | 
| 
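The hunk above moves speculative-decoding normalization into a dedicated `_handle_speculative_decoding` helper: the NEXTN alias is rewritten to EAGLE, standalone mode is rejected under dp attention, and the running-request cap defaults to 48. A minimal, self-contained sketch of that control flow (the `Args` dataclass is a stand-in for illustration, not sglang's `ServerArgs`):

```python
# Sketch of the normalization above; names mirror the diff, not a public API.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Args:
    speculative_algorithm: str
    enable_dp_attention: bool = False
    max_running_requests: Optional[int] = None


def normalize(args: Args) -> Args:
    if args.speculative_algorithm == "NEXTN":
        # NEXTN shares the EAGLE implementation, so it is rewritten early.
        args.speculative_algorithm = "EAGLE"
    if args.speculative_algorithm == "STANDALONE" and args.enable_dp_attention:
        raise ValueError("standalone speculative decoding does not support dp attention")
    if args.max_running_requests is None:
        # Speculative decoding caps concurrency; 48 mirrors the default above.
        args.max_running_requests = 48
    return args


print(normalize(Args("NEXTN")))  # -> EAGLE, max_running_requests=48
```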
@@ -783,12 +1314,12 @@ class ServerArgs:

             model_arch = self.get_hf_config().architectures[0]
             if model_arch in [
+                "DeepseekV32ForCausalLM",
                 "DeepseekV3ForCausalLM",
                 "Glm4MoeForCausalLM",
                 "BailingMoeForCausalLM",
                 "BailingMoeV2ForCausalLM",
             ]:
-                # Auto set draft_model_path DeepSeek-V3/R1
                 if self.speculative_draft_model_path is None:
                     self.speculative_draft_model_path = self.model_path
                 else:
@@ -796,7 +1327,6 @@ class ServerArgs:
                         "DeepSeek MTP does not require setting speculative_draft_model_path."
                     )

-            # Auto choose parameters
             if self.speculative_num_steps is None:
                 assert (
                     self.speculative_eagle_topk is None
@@ -836,11 +1366,43 @@ class ServerArgs:
                 "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
             )

-
-
-
+        if self.speculative_algorithm == "NGRAM":
+            if not self.device.startswith("cuda"):
+                raise ValueError(
+                    "Ngram speculative decoding only supports CUDA device."
+                )
+            if self.max_running_requests is None:
+                self.max_running_requests = 48
+            self.disable_overlap_schedule = True
+            self.enable_mixed_chunk = False
+            self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
+            if self.speculative_num_draft_tokens is None:
+                self.speculative_num_draft_tokens = (
+                    self.speculative_ngram_max_match_window_size
+                )
+            logger.warning(
+                "The overlap scheduler and mixed chunked prefill are disabled because of "
+                "using ngram speculative decoding."
+            )

-
+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
+                    f"with page_size({self.page_size}) > 1 is unstable "
+                    "and produces incorrect results for paged attention backends. "
+                    "This combination is only supported for the 'flashinfer' backend."
+                )
+            if self.enable_dp_attention:
+                # TODO: support dp attention for ngram speculative decoding
+                raise ValueError(
+                    "Currently ngram speculative decoding does not support dp attention."
+                )
+
+    def _handle_load_format(self):
         if (
             self.load_format == "auto" or self.load_format == "gguf"
         ) and check_gguf_file(self.model_path):
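The NGRAM hunk above carries one rule worth calling out: `speculative_eagle_topk > 1` combined with `page_size > 1` is only allowed on the `flashinfer` attention backend. A standalone restatement of that predicate (illustration only; parameter names mirror the diff, not a public API):

```python
# Sketch of the eagle_topk / page_size compatibility rule from the hunk above.
def check_topk_page_size(eagle_topk: int, page_size: int, attention_backend: str) -> None:
    if eagle_topk > 1 and page_size > 1 and attention_backend != "flashinfer":
        raise ValueError(
            f"speculative_eagle_topk({eagle_topk}) > 1 with page_size({page_size}) > 1 "
            "is only supported for the 'flashinfer' backend."
        )


check_topk_page_size(4, 1, "fa3")          # ok: page_size == 1
check_topk_page_size(4, 16, "flashinfer")  # ok: flashinfer backend
# check_topk_page_size(4, 16, "fa3")       # would raise ValueError
```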
@@ -848,6 +1410,7 @@ class ServerArgs:

         if is_remote_url(self.model_path):
             self.load_format = "remote"
+
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []

@@ -859,7 +1422,7 @@ class ServerArgs:
         ):
             self.load_format = "auto"

-
+    def _handle_disaggregation(self):
         if self.disaggregation_mode == "decode":
             assert (
                 self.disaggregation_decode_tp is None
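The load-format logic now split into `_handle_load_format` detects GGUF checkpoints and remote URLs before falling back to the requested format. A simplified stand-in for that detection (the real `check_gguf_file` and `is_remote_url` are sglang utilities; the versions here are deliberately naive):

```python
# Rough model of the load-format auto-detection above, for illustration only.
from urllib.parse import urlparse


def is_remote_url(path: str) -> bool:
    return urlparse(path).scheme in ("http", "https", "s3")


def detect_load_format(model_path: str, load_format: str = "auto") -> str:
    # sglang's check_gguf_file inspects the file; this sketch only checks the suffix.
    if load_format in ("auto", "gguf") and model_path.endswith(".gguf"):
        return "gguf"
    if is_remote_url(model_path):
        return "remote"
    return load_format


print(detect_load_format("model.gguf"))         # gguf
print(detect_load_format("s3://bucket/llama"))  # remote
```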
@@ -885,44 +1448,121 @@ class ServerArgs:

             self.disaggregation_prefill_pp = self.pp_size
             self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
-
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

-
+    def _handle_tokenizer_batching(self):
         if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
             raise ValueError(
                 "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
                 "Please choose one tokenizer batching approach."
             )

-
+        if self.skip_tokenizer_init:
+            if self.tokenizer_worker_num != 1:
+                logger.warning(
+                    "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
+                    f"(requested {self.tokenizer_worker_num})."
+                )
+                self.tokenizer_worker_num = 1
+
+            if self.enable_tokenizer_batch_encode:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
+                )
+                self.enable_tokenizer_batch_encode = False
+
+            if self.enable_dynamic_batch_tokenizer:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
+                )
+                self.enable_dynamic_batch_tokenizer = False
+
+    def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
         os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
-
-        # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
         )
+        os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
+            "1" if self.enable_deterministic_inference else "0"
+        )

+    def _handle_cache_compatibility(self):
         if self.enable_hierarchical_cache and self.disable_radix_cache:
             raise ValueError(
                 "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
                 "and cannot be used at the same time. Please use only one of them."
             )

+        if (
+            self.disaggregation_decode_enable_offload_kvcache
+            and self.disaggregation_mode != "decode"
+        ):
+            raise ValueError(
+                "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
+            )
+
+    def _handle_metrics_labels(self):
         if (
             not self.tokenizer_metrics_custom_labels_header
-            and self.tokenizer_metrics_allowed_labels
+            and self.tokenizer_metrics_allowed_custom_labels
         ):
             raise ValueError(
-                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-labels."
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
             )

+    def _handle_deterministic_inference(self):
+        if self.enable_deterministic_inference:
+            # Check sampling backend
+            self.sampling_backend = "pytorch"
+            logger.warning(
+                "Sampling backend is set to pytorch for deterministic inference."
+            )
+
+            # Check attention backend
+            if self.attention_backend is None:
+                # User didn't specify attention backend, fallback based on GPU architecture
+                if is_sm100_supported() or is_sm120_supported():
+                    # Blackwell and newer architectures
+                    self.attention_backend = "flashinfer"
+                else:
+                    # Hopper (SM90) and older architectures
+                    self.attention_backend = "fa3"
+                logger.warning(
+                    f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
+                    f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
+                )
+            elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+                # User explicitly specified an incompatible attention backend
+                raise ValueError(
+                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
+                    f"but you explicitly specified '{self.attention_backend}'."
+                )
+
+            # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
+            if self.attention_backend not in ["fa3", "triton"]:
+                self.disable_radix_cache = True
+                logger.warning(
+                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
+                )
+
+            # Check TP size
+            if self.tp_size > 1:
+                os.environ["NCCL_ALGO"] = "allreduce:tree"
+                self.disable_custom_all_reduce = True
+                logger.warning(
+                    "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
+                )
+
+    def _handle_other_validations(self):
+        pass
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+
         # Model and tokenizer
         parser.add_argument(
             "--model-path",
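The new `_handle_deterministic_inference` above picks an attention backend when none was requested: Blackwell-class parts (SM100/SM120) fall back to `flashinfer`, everything older to `fa3`, and an explicitly requested backend outside the allowed set is rejected. A sketch of that selection (the capability probe and the backend list are assumptions standing in for `is_sm100_supported()`/`is_sm120_supported()` and `DETERMINISTIC_ATTENTION_BACKEND_CHOICES`):

```python
from typing import Optional

# Assumed contents for the sketch; the real list lives in sglang's constants.
DETERMINISTIC_BACKENDS = ["fa3", "flashinfer", "triton"]


def pick_attention_backend(requested: Optional[str], is_blackwell: bool) -> str:
    if requested is None:
        # Fallback mirrors the hunk: Blackwell -> flashinfer, older -> fa3.
        return "flashinfer" if is_blackwell else "fa3"
    if requested not in DETERMINISTIC_BACKENDS:
        raise ValueError(f"{requested!r} is not supported for deterministic inference")
    return requested


print(pick_attention_backend(None, is_blackwell=False))     # fa3
print(pick_attention_backend("triton", is_blackwell=True))  # triton
```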
@@ -931,24 +1571,6 @@ class ServerArgs:
             help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
             required=True,
         )
-        parser.add_argument(
-            "--remote-instance-weight-loader-seed-instance-ip",
-            type=str,
-            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
-            help="The ip of the seed instance for loading weights from remote instance.",
-        )
-        parser.add_argument(
-            "--remote-instance-weight-loader-seed-instance-service-port",
-            type=int,
-            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
-            help="The service port of the seed instance for loading weights from remote instance.",
-        )
-        parser.add_argument(
-            "--remote-instance-weight-loader-send-weights-group-ports",
-            type=json_list_type,
-            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
-            help="The communication group ports for loading weights from remote instance.",
-        )
         parser.add_argument(
             "--tokenizer-path",
             type=str,
@@ -1060,6 +1682,11 @@ class ServerArgs:
             default=ServerArgs.port,
             help="The port of the HTTP server.",
         )
+        parser.add_argument(
+            "--grpc-mode",
+            action="store_true",
+            help="If set, use gRPC server instead of HTTP server.",
+        )
         parser.add_argument(
             "--skip-server-warmup",
             action="store_true",
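The new `--grpc-mode` flag switches the listener from HTTP to gRPC. One plausible way to exercise it, building the launch command as an argv list (the model ID is a placeholder; whether your deployment wants gRPC depends on the client stack):

```python
# Hedged usage sketch: launches the server in gRPC mode via subprocess.
import subprocess

cmd = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    "--grpc-mode",        # serve gRPC instead of HTTP
    "--port", "30000",
]
subprocess.run(cmd, check=True)
```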
@@ -1078,6 +1705,12 @@ class ServerArgs:
             default=ServerArgs.nccl_port,
             help="The port for NCCL distributed environment setup. Defaults to a random port.",
         )
+        parser.add_argument(
+            "--checkpoint-engine-wait-weights-before-ready",
+            action="store_true",
+            help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
+            "before serving inference requests.",
+        )

         # Quantization and data type
         parser.add_argument(
@@ -1110,12 +1743,56 @@ class ServerArgs:
             "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
             "default to 1.0, which may cause accuracy issues. ",
         )
+        parser.add_argument(
+            "--modelopt-quant",
+            type=str,
+            default=ServerArgs.modelopt_quant,
+            help="The ModelOpt quantization configuration. "
+            "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
+            "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-restore-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_restore_path,
+            help="Path to restore a previously saved ModelOpt quantized checkpoint. "
+            "If provided, the quantization process will be skipped and the model "
+            "will be loaded from this checkpoint.",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-save-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_save_path,
+            help="Path to save the ModelOpt quantized checkpoint after quantization. "
+            "This allows reusing the quantized model in future runs.",
+        )
+        parser.add_argument(
+            "--modelopt-export-path",
+            type=str,
+            default=ServerArgs.modelopt_export_path,
+            help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
+            "The exported model can then be used directly with SGLang for inference. "
+            "If not provided, the model will not be exported.",
+        )
+        parser.add_argument(
+            "--quantize-and-serve",
+            action="store_true",
+            default=ServerArgs.quantize_and_serve,
+            help="Quantize the model with ModelOpt and immediately serve it without exporting. "
+            "This is useful for development and prototyping. For production, it's recommended "
+            "to use separate quantization and deployment steps.",
+        )
         parser.add_argument(
             "--kv-cache-dtype",
             type=str,
             default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+        )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
         )

         # Memory and scheduling
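The ModelOpt flags added above compose into a quantize-once, reuse-later workflow: quantize at startup, cache the quantizer state, and optionally export an HF-format copy. A sketch of one such invocation (paths and the model ID are placeholders):

```python
# Hedged usage sketch for the new ModelOpt flags; adjust paths/model to taste.
import subprocess

cmd = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder
    "--modelopt-quant", "fp8",                           # one of the listed modes
    "--modelopt-checkpoint-save-path", "/tmp/mo_ckpt",   # reuse on the next run
    "--modelopt-export-path", "/tmp/mo_export",          # optional HF export
    "--quantize-and-serve",                              # dev/prototyping shortcut
]
subprocess.run(cmd, check=True)
```

Per the help text, the quantize-and-serve path is aimed at prototyping; production deployments are expected to quantize and deploy as separate steps.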
@@ -1163,6 +1840,30 @@ class ServerArgs:
             choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
+        parser.add_argument(
+            "--enable-priority-scheduling",
+            action="store_true",
+            default=ServerArgs.enable_priority_scheduling,
+            help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
+        )
+        parser.add_argument(
+            "--abort-on-priority-when-disabled",
+            action="store_true",
+            default=ServerArgs.abort_on_priority_when_disabled,
+            help="If set, abort requests that specify a priority when priority scheduling is disabled.",
+        )
+        parser.add_argument(
+            "--schedule-low-priority-values-first",
+            action="store_true",
+            default=ServerArgs.schedule_low_priority_values_first,
+            help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
+        )
+        parser.add_argument(
+            "--priority-scheduling-preemption-threshold",
+            type=int,
+            default=ServerArgs.priority_scheduling_preemption_threshold,
+            help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
+        )
         parser.add_argument(
             "--schedule-conservativeness",
             type=float,
     | 
|
| 
       1207 
1908 
     | 
    
         
             
                        default=ServerArgs.device,
         
     | 
| 
       1208 
1909 
     | 
    
         
             
                        help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         
     | 
| 
       1209 
1910 
     | 
    
         
             
                    )
         
     | 
| 
      
 1911 
     | 
    
         
            +
                    parser.add_argument(
         
     | 
| 
      
 1912 
     | 
    
         
            +
                        "--elastic-ep-backend",
         
     | 
| 
      
 1913 
     | 
    
         
            +
                        type=str,
         
     | 
| 
      
 1914 
     | 
    
         
            +
                        default=ServerArgs.elastic_ep_backend,
         
     | 
| 
      
 1915 
     | 
    
         
            +
                        choices=["none", "mooncake"],
         
     | 
| 
      
 1916 
     | 
    
         
            +
                        help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
         
     | 
| 
      
 1917 
     | 
    
         
            +
                    )
         
     | 
| 
      
 1918 
     | 
    
         
            +
                    parser.add_argument(
         
     | 
| 
      
 1919 
     | 
    
         
            +
                        "--mooncake-ib-device",
         
     | 
| 
      
 1920 
     | 
    
         
            +
                        type=str,
         
     | 
| 
      
 1921 
     | 
    
         
            +
                        default=ServerArgs.mooncake_ib_device,
         
     | 
| 
      
 1922 
     | 
    
         
            +
                        help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
         
     | 
| 
      
 1923 
     | 
    
         
            +
                        "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
         
     | 
| 
      
 1924 
     | 
    
         
            +
                        "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
         
     | 
| 
      
 1925 
     | 
    
         
            +
                    )
         
     | 
| 
       1210 
1926 
     | 
    
         
             
                    parser.add_argument(
         
     | 
| 
       1211 
1927 
     | 
    
         
             
                        "--tensor-parallel-size",
         
     | 
| 
       1212 
1928 
     | 
    
         
             
                        "--tp-size",
         
     | 
| 
@@ -1222,9 +1938,9 @@ class ServerArgs:
             help="The pipeline parallelism size.",
         )
         parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
             type=int,
-            default=ServerArgs.max_micro_batch_size,
+            default=ServerArgs.pp_max_micro_batch_size,
             help="The maximum micro batch size in pipeline parallelism.",
         )
         parser.add_argument(
@@ -1248,7 +1964,12 @@ class ServerArgs:
             "--constrained-json-whitespace-pattern",
             type=str,
             default=ServerArgs.constrained_json_whitespace_pattern,
-            help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+            help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
+        parser.add_argument(
+            "--constrained-json-disable-any-whitespace",
+            action="store_true",
+            help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
         )
         parser.add_argument(
             "--watchdog-timeout",
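For context, the "compact representation" enforced by the new `--constrained-json-disable-any-whitespace` flag is JSON with no syntactic whitespace between tokens, i.e. the shape `json.dumps` produces with tight separators:

```python
import json

payload = {"name": "sglang", "tags": ["diff", "args"]}
# Compact form: no spaces after ',' or ':' -- what the flag constrains output to.
print(json.dumps(payload, separators=(",", ":")))
# {"name":"sglang","tags":["diff","args"]}
```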
@@ -1338,16 +2059,16 @@ class ServerArgs:
             "--tokenizer-metrics-custom-labels-header",
             type=str,
             default=ServerArgs.tokenizer_metrics_custom_labels_header,
-            help="Specify the HTTP header for passing labels for tokenizer metrics.",
+            help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
         )
         parser.add_argument(
-            "--tokenizer-metrics-allowed-labels",
+            "--tokenizer-metrics-allowed-custom-labels",
             type=str,
             nargs="+",
-            default=ServerArgs.tokenizer_metrics_allowed_labels,
-            help="The labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
+            help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
             "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
-            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
         )
         parser.add_argument(
             "--bucket-time-to-first-token",
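The two metrics options above work as a pair: the server accepts only labels listed in `--tokenizer-metrics-allowed-custom-labels`, delivered as a JSON dict in the header named by `--tokenizer-metrics-custom-labels-header`. A sketch of the client side (the header name and endpoint are placeholders that would match your launch flags):

```python
# Hedged client-side sketch for the custom-labels header; run against a live server.
import json
import urllib.request

labels = {"label1": "value1", "label2": "value2"}  # must all be in the allow-list
req = urllib.request.Request(
    "http://localhost:30000/generate",             # placeholder endpoint
    data=json.dumps({"text": "hello"}).encode(),
    headers={
        "Content-Type": "application/json",
        "x-custom-labels": json.dumps(labels),     # header name set via the CLI flag
    },
)
# urllib.request.urlopen(req)  # uncomment against a running server
```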
@@ -1379,8 +2100,8 @@ class ServerArgs:
         bucket_rule = (
             "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
             "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
-            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); '
-            "<value2> ...' uses custom bucket values (e.g., '
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
         )
         parser.add_argument(
             "--prompt-tokens-buckets",
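The `tse <middle> <base> <count>` rule documented above places `count/2` points on each side of `middle`, at offsets `base**1 .. base**(count/2)`. Reproducing the documented `'tse 1000 2 8'` example (a sketch of the rule, not sglang's implementation):

```python
# Two-sides-exponential buckets, as described in the help text above.
def tse_buckets(middle: float, base: float, count: int) -> list:
    offsets = [base**i for i in range(1, count // 2 + 1)]
    return [middle - o for o in reversed(offsets)] + [middle] + [middle + o for o in offsets]


print(tse_buckets(1000.0, 2, 8))
# [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]
```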
@@ -1489,6 +2210,16 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
+        parser.add_argument(
+            "--sampling-defaults",
+            type=str,
+            choices=["openai", "model"],
+            default=ServerArgs.sampling_defaults,
+            help="Where to get default sampling parameters. "
+            "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+            "'model' uses the model's generation_config.json to get the recommended "
+            "sampling parameters if available. Default is 'model'.",
+        )
         parser.add_argument(
             "--tool-server",
             type=str,
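The new `--sampling-defaults` flag decides where unset sampling parameters fall back to. A hedged sketch of the resolution order the help text implies (`resolve_param` is a hypothetical helper, not sglang's API):

```python
# Hypothetical illustration of the fallback described in the help text.
OPENAI_DEFAULTS = {"temperature": 1.0, "top_p": 1.0}

def resolve_param(name, request_value, generation_config, sampling_defaults):
    if request_value is not None:       # an explicit request value always wins
        return request_value
    if sampling_defaults == "model" and name in generation_config:
        return generation_config[name]  # model's generation_config.json
    return OPENAI_DEFAULTS[name]        # SGLang/OpenAI defaults

# e.g., with --sampling-defaults model and a generation_config.json containing
# {"temperature": 0.6}, an unset temperature resolves to 0.6:
assert resolve_param("temperature", None, {"temperature": 0.6}, "model") == 0.6
```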
@@ -1598,12 +2329,27 @@ class ServerArgs:
             default=ServerArgs.max_loaded_loras,
             help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
         )
+        parser.add_argument(
+            "--lora-eviction-policy",
+            type=str,
+            default=DEFAULT_LORA_EVICTION_POLICY,
+            choices=["lru", "fifo"],
+            help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
-            default="triton",
+            choices=LORA_BACKEND_CHOICES,
+            default=ServerArgs.lora_backend,
             help="Choose the kernel backend for multi-LoRA serving.",
         )
+        parser.add_argument(
+            "--max-lora-chunk-size",
+            type=int,
+            default=ServerArgs.max_lora_chunk_size,
+            choices=[16, 32, 64, 128],
+            help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
+        )
 
         # Kernel backend
         parser.add_argument(
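The difference between the two `--lora-eviction-policy` values is only whether a cache hit refreshes an adapter's position in the eviction queue. A toy sketch (not the sglang memory pool):

```python
from collections import OrderedDict

# Toy model of the two policies; the real pool manages GPU/CPU weight buffers.
class AdapterPool:
    def __init__(self, capacity: int, policy: str = "lru"):
        self.capacity, self.policy = capacity, policy
        self.adapters: OrderedDict[str, object] = OrderedDict()

    def access(self, name: str) -> None:
        if name in self.adapters:
            if self.policy == "lru":     # a hit refreshes recency under LRU
                self.adapters.move_to_end(name)
            return                        # under FIFO, hits change nothing
        if len(self.adapters) >= self.capacity:
            self.adapters.popitem(last=False)  # evict the oldest entry
        self.adapters[name] = object()    # stand-in for loaded weights

pool = AdapterPool(capacity=2, policy="lru")
for name in ["a", "b", "a", "c"]:         # LRU evicts "b"; FIFO would evict "a"
    pool.access(name)
assert list(pool.adapters) == ["a", "c"]
```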
@@ -1644,16 +2390,29 @@ class ServerArgs:
         parser.add_argument(
             "--mm-attention-backend",
             type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
+            choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
             default=ServerArgs.mm_attention_backend,
             help="Set multimodal attention backend.",
         )
+        parser.add_argument(
+            "--nsa-prefill-backend",
+            default=ServerArgs.nsa_prefill_backend,
+            type=str,
+            choices=NSA_CHOICES,
+        )
+        parser.add_argument(
+            "--nsa-decode-backend",
+            default=ServerArgs.nsa_decode_backend,
+            type=str,
+            choices=NSA_CHOICES,
+        )
 
         # Speculative decoding
+        parser.add_argument("--enable-beta-spec", action="store_true")
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
@@ -1670,6 +2429,15 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--speculative-draft-load-format",
+            type=str,
+            default=ServerArgs.speculative_draft_load_format,
+            choices=LOAD_FORMAT_CHOICES,
+            help="The format of the draft model weights to load. "
+            "If not specified, will use the same format as --load-format. "
+            "Use 'dummy' to initialize draft model weights with random values for profiling.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1713,6 +2481,50 @@ class ServerArgs:
             help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
             default=ServerArgs.speculative_attention_mode,
         )
+        # Ngram speculative decoding
+        parser.add_argument(
+            "--speculative-ngram-min-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_match_window_size,
+            help="The minimum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_match_window_size,
+            help="The maximum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-min-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_bfs_breadth,
+            help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_bfs_breadth,
+            help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-match-type",
+            type=str,
+            choices=["BFS", "PROB"],
+            default=ServerArgs.speculative_ngram_match_type,
+            help="The match type for cache tree.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-branch-length",
+            type=int,
+            default=ServerArgs.speculative_ngram_branch_length,
+            help="The branch length for ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-capacity",
+            type=int,
+            default=ServerArgs.speculative_ngram_capacity,
+            help="The cache capacity for ngram speculative decoding.",
+        )
 
         # Expert parallelism
         parser.add_argument(
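The NGRAM algorithm and the knobs above come down to matching a recent window of the output against earlier context and proposing the tokens that followed the match. A toy sketch of the window-matching idea (the real implementation builds a cache tree and drafts via BFS or probability ranking, per `--speculative-ngram-match-type`):

```python
# Toy sketch of ngram drafting: not sglang's cache-tree implementation.
def propose_draft(tokens, min_window, max_window, branch_length):
    # Try the longest match window first (bounded by the knobs above).
    for w in range(min(max_window, len(tokens) - 1), min_window - 1, -1):
        pattern = tokens[-w:]
        # Scan history right-to-left for an earlier occurrence of the window.
        for i in range(len(tokens) - w - 1, -1, -1):
            if tokens[i : i + w] == pattern:
                # Draft the tokens that followed the earlier occurrence.
                return tokens[i + w : i + w + branch_length]
    return []  # no match: fall back to normal decoding

history = [1, 2, 3, 4, 2, 3]
assert propose_draft(history, 1, 3, 2) == [4, 2]  # matched window [2, 3]
```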
@@ -1726,22 +2538,14 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["none", "deepep"],
+            choices=["none", "deepep", "mooncake"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--moe-runner-backend",
             type=str,
-            choices=[
-                "auto",
-                "triton",
-                "triton_kernel",
-                "flashinfer_trtllm",
-                "flashinfer_cutlass",
-                "flashinfer_mxfp4",
-                "flashinfer_cutedsl",
-            ],
+            choices=MOE_RUNNER_BACKEND_CHOICES,
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
         )
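This hunk, like the earlier ones introducing `LORA_BACKEND_CHOICES`, `NSA_CHOICES`, and `LOAD_FORMAT_CHOICES`, replaces an inline choices list with a module-level constant. The diff does not show the constant's definition; judging from the removed lines it must cover at least the previously inline values. A sketch of the pattern:

```python
import argparse

# Pattern sketch: hoisting argparse choices into one importable constant so
# other modules can validate against the same list. The exact contents of
# sglang's MOE_RUNNER_BACKEND_CHOICES are not shown in this diff; the values
# below are simply the ones visible in the removed inline list.
MOE_RUNNER_BACKEND_CHOICES = [
    "auto",
    "triton",
    "triton_kernel",
    "flashinfer_trtllm",
    "flashinfer_cutlass",
    "flashinfer_mxfp4",
    "flashinfer_cutedsl",
]

parser = argparse.ArgumentParser()
parser.add_argument(
    "--moe-runner-backend",
    type=str,
    choices=MOE_RUNNER_BACKEND_CHOICES,
    default="auto",
)
```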
@@ -1855,6 +2659,12 @@ class ServerArgs:
             choices=["float32", "bfloat16"],
             help="The data type of the SSM states in mamba cache.",
         )
+        parser.add_argument(
+            "--mamba-full-memory-ratio",
+            type=float,
+            default=ServerArgs.mamba_full_memory_ratio,
+            help="The ratio of mamba state memory to full kv cache memory.",
+        )
 
         # Hierarchical cache
         parser.add_argument(
@@ -1881,6 +2691,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
+        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -1898,9 +2715,12 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs", "nixl"],
+            choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
             default=ServerArgs.hicache_storage_backend,
-            help="The storage backend for hierarchical KV cache.",
+            help="The storage backend for hierarchical KV cache. "
+            "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
+            "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
+            "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
         )
         parser.add_argument(
             "--hicache-storage-prefetch-policy",
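For the new `dynamic` value, the help text names three keys carried in `--hicache-storage-backend-extra-config`. A hedged sketch of how such a backend could be resolved at runtime (the loader and the `my_pkg` module shown here are assumptions for illustration, not sglang's actual code path):

```python
import importlib
import json

# Assumed resolution flow for --hicache-storage-backend dynamic; the three
# keys come from the help text above, everything else is illustrative.
extra_config = json.loads(
    '{"backend_name": "my_kv", '
    '"module_path": "my_pkg.storage", '
    '"class_name": "MyKVBackend"}'
)

def load_dynamic_backend(cfg):
    module = importlib.import_module(cfg["module_path"])
    backend_cls = getattr(module, cfg["class_name"])
    return cfg["backend_name"], backend_cls

# name, cls = load_dynamic_backend(extra_config)  # requires my_pkg installed
```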
@@ -1922,6 +2742,35 @@ class ServerArgs:
             help="Using LMCache as an alternative hierarchical cache solution",
         )
 
+        # Ktransformer server args
+        parser.add_argument(
+            "--kt-amx-weight-path",
+            type=str,
+            help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
+        )
+        parser.add_argument(
+            "--kt-amx-method",
+            type=str,
+            default="AMXINT4",
+            help="[ktransformers parameter] Quantization formats for CPU execution.",
+        )
+        parser.add_argument(
+            "--kt-cpuinfer",
+            type=int,
+            help="[ktransformers parameter] The number of CPUInfer threads.",
+        )
+        parser.add_argument(
+            "--kt-threadpool-count",
+            type=int,
+            default=2,
+            help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
+        )
+        parser.add_argument(
+            "--kt-num-gpu-experts",
+            type=int,
+            help="[ktransformers parameter] The number of GPU experts.",
+        )
+
         # Double Sparsity
         parser.add_argument(
             "--enable-double-sparsity",
@@ -1991,6 +2840,14 @@ class ServerArgs:
             help="Mode of offloading.",
         )
 
+        # Args for multi-item-scoring
+        parser.add_argument(
+            "--multi-item-scoring-delimiter",
+            type=int,
+            default=ServerArgs.multi_item_scoring_delimiter,
+            help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
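The delimiter-based layout in the help text is easy to picture: one query followed by each item, all separated by the delimiter token ID, so a single forward pass can score every item against the shared query prefix. A small sketch assuming plain token-ID lists:

```python
# Sketch of the sequence layout from the help text:
# Query<delimiter>Item1<delimiter>Item2<delimiter>...
def build_multi_item_sequence(query_ids, items_ids, delimiter_id):
    seq = list(query_ids)
    for item_ids in items_ids:
        seq.append(delimiter_id)
        seq.extend(item_ids)
    seq.append(delimiter_id)  # trailing delimiter, as in the help string
    return seq

assert build_multi_item_sequence([1, 2], [[7], [8, 9]], 0) == [1, 2, 0, 7, 0, 8, 9, 0]
```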
@@ -2049,6 +2906,11 @@ class ServerArgs:
             action="store_true",
             help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
         )
+        parser.add_argument(
+            "--disable-tokenizer-batch-decode",
+            action="store_true",
+            help="Disable batch decoding when decoding multiple completions.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
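The flag above exists because decoding n completions one `decode` call at a time costs more per-call overhead than a single batched call. Illustrated with a Hugging Face tokenizer (any tokenizer exposing `batch_decode` behaves the same; this is not sglang's internal code):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # downloads on first run
completions = [[318, 257, 1332], [318, 1194, 1332]]

# Default path: one call decodes all completions.
texts = tokenizer.batch_decode(completions)

# With --disable-tokenizer-batch-decode, conceptually this per-item loop runs
# instead (useful when a completion needs individual handling).
texts_fallback = [tokenizer.decode(ids) for ids in completions]
assert texts == texts_fallback
```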
@@ -2064,6 +2926,11 @@ class ServerArgs:
             action="store_true",
             help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-torch-symm-mem",
+            action="store_true",
+            help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -2089,6 +2956,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--enable-single-batch-overlap",
+            action="store_true",
+            help="Let computation and communication overlap within one micro batch.",
+        )
         parser.add_argument(
             "--tbo-token-distribution-threshold",
             type=float,
@@ -2100,12 +2972,36 @@ class ServerArgs:
             action="store_true",
             help="Optimize the model with torch.compile. Experimental feature.",
         )
+        parser.add_argument(
+            "--enable-piecewise-cuda-graph",
+            action="store_true",
+            help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-tokens",
+            type=json_list_type,
+            default=ServerArgs.piecewise_cuda_graph_tokens,
+            help="Set the list of tokens when using piecewise cuda graph.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-compiler",
+            type=str,
+            default=ServerArgs.piecewise_cuda_graph_compiler,
+            help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
+            choices=["eager", "inductor"],
+        )
         parser.add_argument(
             "--torch-compile-max-bs",
             type=int,
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--piecewise-cuda-graph-max-tokens",
+            type=int,
+            default=ServerArgs.piecewise_cuda_graph_max_tokens,
+            help="Set the maximum tokens when using piecewise cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
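Piecewise CUDA graphs for prefill typically capture graphs at a fixed set of token counts and pad each batch up to the nearest captured size, with anything beyond the maximum falling back to eager execution. A sketch of that dispatch as implied by the three flags above (assumed behavior, not sglang's scheduler code):

```python
# Assumed bucket-selection logic implied by --piecewise-cuda-graph-tokens
# and --piecewise-cuda-graph-max-tokens.
def pick_capture_size(num_tokens, capture_sizes, max_tokens):
    if num_tokens > max_tokens:
        return None  # too large: run eagerly instead of replaying a graph
    # Pad up to the smallest captured size that fits this prefill.
    return min((s for s in capture_sizes if s >= num_tokens), default=None)

capture_sizes = [256, 512, 1024, 2048]   # e.g. --piecewise-cuda-graph-tokens
assert pick_capture_size(300, capture_sizes, 2048) == 512
assert pick_capture_size(4096, capture_sizes, 2048) is None
```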
@@ -2158,6 +3054,11 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--enable-weights-cpu-backup",
+            action="store_true",
+            help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
+        )
         parser.add_argument(
             "--allow-auto-truncate",
             action="store_true",
@@ -2188,6 +3089,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
         parser.add_argument(
             "--enable-return-hidden-states",
             action="store_true",
@@ -2225,11 +3131,6 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
-        parser.add_argument(
-            "--debug-tensor-dump-prefill-only",
-            action="store_true",
-            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
-        )
         parser.add_argument(
             "--enable-dynamic-batch-tokenizer",
             action="store_true",
@@ -2295,6 +3196,11 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--disaggregation-decode-enable-offload-kvcache",
+            action="store_true",
+            help="Enable async KV cache offloading on decode server (PD mode).",
+        )
         parser.add_argument(
             "--num-reserved-decode-tokens",
             type=int,
@@ -2321,6 +3227,24 @@ class ServerArgs:
             action="store_true",
             help="Disable mmap while loading weight using safetensors.",
         )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-ip",
+            type=str,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-service-port",
+            type=int,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
 
         # For PD-Multiplexing
         parser.add_argument(
@@ -2328,6 +3252,12 @@ class ServerArgs:
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )
 
         parser.add_argument(
             "--sm-group-num",
         @@ -2336,41 +3266,55 @@ class ServerArgs: 
     | 
|
| 
       2336 
3266 
     | 
    
         
             
                        help="Number of sm partition groups.",
         
     | 
| 
       2337 
3267 
     | 
    
         
             
                    )
         
     | 
| 
       2338 
3268 
     | 
    
         | 
| 
      
 3269 
     | 
    
         
            +
                    # For deterministic inference
         
     | 
| 
      
 3270 
     | 
    
         
            +
                    parser.add_argument(
         
     | 
| 
      
 3271 
     | 
    
         
            +
                        "--enable-deterministic-inference",
         
     | 
| 
      
 3272 
     | 
    
         
            +
                        action="store_true",
         
     | 
| 
      
 3273 
     | 
    
         
            +
                        help="Enable deterministic inference mode with batch invariant ops.",
         
     | 
| 
      
 3274 
     | 
    
         
            +
                    )
         
     | 
| 
      
 3275 
     | 
    
         
            +
             
     | 
| 
       2339 
3276 
     | 
    
         
             
                    # Deprecated arguments
         
     | 
| 
2340 3277 |         parser.add_argument(
2341 3278 |             "--enable-ep-moe",
2342      |-            action=
2343      |-            help="
     3279 |+            action=DeprecatedAction,
     3280 |+            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2344 3281 |         )
2345 3282 |         parser.add_argument(
2346 3283 |             "--enable-deepep-moe",
2347      |-            action=
2348      |-            help="
     3284 |+            action=DeprecatedAction,
     3285 |+            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2349 3286 |         )
2350 3287 |         parser.add_argument(
2351 3288 |             "--enable-flashinfer-cutlass-moe",
2352      |-            action=
2353      |-            help="
     3289 |+            action=DeprecatedAction,
     3290 |+            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2354 3291 |         )
2355 3292 |         parser.add_argument(
2356 3293 |             "--enable-flashinfer-cutedsl-moe",
2357      |-            action=
2358      |-            help="
     3294 |+            action=DeprecatedAction,
     3295 |+            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2359 3296 |         )
2360 3297 |         parser.add_argument(
2361 3298 |             "--enable-flashinfer-trtllm-moe",
2362      |-            action=
2363      |-            help="
     3299 |+            action=DeprecatedAction,
     3300 |+            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2364 3301 |         )
2365 3302 |         parser.add_argument(
2366 3303 |             "--enable-triton-kernel-moe",
2367      |-            action=
2368      |-            help="
     3304 |+            action=DeprecatedAction,
     3305 |+            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2369 3306 |         )
2370 3307 |         parser.add_argument(
2371 3308 |             "--enable-flashinfer-mxfp4-moe",
2372      |-            action=
2373      |-            help="
     3309 |+            action=DeprecatedAction,
     3310 |+            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
     3311 |+        )
     3312 |+
     3313 |+        # Configuration file support
     3314 |+        parser.add_argument(
     3315 |+            "--config",
     3316 |+            type=str,
     3317 |+            help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2374 3318 |         )
2375 3319 |
2376 3320 |     @classmethod
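
All seven MoE flags above are rewired to the same `DeprecatedAction`, whose definition is outside this hunk. A minimal sketch under that assumption (an argparse action that refuses the flag and surfaces the migration hint from `help`):

```python
import argparse


class DeprecatedAction(argparse.Action):
    """Sketch only: reject a deprecated flag, pointing at its replacement."""

    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        # nargs=0 keeps the option a bare switch, so parsing reaches __call__.
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # The help text written in the diff becomes the error message.
        raise ValueError(self.help)
```

The real class may warn instead of raising; only the name and the help strings are taken from the diff.
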
@@ -2395,7 +3339,7 @@ class ServerArgs:
2395 3339 |             self.model_path,
2396 3340 |             trust_remote_code=self.trust_remote_code,
2397 3341 |             revision=self.revision,
2398      |-            model_override_args=
     3342 |+            model_override_args=orjson.loads(self.json_model_override_args),
2399 3343 |             **kwargs,
2400 3344 |         )
2401 3345 |         return hf_config
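
`model_override_args` is now parsed with `orjson.loads`, which accepts the raw `--json-model-override-args` string (str or bytes) and returns plain Python objects. A small illustration with a hypothetical override payload:

```python
import orjson

# Hypothetical payload for --json-model-override-args.
payload = '{"rope_scaling": {"rope_type": "yarn", "factor": 4.0}}'
overrides = orjson.loads(payload)
assert overrides["rope_scaling"]["factor"] == 4.0
```
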
@@ -2442,7 +3386,34 @@ class ServerArgs:
2442 3386 |                 self.chunked_prefill_size % self.page_size == 0
2443 3387 |             ), "chunked_prefill_size must be divisible by page_size"
2444 3388 |
2445      |-        # Check
     3389 |+        # Check pdmux
     3390 |+        if self.enable_pdmux:
     3391 |+            assert (
     3392 |+                self.pp_size == 1
     3393 |+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
     3394 |+            assert (
     3395 |+                self.chunked_prefill_size == -1
     3396 |+            ), "PD-Multiplexing is not compatible with chunked prefill."
     3397 |+            assert (
     3398 |+                self.disaggregation_mode == "null"
     3399 |+            ), "PD-Multiplexing is not compatible with disaggregation mode."
     3400 |+            assert (
     3401 |+                self.disable_overlap_schedule
     3402 |+            ), "PD-Multiplexing is not compatible with overlap schedule."
     3403 |+
     3404 |+            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
     3405 |+            import torch
     3406 |+
     3407 |+            parts = torch.__version__.split("+", 1)[0].split(".")
     3408 |+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
     3409 |+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
     3410 |+            if (major, minor) > (2, 6):
     3411 |+                logger.warning(
     3412 |+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
     3413 |+                    f"  Current torch version is {torch.__version__}.\n"
     3414 |+                    "  Please manually install torch 2.6.x."
     3415 |+                )
     3416 |+
2446 3417 |         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
2447 3418 |         self.validate_buckets_rule(
2448 3419 |             "--prompt-tokens-buckets", self.prompt_tokens_buckets
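
The torch version gate strips any local build suffix (e.g. `+cu121`) before comparing `(major, minor)` tuples, so only torch 2.6.x and earlier escape the warning. The same logic, isolated for clarity:

```python
def torch_version_tuple(version: str) -> tuple:
    # Mirrors the check in the diff: drop the "+..." suffix, keep major.minor.
    parts = version.split("+", 1)[0].split(".")
    major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
    minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
    return (major, minor)


assert torch_version_tuple("2.6.0") <= (2, 6)       # no warning
assert torch_version_tuple("2.8.0+cu121") > (2, 6)  # triggers the warning
```
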
@@ -2451,6 +3422,24 @@ class ServerArgs:
2451 3422 |             "--generation-tokens-buckets", self.generation_tokens_buckets
2452 3423 |         )
2453 3424 |
     3425 |+        # Check scheduling policy
     3426 |+        if self.enable_priority_scheduling:
     3427 |+            assert self.schedule_policy in [
     3428 |+                "fcfs",
     3429 |+                "lof",
     3430 |+            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
     3431 |+
     3432 |+        # Check multi-item scoring
     3433 |+        if self.multi_item_scoring_delimiter is not None:
     3434 |+            assert self.disable_radix_cache, (
     3435 |+                "Multi-item scoring requires radix cache to be disabled. "
     3436 |+                "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
     3437 |+            )
     3438 |+            assert self.chunked_prefill_size == -1, (
     3439 |+                "Multi-item scoring requires chunked prefill to be disabled. "
     3440 |+                "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
     3441 |+            )
     3442 |+
2454 3443 |     def check_lora_server_args(self):
2455 3444 |         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
2456 3445 |
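
Taken together, the two new checks pin down what a valid launch looks like for these features. A hypothetical `ServerArgs` that satisfies both (field values illustrative, not defaults):

```python
# Sketch: priority scheduling with an allowed policy, plus multi-item scoring
# with radix cache and chunked prefill turned off, as the asserts require.
args = ServerArgs(
    model_path="my-model",                # placeholder
    enable_priority_scheduling=True,
    schedule_policy="lof",
    multi_item_scoring_delimiter=128002,  # arbitrary delimiter token id
    disable_radix_cache=True,
    chunked_prefill_size=-1,
)
```
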
@@ -2534,6 +3523,12 @@ class ServerArgs:
2534 3523 |                     f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2535 3524 |                 )
2536 3525 |
     3526 |+            if self.max_lora_chunk_size is not None:
     3527 |+                assert (
     3528 |+                    16 <= self.max_lora_chunk_size <= 128
     3529 |+                    and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
     3530 |+                ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
     3531 |+
2537 3532 |     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2538 3533 |         larger_tp = max(decode_tp, prefill_tp)
2539 3534 |         smaller_tp = min(decode_tp, prefill_tp)
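
The chunk-size assert uses the standard bit trick: for positive `n`, `n & (n - 1)` clears the lowest set bit, so the result is zero exactly when `n` is a power of two. A quick standalone verification:

```python
def valid_chunk_size(n: int) -> bool:
    # Same predicate as the assert above.
    return 16 <= n <= 128 and (n & (n - 1)) == 0


assert [n for n in range(1, 256) if valid_chunk_size(n)] == [16, 32, 64, 128]
```
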
@@ -2551,8 +3546,8 @@ class ServerArgs:
2551 3546 |         assert rule in [
2552 3547 |             "tse",
2553 3548 |             "default",
2554      |-            "
2555      |-        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', '
     3549 |+            "custom",
     3550 |+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2556 3551 |
2557 3552 |         if rule == "tse":
2558 3553 |             assert (
@@ -2575,95 +3570,20 @@ class ServerArgs:
2575 3570 |                 len(buckets_rule) == 1
2576 3571 |             ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
2577 3572 |
2578      |-        elif rule == "
     3573 |+        elif rule == "custom":
2579 3574 |             assert (
2580 3575 |                 len(buckets_rule) >= 2
2581      |-            ), f"{arg_name}
     3576 |+            ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
2582 3577 |             try:
2583 3578 |                 bucket_values = [float(x) for x in buckets_rule[1:]]
2584 3579 |             except ValueError:
2585      |-                assert False, f"{arg_name}
     3580 |+                assert False, f"{arg_name} custom rule bucket values must be numeric"
2586 3581 |             assert len(set(bucket_values)) == len(
2587 3582 |                 bucket_values
2588      |-            ), f"{arg_name}
     3583 |+            ), f"{arg_name} custom rule bucket values should not contain duplicates"
2589 3584 |             assert all(
2590 3585 |                 val >= 0 for val in bucket_values
2591      |-            ), f"{arg_name}
2592      |-
2593      |-    def model_specific_adjustments(self):
2594      |-        hf_config = self.get_hf_config()
2595      |-        model_arch = hf_config.architectures[0]
2596      |-        if model_arch in ["GptOssForCausalLM"]:
2597      |-            if self.attention_backend is None:
2598      |-                if is_cuda() and is_sm100_supported():
2599      |-                    self.attention_backend = "trtllm_mha"
2600      |-                elif is_cuda() and is_sm90_supported():
2601      |-                    self.attention_backend = "fa3"
2602      |-                else:
2603      |-                    self.attention_backend = "triton"
2604      |-            supported_backends = ["triton", "trtllm_mha", "fa3"]
2605      |-            logger.info(
2606      |-                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2607      |-            )
2608      |-            assert (
2609      |-                self.attention_backend in supported_backends
2610      |-            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
2611      |-
2612      |-            if is_sm100_supported():
2613      |-                if not self.enable_dp_attention:
2614      |-                    self.enable_flashinfer_allreduce_fusion = True
2615      |-                    logger.info(
2616      |-                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2617      |-                    )
2618      |-            quantization_config = getattr(hf_config, "quantization_config", None)
2619      |-            is_mxfp4_quant_format = (
2620      |-                quantization_config is not None
2621      |-                and quantization_config.get("quant_method") == "mxfp4"
2622      |-            )
2623      |-
2624      |-            if is_sm100_supported() and is_mxfp4_quant_format:
2625      |-                self.moe_runner_backend = "flashinfer_mxfp4"
2626      |-                logger.warning(
2627      |-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2628      |-                )
2629      |-            else:
2630      |-                if self.moe_runner_backend == "triton_kernel":
2631      |-                    assert (
2632      |-                        self.ep_size == 1
2633      |-                    ), "Triton kernel MoE is only supported when ep_size == 1"
2634      |-                if (
2635      |-                    self.moe_runner_backend == "auto"
2636      |-                    and self.ep_size == 1
2637      |-                    and is_triton_kernels_available()
2638      |-                ):
2639      |-                    self.moe_runner_backend = "triton_kernel"
2640      |-                    logger.warning(
2641      |-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2642      |-                    )
2643      |-            self.disable_hybrid_swa_memory = True
2644      |-            if is_mxfp4_quant_format:
2645      |-                # use bf16 for mxfp4 triton kernels
2646      |-                self.dtype = "bfloat16"
2647      |-
2648      |-        elif "Llama4" in model_arch:
2649      |-            assert self.attention_backend in {
2650      |-                "fa3",
2651      |-                "aiter",
2652      |-                "triton",
2653      |-            }, "fa3, aiter, or triton is required for Llama4 model"
2654      |-        elif model_arch in [
2655      |-            "Gemma2ForCausalLM",
2656      |-            "Gemma3ForCausalLM",
2657      |-            "Gemma3ForConditionalGeneration",
2658      |-            "Gemma3nForCausalLM",
2659      |-            "Gemma3nForConditionalGeneration",
2660      |-        ]:
2661      |-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2662      |-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2663      |-            logger.warning(
2664      |-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2665      |-            )
2666      |-            self.disable_hybrid_swa_memory = True
     3586 |+            ), f"{arg_name} custom rule bucket values should be non-negative"
2667 3587 |
2668 3588 |     def adjust_mem_fraction_for_vlm(self, model_config):
2669 3589 |         vision_config = getattr(model_config.hf_config, "vision_config", None)
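
After this hunk, a `custom` bucket rule must carry at least one bucket value, and the values must be numeric, unique, and non-negative. Illustrative rules against those four checks (values made up):

```python
ok       = ["custom", 0, 64, 256, 1024]  # passes every check
bad_len  = ["custom"]                    # fails: needs at least one value
bad_type = ["custom", "x"]               # fails: not numeric
bad_dup  = ["custom", 64, 64.0]          # fails: duplicates after float()
bad_neg  = ["custom", -1]                # fails: negative value
```
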
@@ -2704,6 +3624,22 @@ class ServerArgs:
2704 3624 |         )
2705 3625 |
2706 3626 |
     3627 |+# NOTE: This is a global variable to hold the server args for scheduler.
     3628 |+_global_server_args: Optional[ServerArgs] = None
     3629 |+
     3630 |+
     3631 |+def set_global_server_args_for_scheduler(server_args: ServerArgs):
     3632 |+    global _global_server_args
     3633 |+    _global_server_args = server_args
     3634 |+
     3635 |+
     3636 |+def get_global_server_args() -> ServerArgs:
     3637 |+    if _global_server_args is None:
     3638 |+        raise ValueError("Global server args is not set yet!")
     3639 |+
     3640 |+    return _global_server_args
     3641 |+
     3642 |+
2707 3643 | def prepare_server_args(argv: List[str]) -> ServerArgs:
2708 3644 |     """
2709 3645 |     Prepare the server arguments from the command line arguments.
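
The new module-level holder is a plain set-then-get singleton: the scheduler installs its `ServerArgs` once, and later lookups fail loudly if that never happened. Usage, assuming this file is `sglang/srt/server_args.py`:

```python
from sglang.srt.server_args import (
    ServerArgs,
    get_global_server_args,
    set_global_server_args_for_scheduler,
)

args = ServerArgs(model_path="my-model")  # placeholder model path
set_global_server_args_for_scheduler(args)
assert get_global_server_args() is args
```
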
@@ -2715,14 +3651,35 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2715 3651 |     Returns:
2716 3652 |         The server arguments.
2717 3653 |     """
     3654 |+    # Import here to avoid circular imports
     3655 |+    from sglang.srt.server_args_config_parser import ConfigArgumentMerger
     3656 |+
     3657 |+    # Check for config file and merge arguments if present
     3658 |+    if "--config" in argv:
     3659 |+        # Extract boolean actions from the parser to handle them correctly
     3660 |+        parser = argparse.ArgumentParser()
     3661 |+        ServerArgs.add_cli_args(parser)
     3662 |+
     3663 |+        # Get boolean action destinations
     3664 |+        boolean_actions = []
     3665 |+        for action in parser._actions:
     3666 |+            if hasattr(action, "dest") and hasattr(action, "action"):
     3667 |+                if action.action in ["store_true", "store_false"]:
     3668 |+                    boolean_actions.append(action.dest)
     3669 |+
     3670 |+        # Merge config file arguments with CLI arguments
     3671 |+        config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
     3672 |+        argv = config_merger.merge_config_with_args(argv)
     3673 |+
2718 3674 |     parser = argparse.ArgumentParser()
2719 3675 |     ServerArgs.add_cli_args(parser)
2720 3676 |     raw_args = parser.parse_args(argv)
2721      |-
2722      |-    return
     3677 |+
     3678 |+    return ServerArgs.from_cli_args(raw_args)
2723 3679 |
2724 3680 |
2725 3681 | ZMQ_TCP_PORT_DELTA = 233
     3682 |+DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5
2726 3683 |
2727 3684 |
2728 3685 | @dataclasses.dataclass
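
With `--config`, `ConfigArgumentMerger` expands the YAML file into equivalent CLI tokens before the normal `argparse` pass, with the pre-collected `store_true`/`store_false` destinations telling it which keys are bare switches. A hypothetical config and invocation (the key spelling mirroring CLI flag names is an assumption):

```python
# config.yaml (hypothetical):
#   model-path: my-model
#   tp-size: 2
#   disable-radix-cache: true

from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args(["--config", "config.yaml"])
```
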
@@ -2747,7 +3704,11 @@ class PortArgs:
2747 3704 |     tokenizer_worker_ipc_name: Optional[str]
2748 3705 |
2749 3706 |     @staticmethod
2750      |-    def init_new(
     3707 |+    def init_new(
     3708 |+        server_args: ServerArgs,
     3709 |+        dp_rank: Optional[int] = None,
     3710 |+        worker_ports: Optional[List[int]] = None,
     3711 |+    ) -> PortArgs:
2751 3712 |         if server_args.nccl_port is None:
2752 3713 |             nccl_port = server_args.port + random.randint(100, 1000)
2753 3714 |             while True:
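
The widened `init_new` signature lets a data-parallel launcher hand each rank a pre-allocated scheduler input port instead of deriving one from the base port (see the `worker_ports[dp_rank]` hunk below). A hypothetical call for rank 0 of a two-rank setup (port values illustrative):

```python
# One scheduler input port per DP rank, chosen by the launcher.
port_args = PortArgs.init_new(server_args, dp_rank=0, worker_ports=[31001, 31002])
```
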
@@ -2794,8 +3755,8 @@ class PortArgs:
2794 3755 |                 # TokenizerManager to DataParallelController
2795 3756 |                 scheduler_input_port = port_base + 4
2796 3757 |             else:
2797      |-
2798      |-
     3758 |+                assert worker_ports is not None
     3759 |+                scheduler_input_port = worker_ports[dp_rank]
2799 3760 |         return PortArgs(
2800 3761 |             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
2801 3762 |             scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
@@ -2856,6 +3817,7 @@ def auto_choose_speculative_params(self: ServerArgs):
2856 3817 |         # The default value for llama
2857 3818 |         return (5, 4, 8)
2858 3819 |     elif arch in [
     3820 |+        "DeepseekV32ForCausalLM",
2859 3821 |         "DeepseekV3ForCausalLM",
2860 3822 |         "DeepseekV2ForCausalLM",
2861 3823 |         "GptOssForCausalLM",
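
For context, the triples returned by `auto_choose_speculative_params` feed the speculative-decoding settings, and `DeepseekV32ForCausalLM` now shares the branch used by DeepseekV3/V2 and GptOss. A sketch of how a caller might apply the result (attribute names assumed, not shown in this diff):

```python
# Assumed ServerArgs attribute names; the meaning of the triple is inferred
# from the "(5, 4, 8)" llama default above.
steps, topk, num_draft = auto_choose_speculative_params(server_args)
server_args.speculative_num_steps = steps
server_args.speculative_eagle_topk = topk
server_args.speculative_num_draft_tokens = num_draft
```
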