sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -13,20 +13,21 @@
13
13
  # ==============================================================================
14
14
  """The arguments of the server."""
15
15
 
16
+ from __future__ import annotations
17
+
16
18
  import argparse
17
19
  import dataclasses
18
20
  import json
19
21
  import logging
20
22
  import os
21
23
  import random
22
- import socket
23
- import sys
24
24
  import tempfile
25
- from typing import List, Literal, Optional, Union
25
+ from typing import Dict, List, Literal, Optional, Union
26
+
27
+ import orjson
26
28
 
27
29
  from sglang.srt.connector import ConnectorType
28
30
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
29
- from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
30
31
  from sglang.srt.lora.lora_registry import LoRARef
31
32
  from sglang.srt.parser.reasoning_parser import ReasoningParser
32
33
  from sglang.srt.utils import (
@@ -35,6 +36,7 @@ from sglang.srt.utils import (
35
36
  configure_ipv6,
36
37
  get_device,
37
38
  get_device_memory_capacity,
39
+ get_device_sm,
38
40
  is_cuda,
39
41
  is_flashinfer_available,
40
42
  is_hip,
@@ -43,12 +45,14 @@ from sglang.srt.utils import (
43
45
  is_remote_url,
44
46
  is_sm90_supported,
45
47
  is_sm100_supported,
48
+ is_sm120_supported,
46
49
  is_triton_kernels_available,
47
50
  is_valid_ipv6_address,
48
51
  json_list_type,
49
52
  nullable_str,
50
53
  parse_connector_type,
51
54
  )
55
+ from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
52
56
  from sglang.utils import is_in_ci
53
57
 
54
58
  logger = logging.getLogger(__name__)
@@ -79,6 +83,7 @@ QUANTIZATION_CHOICES = [
79
83
  "bitsandbytes",
80
84
  "gguf",
81
85
  "modelopt",
86
+ "modelopt_fp8",
82
87
  "modelopt_fp4",
83
88
  "petit_nvfp4",
84
89
  "w8a8_int8",
@@ -87,33 +92,59 @@ QUANTIZATION_CHOICES = [
87
92
  "qoq",
88
93
  "w4afp8",
89
94
  "mxfp4",
95
+ "compressed-tensors", # for Ktransformers
90
96
  ]
91
97
 
92
98
  ATTENTION_BACKEND_CHOICES = [
93
99
  # Common
94
100
  "triton",
95
101
  "torch_native",
102
+ "flex_attention",
103
+ "nsa",
96
104
  # NVIDIA specific
97
105
  "cutlass_mla",
98
106
  "fa3",
107
+ "fa4",
99
108
  "flashinfer",
100
109
  "flashmla",
101
110
  "trtllm_mla",
102
111
  "trtllm_mha",
103
112
  "dual_chunk_flash_attn",
104
- "hybrid_linear_attn",
105
113
  # AMD specific
106
114
  "aiter",
107
115
  "wave",
108
116
  # Other platforms
109
117
  "intel_amx",
110
118
  "ascend",
119
+ "intel_xpu",
111
120
  ]
112
121
 
122
+ LORA_BACKEND_CHOICES = ["triton", "csgmv"]
123
+
113
124
  DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
114
125
 
115
126
  GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
116
127
 
128
+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
129
+
130
+ DEFAULT_LORA_EVICTION_POLICY = "lru"
131
+
132
+ NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
133
+
134
+ RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
135
+
136
+ MOE_RUNNER_BACKEND_CHOICES = [
137
+ "auto",
138
+ "deep_gemm",
139
+ "triton",
140
+ "triton_kernel",
141
+ "flashinfer_trtllm",
142
+ "flashinfer_cutlass",
143
+ "flashinfer_mxfp4",
144
+ "flashinfer_cutedsl",
145
+ "cutlass",
146
+ ]
147
+
117
148
 
118
149
  # Allow external code to add more choices
119
150
  def add_load_format_choices(choices):
@@ -136,6 +167,18 @@ def add_grammar_backend_choices(choices):
136
167
  GRAMMAR_BACKEND_CHOICES.extend(choices)
137
168
 
138
169
 
170
+ def add_moe_runner_backend_choices(choices):
171
+ MOE_RUNNER_BACKEND_CHOICES.extend(choices)
172
+
173
+
174
+ def add_deterministic_attention_backend_choices(choices):
175
+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
176
+
177
+
178
+ def add_radix_eviction_policy_choices(choices):
179
+ RADIX_EVICTION_POLICY_CHOICES.extend(choices)
180
+
181
+
139
182
  @dataclasses.dataclass
140
183
  class ServerArgs:
141
184
  # Model and tokenizer
@@ -147,6 +190,11 @@ class ServerArgs:
147
190
  load_format: str = "auto"
148
191
  model_loader_extra_config: str = "{}"
149
192
  trust_remote_code: bool = False
193
+ modelopt_quant: Optional[Union[str, Dict]] = None
194
+ modelopt_checkpoint_restore_path: Optional[str] = None
195
+ modelopt_checkpoint_save_path: Optional[str] = None
196
+ modelopt_export_path: Optional[str] = None
197
+ quantize_and_serve: bool = False
150
198
  context_length: Optional[int] = None
151
199
  is_embedding: bool = False
152
200
  enable_multimodal: Optional[bool] = None
@@ -156,39 +204,50 @@ class ServerArgs:
156
204
  # HTTP server
157
205
  host: str = "127.0.0.1"
158
206
  port: int = 30000
207
+ grpc_mode: bool = False
159
208
  skip_server_warmup: bool = False
160
209
  warmups: Optional[str] = None
161
210
  nccl_port: Optional[int] = None
211
+ checkpoint_engine_wait_weights_before_ready: bool = False
162
212
 
163
213
  # Quantization and data type
164
214
  dtype: str = "auto"
165
215
  quantization: Optional[str] = None
166
216
  quantization_param_path: Optional[str] = None
167
217
  kv_cache_dtype: str = "auto"
218
+ enable_fp32_lm_head: bool = False
168
219
 
169
220
  # Memory and scheduling
170
221
  mem_fraction_static: Optional[float] = None
171
222
  max_running_requests: Optional[int] = None
172
- max_queued_requests: Optional[int] = sys.maxsize
223
+ max_queued_requests: Optional[int] = None
173
224
  max_total_tokens: Optional[int] = None
174
225
  chunked_prefill_size: Optional[int] = None
175
226
  max_prefill_tokens: int = 16384
176
227
  schedule_policy: str = "fcfs"
228
+ enable_priority_scheduling: bool = False
229
+ abort_on_priority_when_disabled: bool = False
230
+ schedule_low_priority_values_first: bool = False
231
+ priority_scheduling_preemption_threshold: int = 10
177
232
  schedule_conservativeness: float = 1.0
178
233
  page_size: Optional[int] = None
179
234
  hybrid_kvcache_ratio: Optional[float] = None
180
235
  swa_full_tokens_ratio: float = 0.8
181
236
  disable_hybrid_swa_memory: bool = False
237
+ radix_eviction_policy: str = "lru"
182
238
 
183
239
  # Runtime options
184
240
  device: Optional[str] = None
241
+ elastic_ep_backend: Literal[None, "mooncake"] = None
242
+ mooncake_ib_device: Optional[str] = None
185
243
  tp_size: int = 1
186
244
  pp_size: int = 1
187
- max_micro_batch_size: Optional[int] = None
245
+ pp_max_micro_batch_size: Optional[int] = None
188
246
  stream_interval: int = 1
189
247
  stream_output: bool = False
190
248
  random_seed: Optional[int] = None
191
249
  constrained_json_whitespace_pattern: Optional[str] = None
250
+ constrained_json_disable_any_whitespace: bool = False
192
251
  watchdog_timeout: float = 300
193
252
  dist_timeout: Optional[int] = None # timeout for torch.distributed
194
253
  download_dir: Optional[str] = None
@@ -205,8 +264,8 @@ class ServerArgs:
205
264
  show_time_cost: bool = False
206
265
  enable_metrics: bool = False
207
266
  enable_metrics_for_all_schedulers: bool = False
208
- tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
209
- tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
267
+ tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
268
+ tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
210
269
  bucket_time_to_first_token: Optional[List[float]] = None
211
270
  bucket_inter_token_latency: Optional[List[float]] = None
212
271
  bucket_e2e_request_latency: Optional[List[float]] = None
@@ -231,6 +290,7 @@ class ServerArgs:
231
290
  reasoning_parser: Optional[str] = None
232
291
  tool_call_parser: Optional[str] = None
233
292
  tool_server: Optional[str] = None
293
+ sampling_defaults: str = "model"
234
294
 
235
295
  # Data parallelism
236
296
  dp_size: int = 1
@@ -257,7 +317,9 @@ class ServerArgs:
257
317
  ] = None
258
318
  max_loaded_loras: Optional[int] = None
259
319
  max_loras_per_batch: int = 8
320
+ lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
260
321
  lora_backend: str = "triton"
322
+ max_lora_chunk_size: Optional[int] = 16
261
323
 
262
324
  # Kernel backend
263
325
  attention_backend: Optional[str] = None
@@ -266,11 +328,15 @@ class ServerArgs:
266
328
  sampling_backend: Optional[str] = None
267
329
  grammar_backend: Optional[str] = None
268
330
  mm_attention_backend: Optional[str] = None
331
+ nsa_prefill_backend: str = "flashmla_sparse"
332
+ nsa_decode_backend: str = "fa3"
269
333
 
270
334
  # Speculative decoding
335
+ enable_beta_spec: bool = False
271
336
  speculative_algorithm: Optional[str] = None
272
337
  speculative_draft_model_path: Optional[str] = None
273
338
  speculative_draft_model_revision: Optional[str] = None
339
+ speculative_draft_load_format: Optional[str] = None
274
340
  speculative_num_steps: Optional[int] = None
275
341
  speculative_eagle_topk: Optional[int] = None
276
342
  speculative_num_draft_tokens: Optional[int] = None
@@ -278,18 +344,19 @@ class ServerArgs:
278
344
  speculative_accept_threshold_acc: float = 1.0
279
345
  speculative_token_map: Optional[str] = None
280
346
  speculative_attention_mode: str = "prefill"
347
+ # For ngram only
348
+ speculative_ngram_min_match_window_size: int = 1
349
+ speculative_ngram_max_match_window_size: int = 12
350
+ speculative_ngram_min_bfs_breadth: int = 1
351
+ speculative_ngram_max_bfs_breadth: int = 10
352
+ speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
353
+ speculative_ngram_branch_length: int = 18
354
+ speculative_ngram_capacity: int = 10 * 1000 * 1000
281
355
 
282
356
  # Expert parallelism
283
357
  ep_size: int = 1
284
- moe_a2a_backend: Literal["none", "deepep"] = "none"
285
- moe_runner_backend: Literal[
286
- "auto",
287
- "triton",
288
- "triton_kernel",
289
- "flashinfer_trtllm",
290
- "flashinfer_cutlass",
291
- "flashinfer_mxfp4",
292
- ] = "auto"
358
+ moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
359
+ moe_runner_backend: str = "auto"
293
360
  flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
294
361
  enable_flashinfer_allreduce_fusion: bool = False
295
362
  deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -309,6 +376,11 @@ class ServerArgs:
309
376
  deepep_config: Optional[str] = None
310
377
  moe_dense_tp_size: Optional[int] = None
311
378
 
379
+ # Mamba cache
380
+ max_mamba_cache_size: Optional[int] = None
381
+ mamba_ssm_dtype: str = "float32"
382
+ mamba_full_memory_ratio: float = 0.9
383
+
312
384
  # Hierarchical cache
313
385
  enable_hierarchical_cache: bool = False
314
386
  hicache_ratio: float = 2.0
@@ -322,6 +394,13 @@ class ServerArgs:
322
394
  # LMCache
323
395
  enable_lmcache: bool = False
324
396
 
397
+ # Ktransformers
398
+ kt_amx_weight_path: Optional[str] = None
399
+ kt_amx_method: Optional[str] = None
400
+ kt_cpuinfer: Optional[int] = None
401
+ kt_threadpool_count: Optional[int] = None
402
+ kt_num_gpu_experts: Optional[int] = None
403
+
325
404
  # Double Sparsity
326
405
  enable_double_sparsity: bool = False
327
406
  ds_channel_config_path: Optional[str] = None
@@ -337,6 +416,12 @@ class ServerArgs:
337
416
  offload_prefetch_step: int = 1
338
417
  offload_mode: str = "cpu"
339
418
 
419
+ # Scoring configuration
420
+ # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
421
+ # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
422
+ # This enables efficient batch processing of multiple items against a single query.
423
+ multi_item_scoring_delimiter: Optional[Union[int]] = None
424
+
340
425
  # Optimization/debug options
341
426
  disable_radix_cache: bool = False
342
427
  cuda_graph_max_bs: Optional[int] = None
@@ -349,17 +434,24 @@ class ServerArgs:
349
434
  enable_symm_mem: bool = False
350
435
  disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
351
436
  enable_tokenizer_batch_encode: bool = False
437
+ disable_tokenizer_batch_decode: bool = False
352
438
  disable_outlines_disk_cache: bool = False
353
439
  disable_custom_all_reduce: bool = False
354
440
  enable_mscclpp: bool = False
441
+ enable_torch_symm_mem: bool = False
355
442
  disable_overlap_schedule: bool = False
356
443
  enable_mixed_chunk: bool = False
357
444
  enable_dp_attention: bool = False
358
445
  enable_dp_lm_head: bool = False
359
446
  enable_two_batch_overlap: bool = False
447
+ enable_single_batch_overlap: bool = False
360
448
  tbo_token_distribution_threshold: float = 0.48
361
449
  enable_torch_compile: bool = False
450
+ enable_piecewise_cuda_graph: bool = False
362
451
  torch_compile_max_bs: int = 32
452
+ piecewise_cuda_graph_max_tokens: int = 4096
453
+ piecewise_cuda_graph_tokens: Optional[List[int]] = None
454
+ piecewise_cuda_graph_compiler: str = "eager"
363
455
  torchao_config: str = ""
364
456
  enable_nan_detection: bool = False
365
457
  enable_p2p_check: bool = False
@@ -369,15 +461,18 @@ class ServerArgs:
369
461
  num_continuous_decode_steps: int = 1
370
462
  delete_ckpt_after_loading: bool = False
371
463
  enable_memory_saver: bool = False
464
+ enable_weights_cpu_backup: bool = False
372
465
  allow_auto_truncate: bool = False
373
466
  enable_custom_logit_processor: bool = False
374
467
  flashinfer_mla_disable_ragged: bool = False
375
468
  disable_shared_experts_fusion: bool = False
376
469
  disable_chunked_prefix_cache: bool = False
377
470
  disable_fast_image_processor: bool = False
471
+ keep_mm_feature_on_device: bool = False
378
472
  enable_return_hidden_states: bool = False
379
473
  scheduler_recv_interval: int = 1
380
474
  numa_node: Optional[List[int]] = None
475
+ enable_deterministic_inference: bool = False
381
476
 
382
477
  # Dynamic batch tokenizer
383
478
  enable_dynamic_batch_tokenizer: bool = False
@@ -388,7 +483,6 @@ class ServerArgs:
388
483
  debug_tensor_dump_output_folder: Optional[str] = None
389
484
  debug_tensor_dump_input_file: Optional[str] = None
390
485
  debug_tensor_dump_inject: bool = False
391
- debug_tensor_dump_prefill_only: bool = False
392
486
 
393
487
  # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
394
488
  disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -398,79 +492,147 @@ class ServerArgs:
398
492
  disaggregation_decode_dp: Optional[int] = None
399
493
  disaggregation_prefill_pp: Optional[int] = 1
400
494
  disaggregation_ib_device: Optional[str] = None
495
+ disaggregation_decode_enable_offload_kvcache: bool = False
401
496
  num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
402
-
403
497
  # FIXME: hack to reduce ITL when decode bs is small
404
498
  disaggregation_decode_polling_interval: int = 1
405
499
 
406
- # For model weight update
500
+ # For model weight update and weight loading
407
501
  custom_weight_loader: Optional[List[str]] = None
408
502
  weight_loader_disable_mmap: bool = False
409
-
410
- # Remote instance weight loading
411
503
  remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
412
504
  remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
413
505
  remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
414
506
 
415
507
  # For PD-Multiplexing
416
508
  enable_pdmux: bool = False
417
- sm_group_num: int = 3
418
-
419
- # Mamba cache
420
- max_mamba_cache_size: Optional[int] = None
421
- mamba_ssm_dtype: str = "float32"
509
+ pdmux_config_path: Optional[str] = None
510
+ sm_group_num: int = 8
422
511
 
423
- # Deprecated arguments
424
- enable_ep_moe: bool = False
425
- enable_deepep_moe: bool = False
426
- enable_flashinfer_cutlass_moe: bool = False
427
- enable_flashinfer_cutedsl_moe: bool = False
428
- enable_flashinfer_trtllm_moe: bool = False
429
- enable_triton_kernel_moe: bool = False
430
- enable_flashinfer_mxfp4_moe: bool = False
512
+ def get_attention_backends(server_args):
513
+ prefill_attention_backend_str = (
514
+ server_args.prefill_attention_backend
515
+ if server_args.prefill_attention_backend
516
+ else server_args.attention_backend
517
+ )
518
+ decode_attention_backend_str = (
519
+ server_args.decode_attention_backend
520
+ if server_args.decode_attention_backend
521
+ else server_args.attention_backend
522
+ )
523
+ return prefill_attention_backend_str, decode_attention_backend_str
431
524
 
432
525
  def __post_init__(self):
433
- # Check deprecated arguments
434
- if self.enable_ep_moe:
435
- self.ep_size = self.tp_size
436
- print_deprecated_warning(
437
- "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
438
- )
439
- if self.enable_deepep_moe:
440
- self.moe_a2a_backend = "deepep"
441
- print_deprecated_warning(
442
- "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
443
- )
444
- if self.enable_triton_kernel_moe:
445
- self.moe_runner_backend = "triton_kernel"
446
- print_deprecated_warning(
447
- "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
448
- )
449
- if self.enable_flashinfer_cutedsl_moe:
450
- self.moe_runner_backend = "flashinfer_cutedsl"
451
- print_deprecated_warning(
452
- "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
453
- )
454
- if self.enable_flashinfer_cutlass_moe:
455
- self.moe_runner_backend = "flashinfer_cutlass"
456
- print_deprecated_warning(
457
- "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
458
- )
459
- if self.enable_flashinfer_trtllm_moe:
460
- self.moe_runner_backend = "flashinfer_trtllm"
461
- print_deprecated_warning(
462
- "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
463
- )
464
- if self.enable_flashinfer_mxfp4_moe:
465
- self.moe_runner_backend = "flashinfer_mxfp4"
466
- print_deprecated_warning(
467
- "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
526
+ """
527
+ Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
528
+ """
529
+
530
+ if self.model_path.lower() in ["none", "dummy"]:
531
+ # Skip for dummy models
532
+ return
533
+
534
+ # Handle deprecated arguments.
535
+ self._handle_deprecated_args()
536
+
537
+ # Set missing default values.
538
+ self._handle_missing_default_values()
539
+
540
+ # Get GPU memory capacity, which is a common dependency for several configuration steps.
541
+ gpu_mem = get_device_memory_capacity(self.device)
542
+
543
+ # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
544
+ self._handle_gpu_memory_settings(gpu_mem)
545
+
546
+ # Handle device-specific backends.
547
+ self._handle_hpu_backends()
548
+ self._handle_cpu_backends()
549
+
550
+ # Apply model-specific adjustments.
551
+ self._handle_model_specific_adjustments()
552
+
553
+ # Set kernel backends.
554
+ self._handle_sampling_backend()
555
+ self._handle_attention_backend_compatibility()
556
+ self._handle_page_size()
557
+ self._handle_amd_specifics()
558
+ self._handle_grammar_backend()
559
+
560
+ # Handle Ktransformers specific configs
561
+ self._handle_ktransformers_configs()
562
+
563
+ # Handle data parallelism.
564
+ self._handle_data_parallelism()
565
+
566
+ # Handle MoE configurations.
567
+ self._handle_moe_kernel_config()
568
+ self._handle_a2a_moe()
569
+ self._handle_eplb_and_dispatch()
570
+ self._handle_expert_distribution_metrics()
571
+
572
+ # Handle pipeline parallelism.
573
+ self._handle_pipeline_parallelism()
574
+
575
+ # Handle Hicache settings.
576
+ self._handle_hicache()
577
+
578
+ # Handle speculative decoding logic.
579
+ self._handle_speculative_decoding()
580
+
581
+ # Handle model loading format.
582
+ self._handle_load_format()
583
+
584
+ # Handle PD disaggregation.
585
+ self._handle_disaggregation()
586
+
587
+ # Validate tokenizer settings.
588
+ self._handle_tokenizer_batching()
589
+
590
+ # Propagate environment variables.
591
+ self._handle_environment_variables()
592
+
593
+ # Validate cache settings.
594
+ self._handle_cache_compatibility()
595
+
596
+ # Validate metrics labels.
597
+ self._handle_metrics_labels()
598
+
599
+ # Handle deterministic inference.
600
+ self._handle_deterministic_inference()
601
+
602
+ # Handle any other necessary validations.
603
+ self._handle_other_validations()
604
+
605
+ # Handle elastic expert parallelism.
606
+ self._handle_elastic_ep()
607
+
608
+ def _handle_deprecated_args(self):
609
+ # handle deprecated tool call parsers
610
+ deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
611
+ if self.tool_call_parser in deprecated_tool_call_parsers:
612
+ logger.warning(
613
+ f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
468
614
  )
615
+ self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
469
616
 
470
- # Set missing default values
617
+ def _handle_ktransformers_configs(self):
618
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
619
+ CompressedTensorsWNA16AMXEPMoEMethod,
620
+ override_config,
621
+ )
622
+
623
+ override_config(
624
+ CompressedTensorsWNA16AMXEPMoEMethod,
625
+ self.kt_num_gpu_experts,
626
+ self.kt_cpuinfer,
627
+ self.kt_threadpool_count,
628
+ self.kt_amx_weight_path,
629
+ self.kt_amx_method,
630
+ self.chunked_prefill_size,
631
+ )
632
+
633
+ def _handle_missing_default_values(self):
471
634
  if self.tokenizer_path is None:
472
635
  self.tokenizer_path = self.model_path
473
-
474
636
  if self.served_model_name is None:
475
637
  self.served_model_name = self.model_path
476
638
  if self.device is None:
@@ -478,56 +640,165 @@ class ServerArgs:
478
640
  if self.random_seed is None:
479
641
  self.random_seed = random.randint(0, 1 << 30)
480
642
 
481
- gpu_mem = get_device_memory_capacity(self.device)
482
-
483
- # Set mem fraction static
484
- if self.mem_fraction_static is None:
485
- if gpu_mem is not None:
486
- # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
487
- # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
488
-
489
- # We want mem_fraction_static to be as large as possible but still has enough room
490
- # for activations and cuda graph buffers. We use the following heuristic to
491
- # compute the needed size for activations and cuda graph buffers:
492
- # - The size of the activation depends on the chunked_prefill_size and model size.
493
- # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
494
- # For GPUs with more memory, we use a larger chunked_prefill_size and
495
- # capture more cuda graphs, so they need to reserve more memory.
496
- parallel_size = self.tp_size * self.pp_size
497
-
498
- if gpu_mem < 20 * 1024:
499
- # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
500
- reserved_mem = (2.8 + parallel_size / 10) * 1024
501
- elif gpu_mem < 35 * 1024:
502
- # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
503
- reserved_mem = (2.8 + parallel_size / 10) * 1024
504
- elif gpu_mem < 90 * 1024:
505
- # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
506
- reserved_mem = (9.5 + parallel_size / 2) * 1024
507
- elif gpu_mem < 100 * 1024:
508
- # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
509
- reserved_mem = (12 + parallel_size / 2) * 1024
510
- elif gpu_mem < 160 * 1024:
511
- # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
512
- reserved_mem = (12 + parallel_size / 2) * 1024
513
- else:
514
- # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
515
- reserved_mem = 32 * 1024
516
-
517
- # draft model and larger cuda graph buffers
518
- if self.speculative_algorithm is not None:
519
- if self.speculative_algorithm == "STANDALONE":
520
- # Standalone speculative decoding needs more memory than other speculative
521
- # decoding algorithms since the draft model is typically larger.
522
- reserved_mem += 6 * 1024
643
+ def _handle_gpu_memory_settings(self, gpu_mem):
644
+ """
645
+ Configure GPU memory-dependent settings including
646
+ chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
647
+
648
+ Here are our heuristics:
649
+ - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
650
+ This is because GPUs with more memory are generally more powerful, we need to use a larger
651
+ chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
652
+ - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
653
+
654
+ GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
655
+
656
+ The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
657
+ or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
658
+
659
+ In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
660
+ The activation memory is proportional to the chunked_prefill_size.
661
+ The cuda graph memory is proportional to the cuda_graph_max_bs.
662
+ We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
663
+ and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
664
+
665
+ The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
666
+ """
667
+ if gpu_mem is not None:
668
+ if gpu_mem < 20 * 1024:
669
+ # T4, 4080
670
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
671
+ if self.chunked_prefill_size is None:
672
+ self.chunked_prefill_size = 2048
673
+ if self.cuda_graph_max_bs is None:
674
+ self.cuda_graph_max_bs = 8
675
+ elif is_npu() and gpu_mem < 32 * 1024:
676
+ # Atlas A2B4
677
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
678
+ if self.chunked_prefill_size is None:
679
+ self.chunked_prefill_size = 32768
680
+ if self.cuda_graph_max_bs is None:
681
+ if self.tp_size < 4:
682
+ self.cuda_graph_max_bs = 16
683
+ else:
684
+ self.cuda_graph_max_bs = 64
685
+ elif gpu_mem < 35 * 1024:
686
+ # A10, 4090, 5090
687
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
688
+ if self.chunked_prefill_size is None:
689
+ self.chunked_prefill_size = 2048
690
+ if self.cuda_graph_max_bs is None:
691
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
692
+ # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
693
+ # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
694
+ if self.tp_size < 4:
695
+ self.cuda_graph_max_bs = 16
696
+ else:
697
+ self.cuda_graph_max_bs = 80
698
+ elif gpu_mem < 60 * 1024:
699
+ # A100 (40GB), L40,
700
+ # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
701
+ if self.chunked_prefill_size is None:
702
+ self.chunked_prefill_size = 4096
703
+ if self.cuda_graph_max_bs is None:
704
+ if self.tp_size < 4:
705
+ self.cuda_graph_max_bs = 32
523
706
  else:
524
- reserved_mem += 2 * 1024
525
- if self.enable_dp_attention:
526
- reserved_mem += 4 * 1024
707
+ self.cuda_graph_max_bs = 160
708
+ elif is_npu() and gpu_mem < 64 * 1024:
709
+ # Atlas A2 and Atlas A3
710
+ # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
711
+ if self.chunked_prefill_size is None:
712
+ self.chunked_prefill_size = 32768
713
+ if self.cuda_graph_max_bs is None:
714
+ if self.tp_size < 4:
715
+ self.cuda_graph_max_bs = 64
716
+ else:
717
+ self.cuda_graph_max_bs = 128
718
+ elif gpu_mem < 90 * 1024:
719
+ # H100, A100
720
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
721
+ if self.chunked_prefill_size is None:
722
+ self.chunked_prefill_size = 8192
723
+ if self.cuda_graph_max_bs is None:
724
+ if self.tp_size < 4:
725
+ self.cuda_graph_max_bs = 256
726
+ else:
727
+ self.cuda_graph_max_bs = 512
728
+ elif gpu_mem < 160 * 1024:
729
+ # H20, H200
730
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
731
+ if self.chunked_prefill_size is None:
732
+ self.chunked_prefill_size = 8192
733
+ if self.cuda_graph_max_bs is None:
734
+ if self.tp_size < 4:
735
+ self.cuda_graph_max_bs = 256
736
+ else:
737
+ self.cuda_graph_max_bs = 512
738
+ else:
739
+ # B200, MI300
740
+ # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
741
+ if self.chunked_prefill_size is None:
742
+ self.chunked_prefill_size = 16384
743
+ if self.cuda_graph_max_bs is None:
744
+ self.cuda_graph_max_bs = 512
745
+ else:
746
+ # Fallback defaults when gpu_mem is None
747
+ if self.chunked_prefill_size is None:
748
+ self.chunked_prefill_size = 4096
749
+ if self.cuda_graph_max_bs is None:
750
+ self.cuda_graph_max_bs = 160
751
+
752
+ # Set cuda graph batch sizes
753
+ if self.cuda_graph_bs is None:
754
+ self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
755
+ else:
756
+ self.cuda_graph_max_bs = max(self.cuda_graph_bs)
527
757
 
528
- self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
758
+ if self.piecewise_cuda_graph_tokens is None:
759
+ self.piecewise_cuda_graph_tokens = (
760
+ self._generate_piecewise_cuda_graph_tokens()
761
+ )
762
+
763
+ if self.mem_fraction_static is None:
764
+ # Constant metadata (e.g., from attention backend)
765
+ reserved_mem = 512
766
+ # For activation during large prefill
767
+ if self.chunked_prefill_size > 0:
768
+ reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
529
769
  else:
530
- self.mem_fraction_static = 0.88
770
+ reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
771
+ # For cuda graphs
772
+ reserved_mem += self.cuda_graph_max_bs * 2
773
+ # Some adjustments for large parallel size
774
+ reserved_mem += self.tp_size * self.pp_size / 8 * 1024
775
+
776
+ if self.enable_dp_attention:
777
+ # DP attention needs more padding for some operations
778
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
779
+
780
+ # DP attention uses much more memory for large cuda graph max bs,
781
+ # likely due to some inefficiencies in torch allocator or our implementation.
782
+ # So we need to reserve more memory.
783
+ if self.cuda_graph_max_bs > 300:
784
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
785
+
786
+ if gpu_mem is not None and gpu_mem > 60 * 1024:
787
+ reserved_mem = max(reserved_mem, 10 * 1024)
788
+
789
+ if self.speculative_algorithm is not None:
790
+ if self.speculative_algorithm == "STANDALONE":
791
+ # standalone draft model and cuda graphs
792
+ reserved_mem += 6 * 1024
793
+ elif self.speculative_algorithm != "NGRAM":
794
+ # eagle draft models and cuda graphs
795
+ reserved_mem += 2 * 1024
796
+
797
+ self.mem_fraction_static = (
798
+ round((gpu_mem - reserved_mem) / gpu_mem, 3)
799
+ if gpu_mem is not None
800
+ else 0.88
801
+ )
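As a rough worked example of the reserved-memory heuristic above (a simplified sketch only; the helper name estimate_mem_fraction_static is hypothetical, units are MB, and the DP-attention and speculative-decoding adjustments are omitted): for an 80 GB GPU with the defaults of that tier (chunked_prefill_size=8192, cuda_graph_max_bs=256 for tp_size < 4), the computation works out as follows.

    def estimate_mem_fraction_static(
        gpu_mem_mb: float = 80 * 1024,       # ~80 GB GPU (H100/A100 tier)
        chunked_prefill_size: int = 8192,    # default chosen for the 60-90 GB tier
        cuda_graph_max_bs: int = 256,        # default for tp_size < 4
        tp_size: int = 1,
        pp_size: int = 1,
    ) -> float:
        """Simplified sketch of the reserved-memory heuristic above."""
        reserved_mb = 512                                     # constant metadata
        reserved_mb += max(chunked_prefill_size, 2048) * 1.5  # prefill activations
        reserved_mb += cuda_graph_max_bs * 2                  # cuda graph buffers
        reserved_mb += tp_size * pp_size / 8 * 1024           # parallelism overhead
        if gpu_mem_mb > 60 * 1024:
            reserved_mb = max(reserved_mb, 10 * 1024)         # floor for large GPUs
        return round((gpu_mem_mb - reserved_mb) / gpu_mem_mb, 3)

    # reserved ~= 512 + 12288 + 512 + 128 = 13440 MB, so the result is
    # (81920 - 13440) / 81920 ~= 0.836 for the defaults above.
    print(estimate_mem_fraction_static())  # 0.836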
531
802
 
532
803
  # Lazy init to avoid circular import
533
804
  # Multimodal models need more memory for the image processor
@@ -537,54 +808,266 @@ class ServerArgs:
537
808
  if model_config.is_multimodal:
538
809
  self.adjust_mem_fraction_for_vlm(model_config)
539
810
 
540
- # Set chunked prefill size, which depends on the gpu memory capacity
541
- if self.chunked_prefill_size is None:
542
- if gpu_mem is not None:
543
- if gpu_mem < 35 * 1024: # A10, L40, 4090
544
- self.chunked_prefill_size = 2048
545
- elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
546
- self.chunked_prefill_size = 8192
547
- else: # B200, MI300
548
- self.chunked_prefill_size = 16384
549
- else:
550
- self.chunked_prefill_size = 4096
811
+ def _generate_cuda_graph_batch_sizes(self):
812
+ """
813
+ Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
814
+ This integrates the logic from cuda_graph_runner.py.
815
+ """
816
+ # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
817
+ if self.disable_cuda_graph_padding:
818
+ capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
819
+ elif self.speculative_algorithm is None:
820
+ # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1, 32))
821
+ capture_bs = (
822
+ [1, 2, 4, 8, 12]
823
+ + list(range(16, 257, 8))
824
+ + list(range(272, 512, 16))
825
+ + list(range(512, self.cuda_graph_max_bs + 1, 32))
826
+ )
827
+ else:
828
+ # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8)) + list(range(272, cuda_graph_max_bs + 1, 16))
829
+ capture_bs = (
830
+ list(range(1, 9, 1))
831
+ + list(range(10, 33, 2))
832
+ + list(range(40, 64, 4))
833
+ + list(range(72, 257, 8))
834
+ + list(range(272, self.cuda_graph_max_bs + 1, 16))
835
+ )
551
836
 
552
- # Set cuda graph max batch size
553
- if self.cuda_graph_max_bs is None:
554
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
555
- if gpu_mem is not None and gpu_mem < 35 * 1024:
556
- if self.tp_size < 4:
557
- self.cuda_graph_max_bs = 8
558
- else:
559
- self.cuda_graph_max_bs = 80
837
+ capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
838
+
839
+ return capture_bs
840
+
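For reference, a standalone sketch of the default capture schedule built above, assuming padding is enabled and no speculative algorithm is set; the function name capture_batch_sizes is made up and only mirrors the list construction.

    def capture_batch_sizes(cuda_graph_max_bs: int = 160) -> list:
        """Sketch of the default (non-speculative) CUDA graph capture schedule."""
        bs = (
            [1, 2, 4, 8, 12]
            + list(range(16, 257, 8))      # every 8 up to 256
            + list(range(272, 512, 16))    # every 16 between 272 and 511
            + list(range(512, cuda_graph_max_bs + 1, 32))
        )
        return [b for b in bs if b <= cuda_graph_max_bs]

    print(capture_batch_sizes(160)[:6])   # [1, 2, 4, 8, 12, 16]
    print(capture_batch_sizes(160)[-3:])  # [144, 152, 160]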
841
+ def _generate_piecewise_cuda_graph_tokens(self):
842
+ """
843
+ Generate the list of token counts for piecewise CUDA graph capture
844
+ based on piecewise_cuda_graph_max_tokens.
845
+ """
846
+ capture_sizes = (
847
+ list(range(4, 33, 4))
848
+ + list(range(48, 257, 16))
849
+ + list(range(288, 513, 32))
850
+ + list(range(640, 4096 + 1, 128))
851
+ + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
852
+ )
853
+
854
+ capture_sizes = [
855
+ s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
856
+ ]
560
857
 
561
- # Set kernel backends for hpu device
858
+ return capture_sizes
859
+
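In the same spirit as the batch-size sketch above, the token schedule for piecewise capture can be previewed with a small hypothetical helper (piecewise_capture_tokens is not a real sglang function; it only mirrors the ranges in this diff).

    def piecewise_capture_tokens(max_tokens: int = 8192) -> list:
        """Sketch of the piecewise CUDA graph token schedule."""
        sizes = (
            list(range(4, 33, 4))
            + list(range(48, 257, 16))
            + list(range(288, 513, 32))
            + list(range(640, 4096 + 1, 128))
            + list(range(4352, max_tokens + 1, 256))
        )
        return [s for s in sizes if s <= max_tokens]

    print(piecewise_capture_tokens(8192)[:4])  # [4, 8, 12, 16]
    print(piecewise_capture_tokens(8192)[-1])  # 8192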
860
+ def _handle_hpu_backends(self):
562
861
  if self.device == "hpu":
563
862
  self.attention_backend = "torch_native"
564
863
  self.sampling_backend = "pytorch"
565
864
 
566
- # Model-specific adjustments
567
- if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
568
- self.model_specific_adjustments()
569
-
570
- # Set kernel backends
865
+ def _handle_cpu_backends(self):
571
866
  if self.device == "cpu":
572
867
  if self.attention_backend is None:
573
868
  self.attention_backend = "intel_amx"
574
869
  self.sampling_backend = "pytorch"
575
870
 
871
+ def _handle_model_specific_adjustments(self):
872
+ from sglang.srt.configs.model_config import is_deepseek_nsa
873
+
874
+ if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
875
+ return
876
+
877
+ hf_config = self.get_hf_config()
878
+ model_arch = hf_config.architectures[0]
879
+ if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
880
+ if is_cuda() and is_sm100_supported():
881
+ if (
882
+ self.attention_backend is None
883
+ and self.prefill_attention_backend is None
884
+ and self.decode_attention_backend is None
885
+ ):
886
+ self.attention_backend = "trtllm_mla"
887
+ logger.info(
888
+ "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
889
+ )
890
+ if not self.enable_dp_attention:
891
+ self.enable_flashinfer_allreduce_fusion = True
892
+ logger.info(
893
+ "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
894
+ )
895
+ if (
896
+ self.quantization == "modelopt_fp4"
897
+ and self.moe_runner_backend == "auto"
898
+ ):
899
+ self.moe_runner_backend = "flashinfer_trtllm"
900
+ logger.info(
901
+ "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
902
+ )
903
+
904
+ elif model_arch in ["GptOssForCausalLM"]:
905
+ if (
906
+ self.attention_backend is None
907
+ and self.prefill_attention_backend is None
908
+ and self.decode_attention_backend is None
909
+ ):
910
+ if is_cuda() and is_sm100_supported():
911
+ self.attention_backend = "trtllm_mha"
912
+ elif is_cuda() and is_sm90_supported():
913
+ self.attention_backend = "fa3"
914
+ else:
915
+ self.attention_backend = "triton"
916
+
917
+ supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
918
+ prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
919
+ assert (
920
+ prefill_attn_backend in supported_backends
921
+ and decode_attn_backend in supported_backends
922
+ ), (
923
+ f"GptOssForCausalLM requires one of the {supported_backends} attention backends, but got the following backends\n"
924
+ f"- Prefill: {prefill_attn_backend}\n"
925
+ f"- Decode: {decode_attn_backend}\n"
926
+ )
927
+
928
+ if is_sm100_supported():
929
+ if not self.enable_dp_attention:
930
+ self.enable_flashinfer_allreduce_fusion = True
931
+ logger.info(
932
+ "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
933
+ )
934
+ quantization_config = getattr(hf_config, "quantization_config", None)
935
+ is_mxfp4_quant_format = (
936
+ quantization_config is not None
937
+ and quantization_config.get("quant_method") == "mxfp4"
938
+ )
939
+
940
+ if is_sm100_supported() and is_mxfp4_quant_format:
941
+ self.moe_runner_backend = "flashinfer_mxfp4"
942
+ logger.warning(
943
+ "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
944
+ )
945
+ else:
946
+ if self.moe_runner_backend == "triton_kernel":
947
+ assert (
948
+ self.ep_size == 1
949
+ ), "Triton kernel MoE is only supported when ep_size == 1"
950
+ if (
951
+ self.moe_runner_backend == "auto"
952
+ and self.ep_size == 1
953
+ and is_triton_kernels_available()
954
+ ):
955
+ self.moe_runner_backend = "triton_kernel"
956
+ logger.warning(
957
+ "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
958
+ )
959
+ self.disable_hybrid_swa_memory = True
960
+ if is_mxfp4_quant_format:
961
+ # use bf16 for mxfp4 triton kernels
962
+ self.dtype = "bfloat16"
963
+
964
+ elif "Llama4" in model_arch and self.device != "cpu":
965
+ assert self.attention_backend in {
966
+ "fa3",
967
+ "aiter",
968
+ "triton",
969
+ }, "fa3, aiter, or triton is required for Llama4 model"
970
+ elif model_arch in [
971
+ "Gemma2ForCausalLM",
972
+ "Gemma3ForCausalLM",
973
+ "Gemma3ForConditionalGeneration",
974
+ "Gemma3nForCausalLM",
975
+ "Gemma3nForConditionalGeneration",
976
+ ]:
977
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
978
+ # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
979
+ logger.warning(
980
+ f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
981
+ )
982
+ self.disable_hybrid_swa_memory = True
983
+ elif model_arch in ["Olmo2ForCausalLM"]:
984
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
985
+ logger.warning(
986
+ f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
987
+ )
988
+ self.disable_hybrid_swa_memory = True
989
+
990
+ if self.attention_backend is None:
991
+ if is_cuda() and is_sm100_supported():
992
+ self.attention_backend = "trtllm_mha"
993
+ elif is_cuda() and get_device_sm() >= 80:
994
+ self.attention_backend = "fa3"
995
+ else:
996
+ self.attention_backend = "triton"
997
+
998
+ # Flashinfer appears to degrade performance when sliding window attention
999
+ # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
1000
+ # but Olmo3 does.
1001
+ assert (
1002
+ self.attention_backend != "flashinfer"
1003
+ ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
1004
+
1005
+ logger.info(
1006
+ f"Using {self.attention_backend} as attention backend for {model_arch}."
1007
+ )
1008
+
1009
+ if is_deepseek_nsa(hf_config):
1010
+ if (
1011
+ self.attention_backend is None
1012
+ and self.prefill_attention_backend is None
1013
+ and self.decode_attention_backend is None
1014
+ ):
1015
+ self.attention_backend = "nsa"
1016
+ logger.warning("Set nsa attention backend for DeepSeek NSA.")
1017
+
1018
+ if not is_npu():
1019
+ self.enable_dp_attention = True
1020
+ self.dp_size = self.tp_size
1021
+ logger.warning("DP attention is enabled for DeepSeek NSA.")
1022
+
1023
+ self.page_size = 64
1024
+ logger.warning("Setting page size to 64 for DeepSeek NSA.")
1025
+
1026
+ # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
1027
+ import torch
1028
+
1029
+ major, _ = torch.cuda.get_device_capability()
1030
+ if major >= 10:
1031
+ self.kv_cache_dtype = "fp8_e4m3"
1032
+ logger.warning("Setting KV cache dtype to fp8.")
1033
+
1034
+ if self.kv_cache_dtype == "fp8_e4m3":
1035
+ self.nsa_prefill_backend = "flashmla_kv"
1036
+ self.nsa_decode_backend = "flashmla_kv"
1037
+ logger.warning(
1038
+ "Setting NSA backend to flashmla_kv for FP8 KV Cache."
1039
+ )
1040
+
1041
+ # Logging env vars for NSA
1042
+ from sglang.srt.layers.attention.nsa.utils import (
1043
+ print_nsa_bool_env_vars,
1044
+ )
1045
+
1046
+ print_nsa_bool_env_vars()
1047
+
1048
+ def _handle_sampling_backend(self):
576
1049
  if self.sampling_backend is None:
577
1050
  self.sampling_backend = (
578
1051
  "flashinfer" if is_flashinfer_available() else "pytorch"
579
1052
  )
580
1053
 
1054
+ def _handle_attention_backend_compatibility(self):
581
1055
  if self.attention_backend == "torch_native":
582
1056
  logger.warning(
583
1057
  "Cuda graph is disabled because of using torch native attention backend"
584
1058
  )
585
1059
  self.disable_cuda_graph = True
586
1060
 
587
- if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
1061
+ if self.attention_backend == "flex_attention":
1062
+ logger.warning(
1063
+ "Cuda graph is disabled because of using torch Flex Attention backend"
1064
+ )
1065
+ self.disable_cuda_graph = True
1066
+ assert (
1067
+ self.speculative_algorithm is None
1068
+ ), "Speculative decoding is currently not supported with Flex Attention backend"
1069
+
1070
+ if is_npu() and self.attention_backend in ["ascend"]:
588
1071
  logger.warning(
589
1072
  "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
590
1073
  )
@@ -646,29 +1129,44 @@ class ServerArgs:
646
1129
 
647
1130
  if self.attention_backend == "dual_chunk_flash_attn":
648
1131
  logger.warning(
649
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
1132
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
650
1133
  )
651
1134
  self.enable_mixed_chunk = False
652
- self.disable_cuda_graph = True
653
1135
  self.disable_radix_cache = True
654
1136
 
655
- # Set page size
1137
+ if self.attention_backend == "intel_xpu":
1138
+ if self.page_size not in [32, 64, 128]:
1139
+ logger.warning(
1140
+ f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1141
+ )
1142
+ self.page_size = 128
1143
+ if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
1144
+ raise ValueError(
1145
+ "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
1146
+ )
1147
+ if self.prefill_attention_backend == "fa4":
1148
+ logger.warning(
1149
+ f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
1150
+ )
1151
+ self.page_size = 128
1152
+
1153
+ def _handle_page_size(self):
656
1154
  if self.page_size is None:
657
1155
  self.page_size = 1
658
1156
 
659
- # AMD-specific Triton attention KV splits default number
1157
+ def _handle_amd_specifics(self):
660
1158
  if is_hip():
661
1159
  self.triton_attention_num_kv_splits = 16
662
1160
 
663
- # Choose grammar backend
1161
+ def _handle_grammar_backend(self):
664
1162
  if self.grammar_backend is None:
665
1163
  self.grammar_backend = "xgrammar"
666
1164
 
1165
+ def _handle_data_parallelism(self):
667
1166
  if self.dp_size == 1:
668
1167
  self.enable_dp_attention = False
669
1168
  self.enable_dp_lm_head = False
670
1169
 
671
- # Data parallelism attention
672
1170
  if self.enable_dp_attention:
673
1171
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
674
1172
  assert self.tp_size % self.dp_size == 0
@@ -682,7 +1180,7 @@ class ServerArgs:
682
1180
  self.enable_dp_attention
683
1181
  ), "Please enable dp attention when setting enable_dp_lm_head. "
684
1182
 
685
- # MoE kernel
1183
+ def _handle_moe_kernel_config(self):
686
1184
  if self.moe_runner_backend == "flashinfer_cutlass":
687
1185
  assert (
688
1186
  self.quantization == "modelopt_fp4"
@@ -695,13 +1193,13 @@ class ServerArgs:
695
1193
  if self.moe_runner_backend == "flashinfer_trtllm":
696
1194
  assert (
697
1195
  self.quantization == "modelopt_fp4" or self.quantization == "fp8"
698
- ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
1196
+ ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
699
1197
  self.disable_shared_experts_fusion = True
700
1198
  logger.warning(
701
1199
  "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
702
1200
  )
703
1201
 
704
- # DeepEP MoE
1202
+ def _handle_a2a_moe(self):
705
1203
  if self.moe_a2a_backend == "deepep":
706
1204
  if self.deepep_mode == "normal":
707
1205
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -711,6 +1209,13 @@ class ServerArgs:
711
1209
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
712
1210
  )
713
1211
 
1212
+ if self.moe_a2a_backend == "mooncake":
1213
+ self.ep_size = self.tp_size
1214
+ logger.warning(
1215
+ f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
1216
+ )
1217
+
1218
+ def _handle_eplb_and_dispatch(self):
714
1219
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
715
1220
  self.expert_distribution_recorder_mode = "stat"
716
1221
  logger.warning(
@@ -725,6 +1230,16 @@ class ServerArgs:
725
1230
  if self.enable_eplb:
726
1231
  assert self.ep_size > 1
727
1232
 
1233
+ def _handle_elastic_ep(self):
1234
+ if self.elastic_ep_backend is not None:
1235
+ if self.enable_eplb:
1236
+ if self.eplb_algorithm == "auto":
1237
+ self.eplb_algorithm = "elasticity_aware"
1238
+ assert (
1239
+ self.eplb_algorithm == "elasticity_aware"
1240
+ ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
1241
+
1242
+ def _handle_expert_distribution_metrics(self):
728
1243
  if self.enable_expert_distribution_metrics and (
729
1244
  self.expert_distribution_recorder_mode is None
730
1245
  ):
@@ -736,18 +1251,24 @@ class ServerArgs:
736
1251
  elif self.expert_distribution_recorder_mode is not None:
737
1252
  self.expert_distribution_recorder_buffer_size = 1000
738
1253
 
739
- # Pipeline parallelism
1254
+ def _handle_pipeline_parallelism(self):
740
1255
  if self.pp_size > 1:
741
1256
  self.disable_overlap_schedule = True
742
1257
  logger.warning(
743
1258
  "Pipeline parallelism is incompatible with overlap schedule."
744
1259
  )
745
1260
 
746
- # Hicache
1261
+ def _handle_hicache(self):
747
1262
  if self.hicache_storage_backend == "mooncake":
748
- # to use mooncake storage backend, the following conditions must be met:
749
- self.hicache_io_backend = "kernel"
750
- self.hicache_mem_layout = "page_first"
1263
+ if self.hicache_mem_layout == "layer_first":
1264
+ if self.hicache_io_backend == "direct":
1265
+ self.hicache_mem_layout = "page_first_direct"
1266
+ elif self.hicache_io_backend == "kernel":
1267
+ self.hicache_mem_layout = "page_first"
1268
+ logger.warning(
1269
+ f"Mooncake storage backend does not support layer_first layout, "
1270
+ f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
1271
+ )
751
1272
 
752
1273
  if self.hicache_mem_layout == "page_first_direct":
753
1274
  if self.hicache_io_backend != "direct":
@@ -756,24 +1277,34 @@ class ServerArgs:
756
1277
  "Page first direct layout only support direct io backend"
757
1278
  )
758
1279
 
759
- # Speculative Decoding
1280
+ def _handle_speculative_decoding(self):
760
1281
  if self.speculative_algorithm == "NEXTN":
761
- # NEXTN shares the same implementation of EAGLE
762
1282
  self.speculative_algorithm = "EAGLE"
763
1283
 
764
1284
  if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
765
- if self.speculative_algorithm == "STANDALONE":
1285
+ if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
766
1286
  # TODO: support dp attention for standalone speculative decoding
767
- assert (
768
- self.enable_dp_attention is False
769
- ), "Currently standalone speculative decoding does not support dp attention."
1287
+ raise ValueError(
1288
+ "Currently standalone speculative decoding does not support dp attention."
1289
+ )
770
1290
  if self.max_running_requests is None:
771
1291
  self.max_running_requests = 48
772
- self.disable_overlap_schedule = True
773
- logger.warning(
774
- "Overlap scheduler is disabled because of using "
775
- "eagle speculative decoding."
776
- )
1292
+ logger.warning(
1293
+ "Max running requests is reset to 48 for speculative decoding."
1294
+ )
1295
+
1296
+ if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
1297
+ self.disable_overlap_schedule = False
1298
+ logger.warning(
1299
+ "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
1300
+ )
1301
+
1302
+ if not self.enable_beta_spec:
1303
+ self.disable_overlap_schedule = True
1304
+ logger.warning(
1305
+ "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
1306
+ )
1307
+
777
1308
  if self.enable_mixed_chunk:
778
1309
  self.enable_mixed_chunk = False
779
1310
  logger.warning(
@@ -783,12 +1314,12 @@ class ServerArgs:
783
1314
 
784
1315
  model_arch = self.get_hf_config().architectures[0]
785
1316
  if model_arch in [
1317
+ "DeepseekV32ForCausalLM",
786
1318
  "DeepseekV3ForCausalLM",
787
1319
  "Glm4MoeForCausalLM",
788
1320
  "BailingMoeForCausalLM",
789
1321
  "BailingMoeV2ForCausalLM",
790
1322
  ]:
791
- # Auto set draft_model_path DeepSeek-V3/R1
792
1323
  if self.speculative_draft_model_path is None:
793
1324
  self.speculative_draft_model_path = self.model_path
794
1325
  else:
@@ -796,7 +1327,6 @@ class ServerArgs:
796
1327
  "DeepSeek MTP does not require setting speculative_draft_model_path."
797
1328
  )
798
1329
 
799
- # Auto choose parameters
800
1330
  if self.speculative_num_steps is None:
801
1331
  assert (
802
1332
  self.speculative_eagle_topk is None
@@ -836,11 +1366,43 @@ class ServerArgs:
836
1366
  "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
837
1367
  )
838
1368
 
839
- # The token generated from the verify step is counted.
840
- # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
841
- # assert self.speculative_num_steps < self.speculative_num_draft_tokens
1369
+ if self.speculative_algorithm == "NGRAM":
1370
+ if not self.device.startswith("cuda"):
1371
+ raise ValueError(
1372
+ "Ngram speculative decoding only supports CUDA device."
1373
+ )
1374
+ if self.max_running_requests is None:
1375
+ self.max_running_requests = 48
1376
+ self.disable_overlap_schedule = True
1377
+ self.enable_mixed_chunk = False
1378
+ self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
1379
+ if self.speculative_num_draft_tokens is None:
1380
+ self.speculative_num_draft_tokens = (
1381
+ self.speculative_ngram_max_match_window_size
1382
+ )
1383
+ logger.warning(
1384
+ "The overlap scheduler and mixed chunked prefill are disabled because of "
1385
+ "using ngram speculative decoding."
1386
+ )
842
1387
 
843
- # GGUF
1388
+ if (
1389
+ self.speculative_eagle_topk > 1
1390
+ and self.page_size > 1
1391
+ and self.attention_backend != "flashinfer"
1392
+ ):
1393
+ raise ValueError(
1394
+ f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
1395
+ f"with page_size({self.page_size}) > 1 is unstable "
1396
+ "and produces incorrect results for paged attention backends. "
1397
+ "This combination is only supported for the 'flashinfer' backend."
1398
+ )
1399
+ if self.enable_dp_attention:
1400
+ # TODO: support dp attention for ngram speculative decoding
1401
+ raise ValueError(
1402
+ "Currently ngram speculative decoding does not support dp attention."
1403
+ )
1404
+
1405
+ def _handle_load_format(self):
844
1406
  if (
845
1407
  self.load_format == "auto" or self.load_format == "gguf"
846
1408
  ) and check_gguf_file(self.model_path):
@@ -848,6 +1410,7 @@ class ServerArgs:
848
1410
 
849
1411
  if is_remote_url(self.model_path):
850
1412
  self.load_format = "remote"
1413
+
851
1414
  if self.custom_weight_loader is None:
852
1415
  self.custom_weight_loader = []
853
1416
 
@@ -859,7 +1422,7 @@ class ServerArgs:
859
1422
  ):
860
1423
  self.load_format = "auto"
861
1424
 
862
- # PD disaggregation
1425
+ def _handle_disaggregation(self):
863
1426
  if self.disaggregation_mode == "decode":
864
1427
  assert (
865
1428
  self.disaggregation_decode_tp is None
@@ -885,44 +1448,121 @@ class ServerArgs:
885
1448
 
886
1449
  self.disaggregation_prefill_pp = self.pp_size
887
1450
  self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
888
-
889
1451
  self.disable_cuda_graph = True
890
1452
  logger.warning("Cuda graph is disabled for prefill server")
891
1453
 
892
- # Validation: prevent both tokenizer batching features from being enabled
1454
+ def _handle_tokenizer_batching(self):
893
1455
  if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
894
1456
  raise ValueError(
895
1457
  "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
896
1458
  "Please choose one tokenizer batching approach."
897
1459
  )
898
1460
 
899
- # Propagate env vars
1461
+ if self.skip_tokenizer_init:
1462
+ if self.tokenizer_worker_num != 1:
1463
+ logger.warning(
1464
+ "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
1465
+ f"(requested {self.tokenizer_worker_num})."
1466
+ )
1467
+ self.tokenizer_worker_num = 1
1468
+
1469
+ if self.enable_tokenizer_batch_encode:
1470
+ logger.warning(
1471
+ "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
1472
+ )
1473
+ self.enable_tokenizer_batch_encode = False
1474
+
1475
+ if self.enable_dynamic_batch_tokenizer:
1476
+ logger.warning(
1477
+ "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
1478
+ )
1479
+ self.enable_dynamic_batch_tokenizer = False
1480
+
1481
+ def _handle_environment_variables(self):
900
1482
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
901
1483
  "1" if self.enable_torch_compile else "0"
902
1484
  )
903
1485
  os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
904
-
905
- # Set env var before grammar backends init
906
1486
  os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
907
1487
  "1" if self.disable_outlines_disk_cache else "0"
908
1488
  )
1489
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
1490
+ "1" if self.enable_deterministic_inference else "0"
1491
+ )
909
1492
 
1493
+ def _handle_cache_compatibility(self):
910
1494
  if self.enable_hierarchical_cache and self.disable_radix_cache:
911
1495
  raise ValueError(
912
1496
  "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
913
1497
  "and cannot be used at the same time. Please use only one of them."
914
1498
  )
915
1499
 
1500
+ if (
1501
+ self.disaggregation_decode_enable_offload_kvcache
1502
+ and self.disaggregation_mode != "decode"
1503
+ ):
1504
+ raise ValueError(
1505
+ "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
1506
+ )
1507
+
1508
+ def _handle_metrics_labels(self):
916
1509
  if (
917
1510
  not self.tokenizer_metrics_custom_labels_header
918
- and self.tokenizer_metrics_allowed_customer_labels
1511
+ and self.tokenizer_metrics_allowed_custom_labels
919
1512
  ):
920
1513
  raise ValueError(
921
- "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
1514
+ "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
922
1515
  )
923
1516
 
1517
+ def _handle_deterministic_inference(self):
1518
+ if self.enable_deterministic_inference:
1519
+ # Check sampling backend
1520
+ self.sampling_backend = "pytorch"
1521
+ logger.warning(
1522
+ "Sampling backend is set to pytorch for deterministic inference."
1523
+ )
1524
+
1525
+ # Check attention backend
1526
+ if self.attention_backend is None:
1527
+ # User didn't specify attention backend, fallback based on GPU architecture
1528
+ if is_sm100_supported() or is_sm120_supported():
1529
+ # Blackwell and newer architectures
1530
+ self.attention_backend = "flashinfer"
1531
+ else:
1532
+ # Hopper (SM90) and older architectures
1533
+ self.attention_backend = "fa3"
1534
+ logger.warning(
1535
+ f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
1536
+ f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
1537
+ )
1538
+ elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1539
+ # User explicitly specified an incompatible attention backend
1540
+ raise ValueError(
1541
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
1542
+ f"but you explicitly specified '{self.attention_backend}'."
1543
+ )
1544
+
1545
+ # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1546
+ if self.attention_backend not in ["fa3", "triton"]:
1547
+ self.disable_radix_cache = True
1548
+ logger.warning(
1549
+ f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
1550
+ )
1551
+
1552
+ # Check TP size
1553
+ if self.tp_size > 1:
1554
+ os.environ["NCCL_ALGO"] = "allreduce:tree"
1555
+ self.disable_custom_all_reduce = True
1556
+ logger.warning(
1557
+ "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
1558
+ )
1559
+
1560
+ def _handle_other_validations(self):
1561
+ pass
1562
+
924
1563
  @staticmethod
925
1564
  def add_cli_args(parser: argparse.ArgumentParser):
1565
+
926
1566
  # Model and tokenizer
927
1567
  parser.add_argument(
928
1568
  "--model-path",
@@ -931,24 +1571,6 @@ class ServerArgs:
931
1571
  help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
932
1572
  required=True,
933
1573
  )
934
- parser.add_argument(
935
- "--remote-instance-weight-loader-seed-instance-ip",
936
- type=str,
937
- default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
938
- help="The ip of the seed instance for loading weights from remote instance.",
939
- )
940
- parser.add_argument(
941
- "--remote-instance-weight-loader-seed-instance-service-port",
942
- type=int,
943
- default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
944
- help="The service port of the seed instance for loading weights from remote instance.",
945
- )
946
- parser.add_argument(
947
- "--remote-instance-weight-loader-send-weights-group-ports",
948
- type=json_list_type,
949
- default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
950
- help="The communication group ports for loading weights from remote instance.",
951
- )
952
1574
  parser.add_argument(
953
1575
  "--tokenizer-path",
954
1576
  type=str,
@@ -1060,6 +1682,11 @@ class ServerArgs:
1060
1682
  default=ServerArgs.port,
1061
1683
  help="The port of the HTTP server.",
1062
1684
  )
1685
+ parser.add_argument(
1686
+ "--grpc-mode",
1687
+ action="store_true",
1688
+ help="If set, use gRPC server instead of HTTP server.",
1689
+ )
1063
1690
  parser.add_argument(
1064
1691
  "--skip-server-warmup",
1065
1692
  action="store_true",
@@ -1078,6 +1705,12 @@ class ServerArgs:
1078
1705
  default=ServerArgs.nccl_port,
1079
1706
  help="The port for NCCL distributed environment setup. Defaults to a random port.",
1080
1707
  )
1708
+ parser.add_argument(
1709
+ "--checkpoint-engine-wait-weights-before-ready",
1710
+ action="store_true",
1711
+ help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
1712
+ "before serving inference requests.",
1713
+ )
1081
1714
 
1082
1715
  # Quantization and data type
1083
1716
  parser.add_argument(
@@ -1110,12 +1743,56 @@ class ServerArgs:
1110
1743
  "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
1111
1744
  "default to 1.0, which may cause accuracy issues. ",
1112
1745
  )
1746
+ parser.add_argument(
1747
+ "--modelopt-quant",
1748
+ type=str,
1749
+ default=ServerArgs.modelopt_quant,
1750
+ help="The ModelOpt quantization configuration. "
1751
+ "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
1752
+ "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
1753
+ )
1754
+ parser.add_argument(
1755
+ "--modelopt-checkpoint-restore-path",
1756
+ type=str,
1757
+ default=ServerArgs.modelopt_checkpoint_restore_path,
1758
+ help="Path to restore a previously saved ModelOpt quantized checkpoint. "
1759
+ "If provided, the quantization process will be skipped and the model "
1760
+ "will be loaded from this checkpoint.",
1761
+ )
1762
+ parser.add_argument(
1763
+ "--modelopt-checkpoint-save-path",
1764
+ type=str,
1765
+ default=ServerArgs.modelopt_checkpoint_save_path,
1766
+ help="Path to save the ModelOpt quantized checkpoint after quantization. "
1767
+ "This allows reusing the quantized model in future runs.",
1768
+ )
1769
+ parser.add_argument(
1770
+ "--modelopt-export-path",
1771
+ type=str,
1772
+ default=ServerArgs.modelopt_export_path,
1773
+ help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
1774
+ "The exported model can then be used directly with SGLang for inference. "
1775
+ "If not provided, the model will not be exported.",
1776
+ )
1777
+ parser.add_argument(
1778
+ "--quantize-and-serve",
1779
+ action="store_true",
1780
+ default=ServerArgs.quantize_and_serve,
1781
+ help="Quantize the model with ModelOpt and immediately serve it without exporting. "
1782
+ "This is useful for development and prototyping. For production, it's recommended "
1783
+ "to use separate quantization and deployment steps.",
1784
+ )
1113
1785
  parser.add_argument(
1114
1786
  "--kv-cache-dtype",
1115
1787
  type=str,
1116
1788
  default=ServerArgs.kv_cache_dtype,
1117
- choices=["auto", "fp8_e5m2", "fp8_e4m3"],
1118
- help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
1789
+ choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1790
+ help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1791
+ )
1792
+ parser.add_argument(
1793
+ "--enable-fp32-lm-head",
1794
+ action="store_true",
1795
+ help="If set, the LM head outputs (logits) are in FP32.",
1119
1796
  )
1120
1797
 
1121
1798
  # Memory and scheduling
@@ -1163,6 +1840,30 @@ class ServerArgs:
1163
1840
  choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
1164
1841
  help="The scheduling policy of the requests.",
1165
1842
  )
1843
+ parser.add_argument(
1844
+ "--enable-priority-scheduling",
1845
+ action="store_true",
1846
+ default=ServerArgs.enable_priority_scheduling,
1847
+ help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1848
+ )
1849
+ parser.add_argument(
1850
+ "--abort-on-priority-when-disabled",
1851
+ action="store_true",
1852
+ default=ServerArgs.abort_on_priority_when_disabled,
1853
+ help="If set, abort requests that specify a priority when priority scheduling is disabled.",
1854
+ )
1855
+ parser.add_argument(
1856
+ "--schedule-low-priority-values-first",
1857
+ action="store_true",
1858
+ default=ServerArgs.schedule_low_priority_values_first,
1859
+ help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
1860
+ )
1861
+ parser.add_argument(
1862
+ "--priority-scheduling-preemption-threshold",
1863
+ type=int,
1864
+ default=ServerArgs.priority_scheduling_preemption_threshold,
1865
+ help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
1866
+ )
1166
1867
  parser.add_argument(
1167
1868
  "--schedule-conservativeness",
1168
1869
  type=float,
@@ -1207,6 +1908,21 @@ class ServerArgs:
1207
1908
  default=ServerArgs.device,
1208
1909
  help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
1209
1910
  )
1911
+ parser.add_argument(
1912
+ "--elastic-ep-backend",
1913
+ type=str,
1914
+ default=ServerArgs.elastic_ep_backend,
1915
+ choices=["none", "mooncake"],
1916
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
1917
+ )
1918
+ parser.add_argument(
1919
+ "--mooncake-ib-device",
1920
+ type=str,
1921
+ default=ServerArgs.mooncake_ib_device,
1922
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
1923
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
1924
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
1925
+ )
1210
1926
  parser.add_argument(
1211
1927
  "--tensor-parallel-size",
1212
1928
  "--tp-size",
@@ -1222,9 +1938,9 @@ class ServerArgs:
1222
1938
  help="The pipeline parallelism size.",
1223
1939
  )
1224
1940
  parser.add_argument(
1225
- "--max-micro-batch-size",
1941
+ "--pp-max-micro-batch-size",
1226
1942
  type=int,
1227
- default=ServerArgs.max_micro_batch_size,
1943
+ default=ServerArgs.pp_max_micro_batch_size,
1228
1944
  help="The maximum micro batch size in pipeline parallelism.",
1229
1945
  )
1230
1946
  parser.add_argument(
@@ -1248,7 +1964,12 @@ class ServerArgs:
1248
1964
  "--constrained-json-whitespace-pattern",
1249
1965
  type=str,
1250
1966
  default=ServerArgs.constrained_json_whitespace_pattern,
1251
- help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
1967
+ help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model to generate consecutive whitespaces, set the pattern to [\n\t ]*",
1968
+ )
1969
+ parser.add_argument(
1970
+ "--constrained-json-disable-any-whitespace",
1971
+ action="store_true",
1972
+ help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
1252
1973
  )
1253
1974
  parser.add_argument(
1254
1975
  "--watchdog-timeout",
@@ -1338,16 +2059,16 @@ class ServerArgs:
1338
2059
  "--tokenizer-metrics-custom-labels-header",
1339
2060
  type=str,
1340
2061
  default=ServerArgs.tokenizer_metrics_custom_labels_header,
1341
- help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
2062
+ help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
1342
2063
  )
1343
2064
  parser.add_argument(
1344
- "--tokenizer-metrics-allowed-customer-labels",
2065
+ "--tokenizer-metrics-allowed-custom-labels",
1345
2066
  type=str,
1346
2067
  nargs="+",
1347
- default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
1348
- help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
2068
+ default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
2069
+ help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
1349
2070
  "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
1350
- "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
2071
+ "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
1351
2072
  )
1352
2073
  parser.add_argument(
1353
2074
  "--bucket-time-to-first-token",
@@ -1379,8 +2100,8 @@ class ServerArgs:
1379
2100
  bucket_rule = (
1380
2101
  "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1381
2102
  "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1382
- "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
1383
- "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
2103
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]); 'custom <value1> "
2104
+ "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
1384
2105
  )
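To make the 'tse' rule concrete, here is one plausible generator that reproduces the documented example. This is a hedged sketch: the actual bucket-generation code is not shown in this diff, and the name two_sided_exponential_buckets is made up.

    def two_sided_exponential_buckets(middle: float, base: float, count: int) -> list:
        """Place `count` buckets at exponentially growing offsets on both sides of `middle`."""
        offsets = [base ** (i + 1) for i in range(count // 2)]
        buckets = {middle} | {middle - o for o in offsets} | {middle + o for o in offsets}
        return sorted(buckets)

    print(two_sided_exponential_buckets(1000.0, 2.0, 8))
    # [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]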
1385
2106
  parser.add_argument(
1386
2107
  "--prompt-tokens-buckets",
@@ -1489,6 +2210,16 @@ class ServerArgs:
1489
2210
  default=ServerArgs.tool_call_parser,
1490
2211
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
1491
2212
  )
2213
+ parser.add_argument(
2214
+ "--sampling-defaults",
2215
+ type=str,
2216
+ choices=["openai", "model"],
2217
+ default=ServerArgs.sampling_defaults,
2218
+ help="Where to get default sampling parameters. "
2219
+ "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
2220
+ "'model' uses the model's generation_config.json to get the recommended "
2221
+ "sampling parameters if available. Default is 'model'.",
2222
+ )
1492
2223
  parser.add_argument(
1493
2224
  "--tool-server",
1494
2225
  type=str,
@@ -1598,12 +2329,27 @@ class ServerArgs:
1598
2329
  default=ServerArgs.max_loaded_loras,
1599
2330
  help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
1600
2331
  )
2332
+ parser.add_argument(
2333
+ "--lora-eviction-policy",
2334
+ type=str,
2335
+ default=DEFAULT_LORA_EVICTION_POLICY,
2336
+ choices=["lru", "fifo"],
2337
+ help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
2338
+ )
1601
2339
  parser.add_argument(
1602
2340
  "--lora-backend",
1603
2341
  type=str,
1604
- default="triton",
2342
+ choices=LORA_BACKEND_CHOICES,
2343
+ default=ServerArgs.lora_backend,
1605
2344
  help="Choose the kernel backend for multi-LoRA serving.",
1606
2345
  )
2346
+ parser.add_argument(
2347
+ "--max-lora-chunk-size",
2348
+ type=int,
2349
+ default=ServerArgs.max_lora_chunk_size,
2350
+ choices=[16, 32, 64, 128],
2351
+ help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
2352
+ )
1607
2353
 
1608
2354
  # Kernel backend
1609
2355
  parser.add_argument(
@@ -1644,16 +2390,29 @@ class ServerArgs:
1644
2390
  parser.add_argument(
1645
2391
  "--mm-attention-backend",
1646
2392
  type=str,
1647
- choices=["sdpa", "fa3", "triton_attn"],
2393
+ choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
1648
2394
  default=ServerArgs.mm_attention_backend,
1649
2395
  help="Set multimodal attention backend.",
1650
2396
  )
2397
+ parser.add_argument(
2398
+ "--nsa-prefill-backend",
2399
+ default=ServerArgs.nsa_prefill_backend,
2400
+ type=str,
2401
+ choices=NSA_CHOICES,
2402
+ )
2403
+ parser.add_argument(
2404
+ "--nsa-decode-backend",
2405
+ default=ServerArgs.nsa_decode_backend,
2406
+ type=str,
2407
+ choices=NSA_CHOICES,
2408
+ )
1651
2409
 
1652
2410
  # Speculative decoding
2411
+ parser.add_argument("--enable-beta-spec", action="store_true")
1653
2412
  parser.add_argument(
1654
2413
  "--speculative-algorithm",
1655
2414
  type=str,
1656
- choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
2415
+ choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
1657
2416
  help="Speculative algorithm.",
1658
2417
  )
1659
2418
  parser.add_argument(
@@ -1670,6 +2429,15 @@ class ServerArgs:
1670
2429
  "name, a tag name, or a commit id. If unspecified, will use "
1671
2430
  "the default version.",
1672
2431
  )
2432
+ parser.add_argument(
2433
+ "--speculative-draft-load-format",
2434
+ type=str,
2435
+ default=ServerArgs.speculative_draft_load_format,
2436
+ choices=LOAD_FORMAT_CHOICES,
2437
+ help="The format of the draft model weights to load. "
2438
+ "If not specified, will use the same format as --load-format. "
2439
+ "Use 'dummy' to initialize draft model weights with random values for profiling.",
2440
+ )
1673
2441
  parser.add_argument(
1674
2442
  "--speculative-num-steps",
1675
2443
  type=int,
@@ -1713,6 +2481,50 @@ class ServerArgs:
1713
2481
  help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
1714
2482
  default=ServerArgs.speculative_attention_mode,
1715
2483
  )
2484
+ # Ngram speculative decoding
2485
+ parser.add_argument(
2486
+ "--speculative-ngram-min-match-window-size",
2487
+ type=int,
2488
+ default=ServerArgs.speculative_ngram_min_match_window_size,
2489
+ help="The minimum window size for pattern matching in ngram speculative decoding.",
2490
+ )
2491
+ parser.add_argument(
2492
+ "--speculative-ngram-max-match-window-size",
2493
+ type=int,
2494
+ default=ServerArgs.speculative_ngram_max_match_window_size,
2495
+ help="The maximum window size for pattern matching in ngram speculative decoding.",
2496
+ )
2497
+ parser.add_argument(
2498
+ "--speculative-ngram-min-bfs-breadth",
2499
+ type=int,
2500
+ default=ServerArgs.speculative_ngram_min_bfs_breadth,
2501
+ help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2502
+ )
2503
+ parser.add_argument(
2504
+ "--speculative-ngram-max-bfs-breadth",
2505
+ type=int,
2506
+ default=ServerArgs.speculative_ngram_max_bfs_breadth,
2507
+ help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2508
+ )
2509
+ parser.add_argument(
2510
+ "--speculative-ngram-match-type",
2511
+ type=str,
2512
+ choices=["BFS", "PROB"],
2513
+ default=ServerArgs.speculative_ngram_match_type,
2514
+ help="The match type for cache tree.",
2515
+ )
2516
+ parser.add_argument(
2517
+ "--speculative-ngram-branch-length",
2518
+ type=int,
2519
+ default=ServerArgs.speculative_ngram_branch_length,
2520
+ help="The branch length for ngram speculative decoding.",
2521
+ )
2522
+ parser.add_argument(
2523
+ "--speculative-ngram-capacity",
2524
+ type=int,
2525
+ default=ServerArgs.speculative_ngram_capacity,
2526
+ help="The cache capacity for ngram speculative decoding.",
2527
+ )
1716
2528
 
1717
2529
  # Expert parallelism
1718
2530
  parser.add_argument(
@@ -1726,22 +2538,14 @@ class ServerArgs:
1726
2538
  parser.add_argument(
1727
2539
  "--moe-a2a-backend",
1728
2540
  type=str,
1729
- choices=["none", "deepep"],
2541
+ choices=["none", "deepep", "mooncake"],
1730
2542
  default=ServerArgs.moe_a2a_backend,
1731
2543
  help="Choose the backend for MoE A2A.",
1732
2544
  )
1733
2545
  parser.add_argument(
1734
2546
  "--moe-runner-backend",
1735
2547
  type=str,
1736
- choices=[
1737
- "auto",
1738
- "triton",
1739
- "triton_kernel",
1740
- "flashinfer_trtllm",
1741
- "flashinfer_cutlass",
1742
- "flashinfer_mxfp4",
1743
- "flashinfer_cutedsl",
1744
- ],
2548
+ choices=MOE_RUNNER_BACKEND_CHOICES,
1745
2549
  default=ServerArgs.moe_runner_backend,
1746
2550
  help="Choose the runner backend for MoE.",
1747
2551
  )
@@ -1855,6 +2659,12 @@ class ServerArgs:
1855
2659
  choices=["float32", "bfloat16"],
1856
2660
  help="The data type of the SSM states in mamba cache.",
1857
2661
  )
2662
+ parser.add_argument(
2663
+ "--mamba-full-memory-ratio",
2664
+ type=float,
2665
+ default=ServerArgs.mamba_full_memory_ratio,
2666
+ help="The ratio of mamba state memory to full kv cache memory.",
2667
+ )
1858
2668
 
1859
2669
  # Hierarchical cache
1860
2670
  parser.add_argument(
@@ -1881,6 +2691,13 @@ class ServerArgs:
1881
2691
  default=ServerArgs.hicache_write_policy,
1882
2692
  help="The write policy of hierarchical cache.",
1883
2693
  )
2694
+ parser.add_argument(
2695
+ "--radix-eviction-policy",
2696
+ type=str,
2697
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2698
+ default=ServerArgs.radix_eviction_policy,
2699
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2700
+ )
1884
2701
  parser.add_argument(
1885
2702
  "--hicache-io-backend",
1886
2703
  type=str,
@@ -1898,9 +2715,12 @@ class ServerArgs:
1898
2715
  parser.add_argument(
1899
2716
  "--hicache-storage-backend",
1900
2717
  type=str,
1901
- choices=["file", "mooncake", "hf3fs", "nixl"],
2718
+ choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
1902
2719
  default=ServerArgs.hicache_storage_backend,
1903
- help="The storage backend for hierarchical KV cache.",
2720
+ help="The storage backend for hierarchical KV cache. "
2721
+ "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
2722
+ "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
2723
+ "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
1904
2724
  )
1905
2725
  parser.add_argument(
1906
2726
  "--hicache-storage-prefetch-policy",
@@ -1922,6 +2742,35 @@ class ServerArgs:
1922
2742
  help="Using LMCache as an alternative hierarchical cache solution",
1923
2743
  )
1924
2744
 
2745
+ # Ktransformer server args
2746
+ parser.add_argument(
2747
+ "--kt-amx-weight-path",
2748
+ type=str,
2749
+ help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
2750
+ )
2751
+ parser.add_argument(
2752
+ "--kt-amx-method",
2753
+ type=str,
2754
+ default="AMXINT4",
2755
+ help="[ktransformers parameter] Quantization formats for CPU execution.",
2756
+ )
2757
+ parser.add_argument(
2758
+ "--kt-cpuinfer",
2759
+ type=int,
2760
+ help="[ktransformers parameter] The number of CPUInfer threads.",
2761
+ )
2762
+ parser.add_argument(
2763
+ "--kt-threadpool-count",
2764
+ type=int,
2765
+ default=2,
2766
+ help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA node).",
2767
+ )
2768
+ parser.add_argument(
2769
+ "--kt-num-gpu-experts",
2770
+ type=int,
2771
+ help="[ktransformers parameter] The number of GPU experts.",
2772
+ )
2773
+
1925
2774
  # Double Sparsity
1926
2775
  parser.add_argument(
1927
2776
  "--enable-double-sparsity",
@@ -1991,6 +2840,14 @@ class ServerArgs:
1991
2840
  help="Mode of offloading.",
1992
2841
  )
1993
2842
 
2843
+ # Args for multi-item-scoring
2844
+ parser.add_argument(
2845
+ "--multi-item-scoring-delimiter",
2846
+ type=int,
2847
+ default=ServerArgs.multi_item_scoring_delimiter,
2848
+ help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
2849
+ )
2850
+
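A small illustration of the sequence layout described in the help string above (purely illustrative: build_multi_item_sequence is a hypothetical helper and the token IDs are made up).

    def build_multi_item_sequence(query: list, items: list, delimiter: int) -> list:
        """Lay out Query<delimiter>Item1<delimiter>Item2... as one token sequence."""
        seq = list(query)
        for item in items:
            seq.append(delimiter)
            seq.extend(item)
        return seq

    query_ids = [101, 102, 103]            # made-up query token IDs
    item_ids = [[201, 202], [301, 302]]    # two candidate items
    print(build_multi_item_sequence(query_ids, item_ids, delimiter=7))
    # [101, 102, 103, 7, 201, 202, 7, 301, 302]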
1994
2851
  # Optimization/debug options
1995
2852
  parser.add_argument(
1996
2853
  "--disable-radix-cache",
@@ -2049,6 +2906,11 @@ class ServerArgs:
2049
2906
  action="store_true",
2050
2907
  help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
2051
2908
  )
2909
+ parser.add_argument(
2910
+ "--disable-tokenizer-batch-decode",
2911
+ action="store_true",
2912
+ help="Disable batch decoding when decoding multiple completions.",
2913
+ )
2052
2914
  parser.add_argument(
2053
2915
  "--disable-outlines-disk-cache",
2054
2916
  action="store_true",
@@ -2064,6 +2926,11 @@ class ServerArgs:
2064
2926
  action="store_true",
2065
2927
  help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
2066
2928
  )
2929
+ parser.add_argument(
2930
+ "--enable-torch-symm-mem",
2931
+ action="store_true",
2932
+ help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA devices SM90 and above. SM90 supports world size 4, 6, 8. SM100 supports world size 6, 8.",
2933
+ )
2067
2934
  parser.add_argument(
2068
2935
  "--disable-overlap-schedule",
2069
2936
  action="store_true",
@@ -2089,6 +2956,11 @@ class ServerArgs:
2089
2956
  action="store_true",
2090
2957
  help="Enabling two micro batches to overlap.",
2091
2958
  )
2959
+ parser.add_argument(
2960
+ "--enable-single-batch-overlap",
2961
+ action="store_true",
2962
+ help="Let computation and communication overlap within one micro batch.",
2963
+ )
2092
2964
  parser.add_argument(
2093
2965
  "--tbo-token-distribution-threshold",
2094
2966
  type=float,
@@ -2100,12 +2972,36 @@ class ServerArgs:
2100
2972
  action="store_true",
2101
2973
  help="Optimize the model with torch.compile. Experimental feature.",
2102
2974
  )
2975
+ parser.add_argument(
2976
+ "--enable-piecewise-cuda-graph",
2977
+ action="store_true",
2978
+ help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
2979
+ )
2980
+ parser.add_argument(
2981
+ "--piecewise-cuda-graph-tokens",
2982
+ type=json_list_type,
2983
+ default=ServerArgs.piecewise_cuda_graph_tokens,
2984
+ help="Set the list of tokens when using piecewise cuda graph.",
2985
+ )
2986
+ parser.add_argument(
2987
+ "--piecewise-cuda-graph-compiler",
2988
+ type=str,
2989
+ default=ServerArgs.piecewise_cuda_graph_compiler,
2990
+ help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
2991
+ choices=["eager", "inductor"],
2992
+ )
2103
2993
  parser.add_argument(
2104
2994
  "--torch-compile-max-bs",
2105
2995
  type=int,
2106
2996
  default=ServerArgs.torch_compile_max_bs,
2107
2997
  help="Set the maximum batch size when using torch compile.",
2108
2998
  )
2999
+ parser.add_argument(
3000
+ "--piecewise-cuda-graph-max-tokens",
3001
+ type=int,
3002
+ default=ServerArgs.piecewise_cuda_graph_max_tokens,
3003
+ help="Set the maximum tokens when using piecewise cuda graph.",
3004
+ )
2109
3005
  parser.add_argument(
2110
3006
  "--torchao-config",
2111
3007
  type=str,
@@ -2158,6 +3054,11 @@ class ServerArgs:
2158
3054
  action="store_true",
2159
3055
  help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
2160
3056
  )
3057
+ parser.add_argument(
3058
+ "--enable-weights-cpu-backup",
3059
+ action="store_true",
3060
+ help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
3061
+ )
2161
3062
  parser.add_argument(
2162
3063
  "--allow-auto-truncate",
2163
3064
  action="store_true",
@@ -2188,6 +3089,11 @@ class ServerArgs:
2188
3089
  action="store_true",
2189
3090
  help="Adopt base image processor instead of fast image processor.",
2190
3091
  )
3092
+ parser.add_argument(
3093
+ "--keep-mm-feature-on-device",
3094
+ action="store_true",
3095
+ help="Keep multimodal feature tensors on device after processing to save D2H copy.",
3096
+ )
2191
3097
  parser.add_argument(
2192
3098
  "--enable-return-hidden-states",
2193
3099
  action="store_true",
@@ -2225,11 +3131,6 @@ class ServerArgs:
2225
3131
  default=ServerArgs.debug_tensor_dump_inject,
2226
3132
  help="Inject the outputs from jax as the input of every layer.",
2227
3133
  )
2228
- parser.add_argument(
2229
- "--debug-tensor-dump-prefill-only",
2230
- action="store_true",
2231
- help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
2232
- )
2233
3134
  parser.add_argument(
2234
3135
  "--enable-dynamic-batch-tokenizer",
2235
3136
  action="store_true",
@@ -2295,6 +3196,11 @@ class ServerArgs:
2295
3196
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
2296
3197
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
2297
3198
  )
3199
+ parser.add_argument(
3200
+ "--disaggregation-decode-enable-offload-kvcache",
3201
+ action="store_true",
3202
+ help="Enable async KV cache offloading on decode server (PD mode).",
3203
+ )
2298
3204
  parser.add_argument(
2299
3205
  "--num-reserved-decode-tokens",
2300
3206
  type=int,
@@ -2321,6 +3227,24 @@ class ServerArgs:
2321
3227
  action="store_true",
2322
3228
  help="Disable mmap while loading weight using safetensors.",
2323
3229
  )
3230
+ parser.add_argument(
3231
+ "--remote-instance-weight-loader-seed-instance-ip",
3232
+ type=str,
3233
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
3234
+ help="The ip of the seed instance for loading weights from remote instance.",
3235
+ )
3236
+ parser.add_argument(
3237
+ "--remote-instance-weight-loader-seed-instance-service-port",
3238
+ type=int,
3239
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
3240
+ help="The service port of the seed instance for loading weights from remote instance.",
3241
+ )
3242
+ parser.add_argument(
3243
+ "--remote-instance-weight-loader-send-weights-group-ports",
3244
+ type=json_list_type,
3245
+ default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
3246
+ help="The communication group ports for loading weights from remote instance.",
3247
+ )
2324
3248
 
2325
3249
  # For PD-Multiplexing
2326
3250
  parser.add_argument(
@@ -2328,6 +3252,12 @@ class ServerArgs:
2328
3252
  action="store_true",
2329
3253
  help="Enable PD-Multiplexing, PD running on greenctx stream.",
2330
3254
  )
3255
+ parser.add_argument(
3256
+ "--pdmux-config-path",
3257
+ type=str,
3258
+ default=None,
3259
+ help="The path of the PD-Multiplexing config file.",
3260
+ )
2331
3261
 
2332
3262
  parser.add_argument(
2333
3263
  "--sm-group-num",
@@ -2336,41 +3266,55 @@ class ServerArgs:
2336
3266
  help="Number of sm partition groups.",
2337
3267
  )
2338
3268
 
3269
+ # For deterministic inference
3270
+ parser.add_argument(
3271
+ "--enable-deterministic-inference",
3272
+ action="store_true",
3273
+ help="Enable deterministic inference mode with batch invariant ops.",
3274
+ )
3275
+
2339
3276
  # Deprecated arguments
2340
3277
  parser.add_argument(
2341
3278
  "--enable-ep-moe",
2342
- action="store_true",
2343
- help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
3279
+ action=DeprecatedAction,
3280
+ help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2344
3281
  )
2345
3282
  parser.add_argument(
2346
3283
  "--enable-deepep-moe",
2347
- action="store_true",
2348
- help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
3284
+ action=DeprecatedAction,
3285
+ help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2349
3286
  )
2350
3287
  parser.add_argument(
2351
3288
  "--enable-flashinfer-cutlass-moe",
2352
- action="store_true",
2353
- help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
3289
+ action=DeprecatedAction,
3290
+ help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2354
3291
  )
2355
3292
  parser.add_argument(
2356
3293
  "--enable-flashinfer-cutedsl-moe",
2357
- action="store_true",
2358
- help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
3294
+ action=DeprecatedAction,
3295
+ help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2359
3296
  )
2360
3297
  parser.add_argument(
2361
3298
  "--enable-flashinfer-trtllm-moe",
2362
- action="store_true",
2363
- help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
3299
+ action=DeprecatedAction,
3300
+ help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2364
3301
  )
2365
3302
  parser.add_argument(
2366
3303
  "--enable-triton-kernel-moe",
2367
- action="store_true",
2368
- help="(Deprecated) Use triton moe grouped gemm kernel.",
3304
+ action=DeprecatedAction,
3305
+ help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2369
3306
  )
2370
3307
  parser.add_argument(
2371
3308
  "--enable-flashinfer-mxfp4-moe",
2372
- action="store_true",
2373
- help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
3309
+ action=DeprecatedAction,
3310
+ help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
3311
+ )
3312
+
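For readers migrating launch scripts, the replacements named in the deprecation messages above can be summarized as follows; the sizes in the sample argv are placeholders.

```python
# Old flag (now rejected via DeprecatedAction)  ->  replacement per the help text above
# --enable-ep-moe                               ->  --ep-size set equal to --tp-size
# --enable-deepep-moe                           ->  --moe-a2a-backend deepep
# --enable-flashinfer-cutlass-moe               ->  --moe-runner-backend flashinfer_cutlass
# --enable-flashinfer-cutedsl-moe               ->  --moe-runner-backend flashinfer_cutedsl
# --enable-flashinfer-trtllm-moe                ->  --moe-runner-backend flashinfer_trtllm
# --enable-triton-kernel-moe                    ->  --moe-runner-backend triton_kernel
# --enable-flashinfer-mxfp4-moe                 ->  --moe-runner-backend flashinfer_mxfp4
new_style_argv = [
    "--tp-size", "8", "--ep-size", "8",         # placeholder sizes; ep == tp replaces --enable-ep-moe
    "--moe-a2a-backend", "deepep",
    "--moe-runner-backend", "flashinfer_cutlass",
]
```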
3313
+ # Configuration file support
3314
+ parser.add_argument(
3315
+ "--config",
3316
+ type=str,
3317
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2374
3318
  )
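A minimal sketch of how the new `--config` option might be used. The file name is hypothetical, and the YAML key spelling (CLI option names) is an assumption; only "YAML file with configuration options" is stated by the help text.

```python
# config.yaml (hypothetical contents):
#   model-path: meta-llama/Llama-3.1-8B-Instruct
#   tp-size: 2
#   enable-torch-compile: true   # boolean store_true flags are treated specially by the config merger
#
# CLI flags can still be passed alongside the file; prepare_server_args merges the two.
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args(["--config", "config.yaml", "--port", "30000"])
```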
2375
3319
 
2376
3320
  @classmethod
@@ -2395,7 +3339,7 @@ class ServerArgs:
2395
3339
  self.model_path,
2396
3340
  trust_remote_code=self.trust_remote_code,
2397
3341
  revision=self.revision,
2398
- model_override_args=json.loads(self.json_model_override_args),
3342
+ model_override_args=orjson.loads(self.json_model_override_args),
2399
3343
  **kwargs,
2400
3344
  )
2401
3345
  return hf_config
@@ -2442,7 +3386,34 @@ class ServerArgs:
2442
3386
  self.chunked_prefill_size % self.page_size == 0
2443
3387
  ), "chunked_prefill_size must be divisible by page_size"
2444
3388
 
2445
- # Check multi tokenizer
3389
+ # Check pdmux
3390
+ if self.enable_pdmux:
3391
+ assert (
3392
+ self.pp_size == 1
3393
+ ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
3394
+ assert (
3395
+ self.chunked_prefill_size == -1
3396
+ ), "PD-Multiplexing is not compatible with chunked prefill."
3397
+ assert (
3398
+ self.disaggregation_mode == "null"
3399
+ ), "PD-Multiplexing is not compatible with disaggregation mode."
3400
+ assert (
3401
+ self.disable_overlap_schedule
3402
+ ), "PD-Multiplexing is not compatible with overlap schedule."
3403
+
3404
+ # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
3405
+ import torch
3406
+
3407
+ parts = torch.__version__.split("+", 1)[0].split(".")
3408
+ major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
3409
+ minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
3410
+ if (major, minor) > (2, 6):
3411
+ logger.warning(
3412
+ "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
3413
+ f" Current torch version is {torch.__version__}.\n"
3414
+ " Please manually install torch 2.6.x."
3415
+ )
3416
+
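A launch sketch that satisfies the PD-Multiplexing checks added above (chunked prefill disabled, overlap schedule disabled). It assumes `pp_size` defaults to 1 and `disaggregation_mode` defaults to "null"; the model path, config path, and group count are placeholders.

```python
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args([
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",   # placeholder
    "--enable-pdmux",
    "--pdmux-config-path", "pdmux.yaml",                  # hypothetical config file
    "--sm-group-num", "4",                                # placeholder SM partition count
    "--disable-overlap-schedule",
    "--chunked-prefill-size", "-1",
    # pp-size and disaggregation-mode are left at their assumed defaults (1 and "null"),
    # which satisfies the remaining asserts in the pdmux check.
])
```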
2446
3417
  assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
2447
3418
  self.validate_buckets_rule(
2448
3419
  "--prompt-tokens-buckets", self.prompt_tokens_buckets
@@ -2451,6 +3422,24 @@ class ServerArgs:
2451
3422
  "--generation-tokens-buckets", self.generation_tokens_buckets
2452
3423
  )
2453
3424
 
3425
+ # Check scheduling policy
3426
+ if self.enable_priority_scheduling:
3427
+ assert self.schedule_policy in [
3428
+ "fcfs",
3429
+ "lof",
3430
+ ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
3431
+
3432
+ # Check multi-item scoring
3433
+ if self.multi_item_scoring_delimiter is not None:
3434
+ assert self.disable_radix_cache, (
3435
+ "Multi-item scoring requires radix cache to be disabled. "
3436
+ "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
3437
+ )
3438
+ assert self.chunked_prefill_size == -1, (
3439
+ "Multi-item scoring requires chunked prefill to be disabled. "
3440
+ "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
3441
+ )
3442
+
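A sketch of an argument combination that passes the new scheduling and multi-item scoring checks. The delimiter token id is illustrative, and the `--enable-priority-scheduling` spelling is inferred from the attribute checked above; `--multi-item-scoring-delimiter`, `--disable-radix-cache`, and `--chunked-prefill-size` come directly from the assertion messages.

```python
from sglang.srt.server_args import prepare_server_args

server_args = prepare_server_args([
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",   # placeholder
    # Priority scheduling requires an fcfs or lof policy.
    "--enable-priority-scheduling", "--schedule-policy", "lof",
    # Multi-item scoring requires radix cache and chunked prefill to be disabled.
    "--multi-item-scoring-delimiter", "128009",           # assumed token id, illustrative only
    "--disable-radix-cache",
    "--chunked-prefill-size", "-1",
])
```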
2454
3443
  def check_lora_server_args(self):
2455
3444
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
2456
3445
 
@@ -2534,6 +3523,12 @@ class ServerArgs:
2534
3523
  f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2535
3524
  )
2536
3525
 
3526
+ if self.max_lora_chunk_size is not None:
3527
+ assert (
3528
+ 16 <= self.max_lora_chunk_size <= 128
3529
+ and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
3530
+ ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
3531
+
2537
3532
  def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2538
3533
  larger_tp = max(decode_tp, prefill_tp)
2539
3534
  smaller_tp = min(decode_tp, prefill_tp)
@@ -2551,8 +3546,8 @@ class ServerArgs:
2551
3546
  assert rule in [
2552
3547
  "tse",
2553
3548
  "default",
2554
- "customer",
2555
- ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
3549
+ "custom",
3550
+ ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2556
3551
 
2557
3552
  if rule == "tse":
2558
3553
  assert (
@@ -2575,95 +3570,20 @@ class ServerArgs:
2575
3570
  len(buckets_rule) == 1
2576
3571
  ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
2577
3572
 
2578
- elif rule == "customer":
3573
+ elif rule == "custom":
2579
3574
  assert (
2580
3575
  len(buckets_rule) >= 2
2581
- ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
3576
+ ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
2582
3577
  try:
2583
3578
  bucket_values = [float(x) for x in buckets_rule[1:]]
2584
3579
  except ValueError:
2585
- assert False, f"{arg_name} customer rule bucket values must be numeric"
3580
+ assert False, f"{arg_name} custom rule bucket values must be numeric"
2586
3581
  assert len(set(bucket_values)) == len(
2587
3582
  bucket_values
2588
- ), f"{arg_name} customer rule bucket values should not contain duplicates"
3583
+ ), f"{arg_name} custom rule bucket values should not contain duplicates"
2589
3584
  assert all(
2590
3585
  val >= 0 for val in bucket_values
2591
- ), f"{arg_name} customer rule bucket values should be non-negative"
2592
-
2593
- def model_specific_adjustments(self):
2594
- hf_config = self.get_hf_config()
2595
- model_arch = hf_config.architectures[0]
2596
- if model_arch in ["GptOssForCausalLM"]:
2597
- if self.attention_backend is None:
2598
- if is_cuda() and is_sm100_supported():
2599
- self.attention_backend = "trtllm_mha"
2600
- elif is_cuda() and is_sm90_supported():
2601
- self.attention_backend = "fa3"
2602
- else:
2603
- self.attention_backend = "triton"
2604
- supported_backends = ["triton", "trtllm_mha", "fa3"]
2605
- logger.info(
2606
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2607
- )
2608
- assert (
2609
- self.attention_backend in supported_backends
2610
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
2611
-
2612
- if is_sm100_supported():
2613
- if not self.enable_dp_attention:
2614
- self.enable_flashinfer_allreduce_fusion = True
2615
- logger.info(
2616
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2617
- )
2618
- quantization_config = getattr(hf_config, "quantization_config", None)
2619
- is_mxfp4_quant_format = (
2620
- quantization_config is not None
2621
- and quantization_config.get("quant_method") == "mxfp4"
2622
- )
2623
-
2624
- if is_sm100_supported() and is_mxfp4_quant_format:
2625
- self.moe_runner_backend = "flashinfer_mxfp4"
2626
- logger.warning(
2627
- "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2628
- )
2629
- else:
2630
- if self.moe_runner_backend == "triton_kernel":
2631
- assert (
2632
- self.ep_size == 1
2633
- ), "Triton kernel MoE is only supported when ep_size == 1"
2634
- if (
2635
- self.moe_runner_backend == "auto"
2636
- and self.ep_size == 1
2637
- and is_triton_kernels_available()
2638
- ):
2639
- self.moe_runner_backend = "triton_kernel"
2640
- logger.warning(
2641
- "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2642
- )
2643
- self.disable_hybrid_swa_memory = True
2644
- if is_mxfp4_quant_format:
2645
- # use bf16 for mxfp4 triton kernels
2646
- self.dtype = "bfloat16"
2647
-
2648
- elif "Llama4" in model_arch:
2649
- assert self.attention_backend in {
2650
- "fa3",
2651
- "aiter",
2652
- "triton",
2653
- }, "fa3, aiter, or triton is required for Llama4 model"
2654
- elif model_arch in [
2655
- "Gemma2ForCausalLM",
2656
- "Gemma3ForCausalLM",
2657
- "Gemma3ForConditionalGeneration",
2658
- "Gemma3nForCausalLM",
2659
- "Gemma3nForConditionalGeneration",
2660
- ]:
2661
- # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2662
- # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2663
- logger.warning(
2664
- f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2665
- )
2666
- self.disable_hybrid_swa_memory = True
3586
+ ), f"{arg_name} custom rule bucket values should be non-negative"
2667
3587
 
2668
3588
  def adjust_mem_fraction_for_vlm(self, model_config):
2669
3589
  vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2704,6 +3624,22 @@ class ServerArgs:
2704
3624
  )
2705
3625
 
2706
3626
 
3627
+ # NOTE: This is a global variable that holds the server args for the scheduler.
3628
+ _global_server_args: Optional[ServerArgs] = None
3629
+
3630
+
3631
+ def set_global_server_args_for_scheduler(server_args: ServerArgs):
3632
+ global _global_server_args
3633
+ _global_server_args = server_args
3634
+
3635
+
3636
+ def get_global_server_args() -> ServerArgs:
3637
+ if _global_server_args is None:
3638
+ raise ValueError("Global server args is not set yet!")
3639
+
3640
+ return _global_server_args
3641
+
3642
+
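A usage sketch for the new module-level accessors, assuming they live in `sglang.srt.server_args` alongside `ServerArgs` and are called from within the scheduler process; the model path is a placeholder.

```python
from sglang.srt.server_args import (
    get_global_server_args,
    prepare_server_args,
    set_global_server_args_for_scheduler,
)

server_args = prepare_server_args(["--model-path", "meta-llama/Llama-3.1-8B-Instruct"])
set_global_server_args_for_scheduler(server_args)

# Later, anywhere in the same process:
assert get_global_server_args() is server_args
# Calling get_global_server_args() before the setter raises ValueError.
```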
2707
3643
  def prepare_server_args(argv: List[str]) -> ServerArgs:
2708
3644
  """
2709
3645
  Prepare the server arguments from the command line arguments.
@@ -2715,14 +3651,35 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2715
3651
  Returns:
2716
3652
  The server arguments.
2717
3653
  """
3654
+ # Import here to avoid circular imports
3655
+ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
3656
+
3657
+ # Check for config file and merge arguments if present
3658
+ if "--config" in argv:
3659
+ # Extract boolean actions from the parser to handle them correctly
3660
+ parser = argparse.ArgumentParser()
3661
+ ServerArgs.add_cli_args(parser)
3662
+
3663
+ # Get boolean action destinations
3664
+ boolean_actions = []
3665
+ for action in parser._actions:
3666
+ if hasattr(action, "dest") and hasattr(action, "action"):
3667
+ if action.action in ["store_true", "store_false"]:
3668
+ boolean_actions.append(action.dest)
3669
+
3670
+ # Merge config file arguments with CLI arguments
3671
+ config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
3672
+ argv = config_merger.merge_config_with_args(argv)
3673
+
2718
3674
  parser = argparse.ArgumentParser()
2719
3675
  ServerArgs.add_cli_args(parser)
2720
3676
  raw_args = parser.parse_args(argv)
2721
- server_args = ServerArgs.from_cli_args(raw_args)
2722
- return server_args
3677
+
3678
+ return ServerArgs.from_cli_args(raw_args)
2723
3679
 
2724
3680
 
2725
3681
  ZMQ_TCP_PORT_DELTA = 233
3682
+ DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5
2726
3683
 
2727
3684
 
2728
3685
  @dataclasses.dataclass
@@ -2747,7 +3704,11 @@ class PortArgs:
2747
3704
  tokenizer_worker_ipc_name: Optional[str]
2748
3705
 
2749
3706
  @staticmethod
2750
- def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
3707
+ def init_new(
3708
+ server_args: ServerArgs,
3709
+ dp_rank: Optional[int] = None,
3710
+ worker_ports: Optional[List[int]] = None,
3711
+ ) -> PortArgs:
2751
3712
  if server_args.nccl_port is None:
2752
3713
  nccl_port = server_args.port + random.randint(100, 1000)
2753
3714
  while True:
@@ -2794,8 +3755,8 @@ class PortArgs:
2794
3755
  # TokenizerManager to DataParallelController
2795
3756
  scheduler_input_port = port_base + 4
2796
3757
  else:
2797
- scheduler_input_port = port_base + 4 + 1 + dp_rank
2798
-
3758
+ assert worker_ports is not None
3759
+ scheduler_input_port = worker_ports[dp_rank]
2799
3760
  return PortArgs(
2800
3761
  tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
2801
3762
  scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
@@ -2856,6 +3817,7 @@ def auto_choose_speculative_params(self: ServerArgs):
2856
3817
  # The default value for llama
2857
3818
  return (5, 4, 8)
2858
3819
  elif arch in [
3820
+ "DeepseekV32ForCausalLM",
2859
3821
  "DeepseekV3ForCausalLM",
2860
3822
  "DeepseekV2ForCausalLM",
2861
3823
  "GptOssForCausalLM",