sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
 - sglang/bench_one_batch_server.py +41 -25
 - sglang/bench_serving.py +378 -160
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +10 -15
 - sglang/profiler.py +18 -1
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +4 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +13 -64
 - sglang/srt/configs/load_config.py +25 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +136 -25
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +0 -10
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +5 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +20 -11
 - sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +4 -2
 - sglang/srt/disaggregation/decode.py +123 -31
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +157 -19
 - sglang/srt/disaggregation/nixl/conn.py +69 -24
 - sglang/srt/disaggregation/prefill.py +96 -270
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +63 -19
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +83 -80
 - sglang/srt/entrypoints/grpc_server.py +430 -234
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +195 -102
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +225 -37
 - sglang/srt/entrypoints/openai/serving_base.py +49 -2
 - sglang/srt/entrypoints/openai/serving_chat.py +29 -74
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +15 -1
 - sglang/srt/entrypoints/openai/serving_responses.py +5 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +58 -6
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +33 -4
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +20 -14
 - sglang/srt/function_call/glm4_moe_detector.py +1 -5
 - sglang/srt/function_call/gpt_oss_detector.py +1 -1
 - sglang/srt/function_call/json_array_parser.py +0 -2
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/function_call/utils.py +2 -2
 - sglang/srt/grpc/compile_proto.py +3 -3
 - sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
 - sglang/srt/layers/activation.py +10 -1
 - sglang/srt/layers/attention/aiter_backend.py +3 -3
 - sglang/srt/layers/attention/ascend_backend.py +17 -1
 - sglang/srt/layers/attention/attention_registry.py +43 -23
 - sglang/srt/layers/attention/base_attn_backend.py +20 -1
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +24 -10
 - sglang/srt/layers/attention/flashinfer_backend.py +258 -22
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
 - sglang/srt/layers/attention/flashmla_backend.py +2 -2
 - sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
 - sglang/srt/layers/attention/mamba/mamba.py +189 -241
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +0 -1
 - sglang/srt/layers/attention/nsa_backend.py +404 -90
 - sglang/srt/layers/attention/triton_backend.py +208 -34
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
 - sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
 - sglang/srt/layers/attention/utils.py +89 -7
 - sglang/srt/layers/attention/vision.py +3 -3
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +12 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +17 -0
 - sglang/srt/layers/layernorm.py +64 -19
 - sglang/srt/layers/linear.py +9 -1
 - sglang/srt/layers/logits_processor.py +152 -17
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
 - sglang/srt/layers/moe/ep_moe/layer.py +154 -625
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +6 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +7 -6
 - sglang/srt/layers/moe/utils.py +20 -5
 - sglang/srt/layers/quantization/__init__.py +5 -58
 - sglang/srt/layers/quantization/awq.py +183 -9
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +27 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +152 -81
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +42 -14
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +125 -100
 - sglang/srt/layers/quantization/mxfp4.py +35 -68
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +23 -48
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +87 -20
 - sglang/srt/layers/quantization/w8a8_int8.py +30 -24
 - sglang/srt/layers/radix_attention.py +62 -9
 - sglang/srt/layers/rotary_embedding.py +686 -17
 - sglang/srt/layers/sampler.py +47 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +0 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/triton_backend.py +0 -1
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora_manager.py +24 -9
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +40 -16
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
 - sglang/srt/managers/cache_controller.py +48 -17
 - sglang/srt/managers/data_parallel_controller.py +146 -42
 - sglang/srt/managers/detokenizer_manager.py +40 -13
 - sglang/srt/managers/io_struct.py +69 -16
 - sglang/srt/managers/mm_utils.py +20 -18
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
 - sglang/srt/managers/overlap_utils.py +96 -19
 - sglang/srt/managers/schedule_batch.py +241 -511
 - sglang/srt/managers/schedule_policy.py +15 -2
 - sglang/srt/managers/scheduler.py +420 -514
 - sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
 - sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
 - sglang/srt/managers/tokenizer_manager.py +375 -95
 - sglang/srt/managers/tp_worker.py +212 -161
 - sglang/srt/managers/utils.py +78 -2
 - sglang/srt/mem_cache/allocator.py +7 -2
 - sglang/srt/mem_cache/allocator_ascend.py +2 -2
 - sglang/srt/mem_cache/base_prefix_cache.py +2 -2
 - sglang/srt/mem_cache/chunk_cache.py +13 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +16 -1
 - sglang/srt/mem_cache/hicache_storage.py +11 -2
 - sglang/srt/mem_cache/hiradix_cache.py +16 -3
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +517 -219
 - sglang/srt/mem_cache/memory_pool_host.py +0 -1
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +53 -19
 - sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
 - sglang/srt/mem_cache/storage/backend_factory.py +2 -2
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +92 -26
 - sglang/srt/metrics/collector.py +31 -0
 - sglang/srt/metrics/func_timer.py +1 -1
 - sglang/srt/model_executor/cuda_graph_runner.py +43 -5
 - sglang/srt/model_executor/forward_batch_info.py +71 -25
 - sglang/srt/model_executor/model_runner.py +362 -270
 - sglang/srt/model_executor/npu_graph_runner.py +2 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +424 -27
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +47 -28
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +13 -52
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +19 -3
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +418 -140
 - sglang/srt/models/dots_ocr.py +0 -2
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +13 -19
 - sglang/srt/models/gemma3_mm.py +16 -0
 - sglang/srt/models/gemma3n_mm.py +1 -2
 - sglang/srt/models/glm4_moe.py +327 -382
 - sglang/srt/models/glm4_moe_nextn.py +6 -16
 - sglang/srt/models/glm4v.py +2 -1
 - sglang/srt/models/glm4v_moe.py +32 -199
 - sglang/srt/models/gpt_oss.py +5 -5
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +3 -1
 - sglang/srt/models/llama.py +2 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +5 -22
 - sglang/srt/models/longcat_flash_nextn.py +3 -14
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +13 -3
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen2_5_vl.py +3 -3
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +15 -12
 - sglang/srt/models/qwen2_vl.py +5 -2
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +19 -37
 - sglang/srt/models/qwen3_next.py +7 -12
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +37 -33
 - sglang/srt/models/qwen3_vl_moe.py +57 -185
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +0 -1
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/utils.py +11 -1
 - sglang/srt/multimodal/processors/base_processor.py +7 -2
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +0 -1
 - sglang/srt/multimodal/processors/glm4v.py +2 -6
 - sglang/srt/multimodal/processors/internvl.py +0 -2
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +75 -16
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/reasoning_parser.py +28 -2
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +17 -22
 - sglang/srt/sampling/sampling_params.py +70 -2
 - sglang/srt/server_args.py +846 -163
 - sglang/srt/server_args_config_parser.py +1 -1
 - sglang/srt/single_batch_overlap.py +36 -31
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
 - sglang/srt/speculative/eagle_info.py +57 -18
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +138 -0
 - sglang/srt/speculative/eagle_worker.py +83 -280
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
 - sglang/srt/speculative/ngram_worker.py +12 -11
 - sglang/srt/speculative/spec_info.py +2 -0
 - sglang/srt/speculative/spec_utils.py +38 -3
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/two_batch_overlap.py +28 -14
 - sglang/srt/utils/__init__.py +1 -1
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/utils/common.py +272 -82
 - sglang/srt/utils/hf_transformers_utils.py +44 -17
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +41 -0
 - sglang/test/runners.py +2 -0
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +3 -0
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/test_block_fp8.py +1 -2
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +463 -107
 - sglang/test/test_deterministic_utils.py +74 -0
 - sglang/test/test_disaggregation_utils.py +81 -0
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_utils.py +85 -20
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
 - sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
 - sglang/srt/models/vila.py +0 -306
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
    
sglang/srt/server_args.py  CHANGED

@@ -13,6 +13,8 @@
 # ==============================================================================
 """The arguments of the server."""
 
+from __future__ import annotations
+
 import argparse
 import dataclasses
 import json
@@ -20,37 +22,48 @@ import logging
 import os
 import random
 import tempfile
-from typing import List, Literal, Optional, Union
+from typing import Dict, List, Literal, Optional, Union
+
+import orjson
 
 from sglang.srt.connector import ConnectorType
+from sglang.srt.environ import envs
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.srt.utils import (
+from sglang.srt.utils.common import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
     configure_ipv6,
+    cpu_has_amx_support,
     get_device,
     get_device_memory_capacity,
+    get_device_sm,
     is_cuda,
+    is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
+    is_hopper_with_cuda_12_3,
+    is_no_spec_infer_or_topk_one,
     is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
+    is_sm120_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     json_list_type,
     nullable_str,
     parse_connector_type,
+    xpu_has_xmx_support,
 )
 from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
+
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -76,6 +89,7 @@ QUANTIZATION_CHOICES = [
     "bitsandbytes",
     "gguf",
     "modelopt",
+    "modelopt_fp8",
     "modelopt_fp4",
     "petit_nvfp4",
     "w8a8_int8",
@@ -84,6 +98,7 @@ QUANTIZATION_CHOICES = [
     "qoq",
     "w4afp8",
     "mxfp4",
+    "compressed-tensors",  # for Ktransformers
 ]
 
 ATTENTION_BACKEND_CHOICES = [
@@ -107,6 +122,7 @@ ATTENTION_BACKEND_CHOICES = [
     # Other platforms
     "intel_amx",
     "ascend",
+    "intel_xpu",
 ]
 
 LORA_BACKEND_CHOICES = ["triton", "csgmv"]
@@ -117,10 +133,22 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
 
 DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
 
-NSA_CHOICES = ["…
+NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
 
 RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
 
+MOE_RUNNER_BACKEND_CHOICES = [
+    "auto",
+    "deep_gemm",
+    "triton",
+    "triton_kernel",
+    "flashinfer_trtllm",
+    "flashinfer_cutlass",
+    "flashinfer_mxfp4",
+    "flashinfer_cutedsl",
+    "cutlass",
+]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -143,6 +171,10 @@ def add_grammar_backend_choices(choices):
     GRAMMAR_BACKEND_CHOICES.extend(choices)
 
 
+def add_moe_runner_backend_choices(choices):
+    MOE_RUNNER_BACKEND_CHOICES.extend(choices)
+
+
 def add_deterministic_attention_backend_choices(choices):
     DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
 
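The new `add_moe_runner_backend_choices` hook follows the existing pattern under the `# Allow external code to add more choices` comment: plugins mutate the module-level choice lists before the CLI parser is built. A minimal sketch of how an out-of-tree integration might use it — the backend name is purely illustrative, and this assumes the list is consumed as the argparse `choices=` for `--moe-runner-backend`:

```python
from sglang.srt import server_args

# Register an extra runner backend before ServerArgs.add_cli_args()
# builds its argparse parser; argparse validates --moe-runner-backend
# against MOE_RUNNER_BACKEND_CHOICES at parse time.
server_args.add_moe_runner_backend_choices(["my_custom_backend"])

assert "my_custom_backend" in server_args.MOE_RUNNER_BACKEND_CHOICES
```

This also explains a type change later in the diff: `moe_runner_backend` becomes a plain `str` rather than a `typing.Literal[...]`, since a closed `Literal` cannot express a runtime-extensible choice list.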
@@ -153,6 +185,15 @@ def add_radix_eviction_policy_choices(choices):
 
 @dataclasses.dataclass
 class ServerArgs:
+    """
+    The arguments of the server.
+
+    NOTE: When you add new arguments, please make sure the order
+    in this class definition is the same as the order in the function
+    `ServerArgs.add_cli_args`.
+    Please follow the existing style to group the new arguments into related groups or create new groups.
+    """
+
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
@@ -171,9 +212,11 @@ class ServerArgs:
     # HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
+    grpc_mode: bool = False
     skip_server_warmup: bool = False
     warmups: Optional[str] = None
     nccl_port: Optional[int] = None
+    checkpoint_engine_wait_weights_before_ready: bool = False
 
     # Quantization and data type
     dtype: str = "auto"
@@ -181,6 +224,11 @@ class ServerArgs:
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
     enable_fp32_lm_head: bool = False
+    modelopt_quant: Optional[Union[str, Dict]] = None
+    modelopt_checkpoint_restore_path: Optional[str] = None
+    modelopt_checkpoint_save_path: Optional[str] = None
+    modelopt_export_path: Optional[str] = None
+    quantize_and_serve: bool = False
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
@@ -191,6 +239,7 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     enable_priority_scheduling: bool = False
+    abort_on_priority_when_disabled: bool = False
     schedule_low_priority_values_first: bool = False
     priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
@@ -204,11 +253,12 @@ class ServerArgs:
     device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
-    …
+    pp_max_micro_batch_size: Optional[int] = None
     stream_interval: int = 1
     stream_output: bool = False
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
+    constrained_json_disable_any_whitespace: bool = False
     watchdog_timeout: float = 300
     dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
@@ -233,10 +283,10 @@ class ServerArgs:
     collect_tokens_histogram: bool = False
     prompt_tokens_buckets: Optional[List[str]] = None
     generation_tokens_buckets: Optional[List[str]] = None
+    gc_warning_threshold_secs: float = 0.0
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
-    gc_warning_threshold_secs: float = 0.0
     enable_trace: bool = False
     oltp_traces_endpoint: str = "localhost:4317"
 
@@ -251,6 +301,7 @@ class ServerArgs:
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
     tool_server: Optional[str] = None
+    sampling_defaults: str = "model"
 
     # Data parallelism
     dp_size: int = 1
@@ -277,6 +328,7 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
+    lora_eviction_policy: str = "lru"
     lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16
 
@@ -287,13 +339,14 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
-    …
-    …
+    nsa_prefill_backend: str = "flashmla_sparse"
+    nsa_decode_backend: str = "fa3"
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
     speculative_draft_model_revision: Optional[str] = None
+    speculative_draft_load_format: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
@@ -312,15 +365,8 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend: Literal["none", "deepep"] = "none"
-    moe_runner_backend: Literal[
-        "auto",
-        "triton",
-        "triton_kernel",
-        "flashinfer_trtllm",
-        "flashinfer_cutlass",
-        "flashinfer_mxfp4",
-    ] = "auto"
+    moe_a2a_backend: Literal["none", "deepep", "mooncake"] = "none"
+    moe_runner_backend: str = "auto"
     flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
@@ -339,10 +385,13 @@ class ServerArgs:
     enable_expert_distribution_metrics: bool = False
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
+    elastic_ep_backend: Literal[None, "mooncake"] = None
+    mooncake_ib_device: Optional[str] = None
 
     # Mamba cache
     max_mamba_cache_size: Optional[int] = None
     mamba_ssm_dtype: str = "float32"
+    mamba_full_memory_ratio: float = 0.9
 
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
@@ -357,6 +406,13 @@ class ServerArgs:
     # LMCache
     enable_lmcache: bool = False
 
+    # Ktransformers
+    kt_amx_weight_path: Optional[str] = None
+    kt_amx_method: Optional[str] = None
+    kt_cpuinfer: Optional[int] = None
+    kt_threadpool_count: Optional[int] = None
+    kt_num_gpu_experts: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -372,6 +428,12 @@ class ServerArgs:
     offload_prefetch_step: int = 1
     offload_mode: str = "cpu"
 
+    # Scoring configuration
+    # Delimiter token ID used to combine Query and Items into a single sequence for multi-item scoring.
+    # Format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+    # This enables efficient batch processing of multiple items against a single query.
+    multi_item_scoring_delimiter: Optional[Union[int]] = None
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
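The comment block on `multi_item_scoring_delimiter` pins down the sequence layout, so the assembly step is easy to sketch. A minimal illustration of the documented `Query<delimiter>Item1<delimiter>Item2<delimiter>...` format — the helper function and the token IDs are hypothetical, not sglang API:

```python
from typing import List

def build_multi_item_sequence(
    query_ids: List[int], item_ids: List[List[int]], delimiter: int
) -> List[int]:
    # Query<delimiter>Item1<delimiter>Item2<delimiter>... as one token-ID
    # sequence. Whether a trailing delimiter follows the last item is not
    # specified by the comment above, so none is appended here.
    seq = list(query_ids)
    for ids in item_ids:
        seq.append(delimiter)
        seq.extend(ids)
    return seq

# Illustrative IDs only; 32000 stands in for a real delimiter token ID.
assert build_multi_item_sequence([101, 102], [[7, 8], [9]], 32000) == [
    101, 102, 32000, 7, 8, 32000, 9
]
```

Also worth noting: `Optional[Union[int]]` in the added field is equivalent to plain `Optional[int]`; the one-armed `Union` is redundant.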
         @@ -384,6 +446,7 @@ class ServerArgs: 
     | 
|
| 
       384 
446 
     | 
    
         
             
                enable_symm_mem: bool = False
         
     | 
| 
       385 
447 
     | 
    
         
             
                disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
         
     | 
| 
       386 
448 
     | 
    
         
             
                enable_tokenizer_batch_encode: bool = False
         
     | 
| 
      
 449 
     | 
    
         
            +
                disable_tokenizer_batch_decode: bool = False
         
     | 
| 
       387 
450 
     | 
    
         
             
                disable_outlines_disk_cache: bool = False
         
     | 
| 
       388 
451 
     | 
    
         
             
                disable_custom_all_reduce: bool = False
         
     | 
| 
       389 
452 
     | 
    
         
             
                enable_mscclpp: bool = False
         
     | 
| 
         @@ -396,7 +459,11 @@ class ServerArgs: 
     | 
|
| 
       396 
459 
     | 
    
         
             
                enable_single_batch_overlap: bool = False
         
     | 
| 
       397 
460 
     | 
    
         
             
                tbo_token_distribution_threshold: float = 0.48
         
     | 
| 
       398 
461 
     | 
    
         
             
                enable_torch_compile: bool = False
         
     | 
| 
      
 462 
     | 
    
         
            +
                enable_piecewise_cuda_graph: bool = False
         
     | 
| 
       399 
463 
     | 
    
         
             
                torch_compile_max_bs: int = 32
         
     | 
| 
      
 464 
     | 
    
         
            +
                piecewise_cuda_graph_max_tokens: int = 4096
         
     | 
| 
      
 465 
     | 
    
         
            +
                piecewise_cuda_graph_tokens: Optional[List[int]] = None
         
     | 
| 
      
 466 
     | 
    
         
            +
                piecewise_cuda_graph_compiler: str = "eager"
         
     | 
| 
       400 
467 
     | 
    
         
             
                torchao_config: str = ""
         
     | 
| 
       401 
468 
     | 
    
         
             
                enable_nan_detection: bool = False
         
     | 
| 
       402 
469 
     | 
    
         
             
                enable_p2p_check: bool = False
         
     | 
| 
         @@ -418,6 +485,7 @@ class ServerArgs: 
     | 
|
| 
       418 
485 
     | 
    
         
             
                scheduler_recv_interval: int = 1
         
     | 
| 
       419 
486 
     | 
    
         
             
                numa_node: Optional[List[int]] = None
         
     | 
| 
       420 
487 
     | 
    
         
             
                enable_deterministic_inference: bool = False
         
     | 
| 
      
 488 
     | 
    
         
            +
                rl_on_policy_target: Optional[str] = None
         
     | 
| 
       421 
489 
     | 
    
         | 
| 
       422 
490 
     | 
    
         
             
                # Dynamic batch tokenizer
         
     | 
| 
       423 
491 
     | 
    
         
             
                enable_dynamic_batch_tokenizer: bool = False
         
     | 
| 
@@ -428,7 +496,6 @@ class ServerArgs:
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
-    debug_tensor_dump_prefill_only: bool = False

     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
@@ -452,12 +519,18 @@ class ServerArgs:

     # For PD-Multiplexing
     enable_pdmux: bool = False
-    …
+    pdmux_config_path: Optional[str] = None
+    sm_group_num: int = 8

     def __post_init__(self):
         """
         Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
         """
+
+        if self.model_path.lower() in ["none", "dummy"]:
+            # Skip for dummy models
+            return
+
         # Handle deprecated arguments.
         self._handle_deprecated_args()

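The new early return above means a `none`/`dummy` model path now skips the whole post-init pipeline, which is convenient when the dataclass is needed without a real checkpoint. A hedged sketch:

```python
from sglang.srt.server_args import ServerArgs

# Sketch based on the early return added above: with a dummy model path,
# none of the _handle_* methods run, so derived fields keep their
# declared defaults.
args = ServerArgs(model_path="dummy")
assert args.attention_backend is None  # assumes the declared default is None
```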
@@ -477,6 +550,9 @@ class ServerArgs:
         # Apply model-specific adjustments.
         self._handle_model_specific_adjustments()

+        # Handle Hicache settings.
+        self._handle_hicache()
+
         # Set kernel backends.
         self._handle_sampling_backend()
         self._handle_attention_backend_compatibility()
@@ -484,21 +560,21 @@ class ServerArgs:
         self._handle_amd_specifics()
         self._handle_grammar_backend()

+        # Handle Ktransformers specific configs
+        self._handle_ktransformers_configs()
+
         # Handle data parallelism.
         self._handle_data_parallelism()

         # Handle MoE configurations.
         self._handle_moe_kernel_config()
-        self.…
+        self._handle_a2a_moe()
         self._handle_eplb_and_dispatch()
         self._handle_expert_distribution_metrics()

         # Handle pipeline parallelism.
         self._handle_pipeline_parallelism()

-        # Handle Hicache settings.
-        self._handle_hicache()
-
         # Handle speculative decoding logic.
         self._handle_speculative_decoding()

@@ -526,8 +602,17 @@ class ServerArgs:
         # Handle any other necessary validations.
         self._handle_other_validations()

+        # Handle elastic expert parallelism.
+        self._handle_elastic_ep()
+
     def _handle_deprecated_args(self):
-        …
+        # handle deprecated tool call parsers
+        deprecated_tool_call_parsers = {"qwen25": "qwen", "glm45": "glm"}
+        if self.tool_call_parser in deprecated_tool_call_parsers:
+            logger.warning(
+                f"The tool_call_parser '{self.tool_call_parser}' is deprecated. Please use '{deprecated_tool_call_parsers[self.tool_call_parser]}' instead."
+            )
+            self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]

     def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
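The deprecation shim above is a plain alias map. A standalone restatement of the same pattern, with the `qwen25`/`glm45` names taken straight from the hunk:

```python
import logging

logger = logging.getLogger(__name__)

# Standalone re-statement of the alias handling added above.
DEPRECATED_TOOL_CALL_PARSERS = {"qwen25": "qwen", "glm45": "glm"}

def resolve_tool_call_parser(name: str) -> str:
    """Map a deprecated parser name to its replacement, warning once."""
    if name in DEPRECATED_TOOL_CALL_PARSERS:
        new = DEPRECATED_TOOL_CALL_PARSERS[name]
        logger.warning("tool_call_parser %r is deprecated; use %r instead.", name, new)
        return new
    return name

assert resolve_tool_call_parser("qwen25") == "qwen"
assert resolve_tool_call_parser("glm") == "glm"
```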
@@ -571,9 +656,19 @@ class ServerArgs:
                 self.chunked_prefill_size = 2048
             if self.cuda_graph_max_bs is None:
                 self.cuda_graph_max_bs = 8
+        elif is_npu() and gpu_mem < 32 * 1024:
+            # Atlas A2B4
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 16 if tp < 4 else 64)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 16
+                else:
+                    self.cuda_graph_max_bs = 64
         elif gpu_mem < 35 * 1024:
             # A10, 4090, 5090
-            # (chunked_prefill_size 2k, cuda_graph_max_bs …
+            # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
             if self.chunked_prefill_size is None:
                 self.chunked_prefill_size = 2048
             if self.cuda_graph_max_bs is None:
@@ -581,7 +676,7 @@ class ServerArgs:
                 # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
                 # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
                 if self.tp_size < 4:
-                    self.cuda_graph_max_bs = …
+                    self.cuda_graph_max_bs = 24
                 else:
                     self.cuda_graph_max_bs = 80
         elif gpu_mem < 60 * 1024:
@@ -594,6 +689,16 @@ class ServerArgs:
                     self.cuda_graph_max_bs = 32
                 else:
                     self.cuda_graph_max_bs = 160
+        elif is_npu() and gpu_mem < 64 * 1024:
+            # Atlas A2 and Atlas A3
+            # (chunked_prefill_size 32k, cuda_graph_max_bs 64 if tp < 4 else 128)
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 32768
+            if self.cuda_graph_max_bs is None:
+                if self.tp_size < 4:
+                    self.cuda_graph_max_bs = 64
+                else:
+                    self.cuda_graph_max_bs = 128
         elif gpu_mem < 90 * 1024:
             # H100, A100
             # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
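The two NPU branches above add Atlas-specific defaults. A dependency-free restatement of just those tiers (the `is_npu()` probe is replaced by the caller's knowledge; the values are straight from the hunks):

```python
# Standalone restatement of the NPU default tiers added above.
def npu_defaults(gpu_mem_mib: int, tp_size: int) -> tuple[int, int]:
    """Return (chunked_prefill_size, cuda_graph_max_bs) for NPU devices."""
    if gpu_mem_mib < 32 * 1024:  # Atlas A2B4
        return 32768, (16 if tp_size < 4 else 64)
    if gpu_mem_mib < 64 * 1024:  # Atlas A2 / Atlas A3
        return 32768, (64 if tp_size < 4 else 128)
    raise ValueError("memory tier not covered by this sketch")

assert npu_defaults(24 * 1024, tp_size=8) == (32768, 64)
assert npu_defaults(48 * 1024, tp_size=2) == (32768, 64)
```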
@@ -634,6 +739,11 @@ class ServerArgs:
         else:
             self.cuda_graph_max_bs = max(self.cuda_graph_bs)

+        if self.piecewise_cuda_graph_tokens is None:
+            self.piecewise_cuda_graph_tokens = (
+                self._generate_piecewise_cuda_graph_tokens()
+            )
+
         if self.mem_fraction_static is None:
             # Constant meta data (e.g., from attention backend)
             reserved_mem = 512
@@ -674,11 +784,9 @@ class ServerArgs:
                 else 0.88
             )

-            # …
-            # …
-            …
-
-            model_config = ModelConfig.from_server_args(self)
+            # Multimodal models need more memory for the image processing,
+            # so we adjust the mem_fraction_static accordingly.
+            model_config = self.get_model_config()
             if model_config.is_multimodal:
                 self.adjust_mem_fraction_for_vlm(model_config)

@@ -712,6 +820,25 @@ class ServerArgs:

         return capture_bs

+    def _generate_piecewise_cuda_graph_tokens(self):
+        """
+        Generate the list of batch sizes for piecewise CUDA graph capture
+        based on piecewise_cuda_graph_max_tokens.
+        """
+        capture_sizes = (
+            list(range(4, 33, 4))
+            + list(range(48, 257, 16))
+            + list(range(288, 513, 32))
+            + list(range(640, 4096 + 1, 128))
+            + list(range(4352, self.piecewise_cuda_graph_max_tokens + 1, 256))
+        )
+
+        capture_sizes = [
+            s for s in capture_sizes if s <= self.piecewise_cuda_graph_max_tokens
+        ]
+
+        return capture_sizes
+
     def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
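The capture schedule above is easy to inspect outside sglang. A standalone restatement (same ranges as the diff), runnable on its own to see how dense the schedule is:

```python
# Standalone re-statement of _generate_piecewise_cuda_graph_tokens above:
# fine steps for small token counts, coarser steps as sizes grow.
def piecewise_tokens(max_tokens: int = 4096) -> list[int]:
    sizes = (
        list(range(4, 33, 4))               # 4..32, step 4
        + list(range(48, 257, 16))          # 48..256, step 16
        + list(range(288, 513, 32))         # 288..512, step 32
        + list(range(640, 4096 + 1, 128))   # 640..4096, step 128
        + list(range(4352, max_tokens + 1, 256))  # beyond 4096, step 256
    )
    return [s for s in sizes if s <= max_tokens]

print(len(piecewise_tokens()))  # 58 capture sizes for the default 4096 cap
```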
@@ -731,21 +858,59 @@ class ServerArgs:

         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
-        if model_arch in ["…
-            if …
+        if model_arch in ["DeepseekV3ForCausalLM"] and not is_deepseek_nsa(hf_config):
+            if is_cuda() and is_sm100_supported():
+                if (
+                    self.attention_backend is None
+                    and self.prefill_attention_backend is None
+                    and self.decode_attention_backend is None
+                ):
+                    self.attention_backend = "trtllm_mla"
+                    logger.info(
+                        "Use trtllm_mla as attention backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
+                    )
+                if self.moe_runner_backend == "auto":
+                    self.moe_runner_backend = "flashinfer_trtllm"
+                    logger.info(
+                        "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
+                    )
+                    if self.quantization is None:
+                        # Default DeepSeek V3/R1 native FP8 when not explicitly set,
+                        # Because we need this condition for an assertion in
+                        # flashinfer_trtllm MoE runner backend.
+                        self.quantization = "fp8"
+                        logger.info(
+                            "Quantization not specified, default to fp8 for DeepSeek on sm100"
+                        )
+
+        elif model_arch in ["GptOssForCausalLM"]:
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
                 if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
                 elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"
-
-            …
-
-            )
+
+            supported_backends = ["triton", "trtllm_mha", "fa3", "fa4"]
+            prefill_attn_backend, decode_attn_backend = self.get_attention_backends()
             assert (
-                …
-
+                prefill_attn_backend in supported_backends
+                and decode_attn_backend in supported_backends
+            ), (
+                f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got the following backends\n"
+                f"- Prefill: {prefill_attn_backend}\n"
+                f"- Decode: {decode_attn_backend}\n"
+            )

             if is_sm100_supported():
                 if not self.enable_dp_attention:
@@ -788,7 +953,13 @@ class ServerArgs:
             "fa3",
             "aiter",
             "triton",
-        …
+            "trtllm_mha",
+        }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
+        if is_sm100_supported() and self.attention_backend is None:
+            self.attention_backend = "trtllm_mha"
+            logger.warning(
+                "Use trtllm_mha as attention backend on sm100 for Llama4 model"
+            )
     elif model_arch in [
         "Gemma2ForCausalLM",
         "Gemma3ForCausalLM",
@@ -802,6 +973,31 @@ class ServerArgs:
             f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
         )
         self.disable_hybrid_swa_memory = True
+    elif model_arch in ["Olmo2ForCausalLM"]:
+        # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with Olmo3 model.
+        logger.warning(
+            f"Disabling hybrid SWA memory for {model_arch} as it is not yet supported."
+        )
+        self.disable_hybrid_swa_memory = True
+
+        if self.attention_backend is None:
+            if is_cuda() and is_sm100_supported():
+                self.attention_backend = "trtllm_mha"
+            elif is_cuda() and get_device_sm() >= 80:
+                self.attention_backend = "fa3"
+            else:
+                self.attention_backend = "triton"
+
+        # Flashinfer appears to degrade performance when sliding window attention
+        # is used for the Olmo2 architecture. Olmo2 does not use sliding window attention
+        # but Olmo3 does.
+        assert (
+            self.attention_backend != "flashinfer"
+        ), "FlashInfer backend can significantly degrade the performance of Olmo3 models."
+
+        logger.info(
+            f"Using {self.attention_backend} as attention backend for {model_arch}."
+        )

     if is_deepseek_nsa(hf_config):
         if (
@@ -820,9 +1016,6 @@ class ServerArgs:
             self.page_size = 64
             logger.warning("Setting page size to 64 for DeepSeek NSA.")

-            self.mem_fraction_static = 0.8
-            logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
-
             # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
             import torch

@@ -832,10 +1025,10 @@ class ServerArgs:
                 logger.warning("Setting KV cache dtype to fp8.")

             if self.kv_cache_dtype == "fp8_e4m3":
-                self.…
-                self.…
+                self.nsa_prefill_backend = "flashmla_kv"
+                self.nsa_decode_backend = "flashmla_kv"
                 logger.warning(
-                    "Setting NSA backend to …
+                    "Setting NSA backend to flashmla_kv for FP8 KV Cache."
                 )

             # Logging env vars for NSA
@@ -852,6 +1045,67 @@ class ServerArgs:
         )

     def _handle_attention_backend_compatibility(self):
+        model_config = self.get_model_config()
+        use_mla_backend = self.use_mla_backend()
+
+        if self.prefill_attention_backend is not None and (
+            self.prefill_attention_backend == self.decode_attention_backend
+        ):  # override the default attention backend
+            self.attention_backend = self.prefill_attention_backend
+
+        # Pick the default attention backend if not specified
+        if self.attention_backend is None:
+            """
+            Auto select the fastest attention backend.
+
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
+            """
+
+            if not use_mla_backend:
+                # MHA architecture
+                if (
+                    is_hopper_with_cuda_12_3()
+                    and is_no_spec_infer_or_topk_one(self)
+                    and is_fa3_default_architecture(self.model_config.hf_config)
+                ):
+                    self.attention_backend = "fa3"
+                elif is_hip():
+                    self.attention_backend = "aiter"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+            else:
+                # MLA architecture
+                if is_hopper_with_cuda_12_3():
+                    self.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    self.attention_backend = "flashinfer"
+                elif is_hip():
+                    head_num = model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if head_num == 128 or head_num == 16:
+                        self.attention_backend = "aiter"
+                    else:
+                        self.attention_backend = "triton"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = "triton"
+
+            logger.warning(
+                f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
+            )
+
+        # Torch native and flex attention backends
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
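For readers skimming the new auto-selection logic above, here is a hedged, dependency-free restatement of the decision cascade. The boolean parameters stand in for sglang's real platform probes (`is_hopper_with_cuda_12_3`, `is_sm100_supported`, `is_hip`, `is_npu`, `is_flashinfer_available`, ...), so this is a sketch of the control flow, not the implementation itself:

```python
# Dependency-free restatement of the default-backend cascade added above.
def pick_default_backend(
    use_mla: bool,
    hopper_cuda123: bool = False,
    sm100: bool = False,
    hip: bool = False,
    npu: bool = False,
    flashinfer_available: bool = False,
    fa3_default_arch: bool = False,
    no_spec_or_topk_one: bool = True,
    kv_head_num: int = 0,
) -> str:
    if not use_mla:
        # MHA models (Llama, Qwen, ...)
        if hopper_cuda123 and no_spec_or_topk_one and fa3_default_arch:
            return "fa3"
        if hip:
            return "aiter"
        if npu:
            return "ascend"
        return "flashinfer" if flashinfer_available else "triton"
    # MLA models (DeepSeek family)
    if hopper_cuda123:
        return "fa3"
    if sm100:
        return "flashinfer"
    if hip:
        # aiter currently supports only 16 or 128 KV heads
        return "aiter" if kv_head_num in (16, 128) else "triton"
    if npu:
        return "ascend"
    return "triton"

assert pick_default_backend(use_mla=True, sm100=True) == "flashinfer"
assert pick_default_backend(use_mla=False, npu=True) == "ascend"
```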
@@ -867,12 +1121,7 @@ class ServerArgs:
             self.speculative_algorithm is None
         ), "Speculative decoding is currently not supported with Flex Attention backend"

-        …
-            logger.warning(
-                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
-            )
-            self.page_size = 128
-
+        # Major NVIDIA platforms backends
         if (
             self.attention_backend == "flashmla"
             or self.decode_attention_backend == "flashmla"
         @@ -927,6 +1176,76 @@ class ServerArgs: 
     | 
|
| 
       927 
1176 
     | 
    
         
             
                            )
         
     | 
| 
       928 
1177 
     | 
    
         
             
                            self.page_size = 64
         
     | 
| 
       929 
1178 
     | 
    
         | 
| 
      
 1179 
     | 
    
         
            +
                    if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
         
     | 
| 
      
 1180 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1181 
     | 
    
         
            +
                            "FlashAttention3 only supports fp8_e4m3 if using FP8; "
         
     | 
| 
      
 1182 
     | 
    
         
            +
                            "Setting attention backend to triton."
         
     | 
| 
      
 1183 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1184 
     | 
    
         
            +
                        self.attention_backend = "triton"
         
     | 
| 
      
 1185 
     | 
    
         
            +
             
     | 
| 
      
 1186 
     | 
    
         
            +
                    if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
         
     | 
| 
      
 1187 
     | 
    
         
            +
                        raise ValueError(
         
     | 
| 
      
 1188 
     | 
    
         
            +
                            "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
         
     | 
| 
      
 1189 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1190 
     | 
    
         
            +
                    if self.prefill_attention_backend == "fa4":
         
     | 
| 
      
 1191 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1192 
     | 
    
         
            +
                            f"FA4 backend only supports page size 128, changing page_size from {self.page_size} to 128."
         
     | 
| 
      
 1193 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1194 
     | 
    
         
            +
                        self.page_size = 128
         
     | 
| 
      
 1195 
     | 
    
         
            +
             
     | 
| 
      
 1196 
     | 
    
         
            +
                    # AMD platforms backends
         
     | 
| 
      
 1197 
     | 
    
         
            +
                    if self.attention_backend == "aiter":
         
     | 
| 
      
 1198 
     | 
    
         
            +
                        if model_config.context_len > 8192:
         
     | 
| 
      
 1199 
     | 
    
         
            +
                            self.mem_fraction_static *= 0.90
         
     | 
| 
      
 1200 
     | 
    
         
            +
             
     | 
| 
      
 1201 
     | 
    
         
            +
                    # NPU platforms backends
         
     | 
| 
      
 1202 
     | 
    
         
            +
                    if is_npu() and self.attention_backend in ["ascend"]:
         
     | 
| 
      
 1203 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1204 
     | 
    
         
            +
                            "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
         
     | 
| 
      
 1205 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1206 
     | 
    
         
            +
                        self.page_size = 128
         
     | 
| 
      
 1207 
     | 
    
         
            +
             
     | 
| 
      
 1208 
     | 
    
         
            +
                    # Other platforms backends
         
     | 
| 
      
 1209 
     | 
    
         
            +
                    if (
         
     | 
| 
      
 1210 
     | 
    
         
            +
                        self.attention_backend == "intel_amx"
         
     | 
| 
      
 1211 
     | 
    
         
            +
                        and self.device == "cpu"
         
     | 
| 
      
 1212 
     | 
    
         
            +
                        and not cpu_has_amx_support()
         
     | 
| 
      
 1213 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 1214 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1215 
     | 
    
         
            +
                            "The current platform does not support Intel AMX, will fallback to torch_native backend."
         
     | 
| 
      
 1216 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1217 
     | 
    
         
            +
                        self.attention_backend = "torch_native"
         
     | 
| 
      
 1218 
     | 
    
         
            +
             
+        if (
+            self.attention_backend == "intel_xpu"
+            and self.device == "xpu"
+            and not xpu_has_xmx_support()
+        ):
+            logger.warning(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            self.attention_backend = "triton"
+
+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
+
+        # Dual chunk flash attention backend
+        if (
+            getattr(model_config.hf_config, "dual_chunk_attention_config", None)
+            is not None
+        ):
+            if self.attention_backend is None:
+                self.attention_backend = "dual_chunk_flash_attn"
+                logger.info("Dual chunk attention is turned on by default.")
+            elif self.attention_backend != "dual_chunk_flash_attn":
+                raise ValueError(
+                    "Dual chunk attention is enabled, but attention backend is set to "
+                    f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
+                )
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
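In plain terms, the new XPU guard falls back from the `intel_xpu` attention backend to `triton` whenever the device lacks Intel XMX support, and separately coerces unsupported page sizes to 128. A minimal standalone restatement of that control flow, where `has_xmx` stands in for `xpu_has_xmx_support()` and everything else mirrors the hunk above:

```python
# Sketch only: re-states the XPU fallback logic from the hunk above.
def resolve_xpu_backend(backend: str, device: str, has_xmx: bool, page_size: int):
    if backend == "intel_xpu" and device == "xpu" and not has_xmx:
        backend = "triton"  # no Intel XMX support -> fall back to triton
    if backend == "intel_xpu" and page_size not in (32, 64, 128):
        page_size = 128  # intel_xpu kernels only accept these page sizes
    return backend, page_size

assert resolve_xpu_backend("intel_xpu", "xpu", False, 16) == ("triton", 16)
assert resolve_xpu_backend("intel_xpu", "xpu", True, 16) == ("intel_xpu", 128)
```

Note the ordering: the XMX fallback runs first, so a machine without XMX keeps its original page size because the backend is no longer `intel_xpu` when the page-size check runs.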
@@ -946,6 +1265,22 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+    def _handle_ktransformers_configs(self):
+        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            override_config,
+        )
+
+        override_config(
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            self.kt_num_gpu_experts,
+            self.kt_cpuinfer,
+            self.kt_threadpool_count,
+            self.kt_amx_weight_path,
+            self.kt_amx_method,
+            self.chunked_prefill_size,
+        )
+
     def _handle_data_parallelism(self):
         if self.dp_size == 1:
             self.enable_dp_attention = False
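The new `_handle_ktransformers_configs` hook is pure plumbing: it forwards the `kt_*` server arguments (plus `chunked_prefill_size`) into `override_config` on the compressed-tensors WNA16 AMX EP MoE method. A toy sketch of that forwarding pattern, with a stub standing in for the real `override_config` (whose actual behavior lives in `compressed_tensors_moe.py`) and illustrative values throughout:

```python
# Stub sketch: the real override_config mutates configuration on the MoE
# method class; this stand-in only records the values to show the call shape.
def override_config(method_cls, num_gpu_experts, cpuinfer, threadpool_count,
                    amx_weight_path, amx_method, chunked_prefill_size):
    method_cls.config = dict(
        num_gpu_experts=num_gpu_experts, cpuinfer=cpuinfer,
        threadpool_count=threadpool_count, amx_weight_path=amx_weight_path,
        amx_method=amx_method, chunked_prefill_size=chunked_prefill_size,
    )

class FakeWNA16AMXEPMoEMethod:  # stand-in for CompressedTensorsWNA16AMXEPMoEMethod
    pass

override_config(FakeWNA16AMXEPMoEMethod, 8, 32, 2, "/tmp/amx", "AMXINT4", 8192)
assert FakeWNA16AMXEPMoEMethod.config["num_gpu_experts"] == 8
```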
@@ -983,7 +1318,7 @@ class ServerArgs:
                 "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
             )
 
-    def
+    def _handle_a2a_moe(self):
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -993,6 +1328,12 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
+        if self.moe_a2a_backend == "mooncake":
+            self.ep_size = self.tp_size
+            logger.warning(
+                f"Mooncake MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
     def _handle_eplb_and_dispatch(self):
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
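Like the existing DeepEP branch directly above it, the new Mooncake branch pins the expert-parallel size to the tensor-parallel size. Distilled into a standalone rule (the function name is ours; the behavior is the diff's):

```python
def resolve_ep_size(moe_a2a_backend: str, tp_size: int, ep_size: int) -> int:
    # Both A2A backends require EP size == TP size.
    if moe_a2a_backend in ("deepep", "mooncake"):
        return tp_size
    return ep_size

assert resolve_ep_size("mooncake", 8, 1) == 8
assert resolve_ep_size("none", 8, 1) == 1
```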
@@ -1008,6 +1349,15 @@ class ServerArgs:
         if self.enable_eplb:
             assert self.ep_size > 1
 
+    def _handle_elastic_ep(self):
+        if self.elastic_ep_backend is not None:
+            if self.enable_eplb:
+                if self.eplb_algorithm == "auto":
+                    self.eplb_algorithm = "elasticity_aware"
+                assert (
+                    self.eplb_algorithm == "elasticity_aware"
+                ), "Elastic EP requires eplb_algorithm to be set to 'auto' or 'elasticity_aware'."
+
     def _handle_expert_distribution_metrics(self):
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
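The elastic-EP handler enforces one invariant: when an elastic EP backend is active together with EPLB, the load-balancing algorithm must be `elasticity_aware`, with `auto` silently promoted. A plain-Python restatement of that rule (function name ours):

```python
def resolve_eplb_algorithm(elastic_ep_backend, enable_eplb, eplb_algorithm):
    # With an elastic EP backend, EPLB must use the elasticity-aware
    # algorithm; "auto" is promoted, anything else is rejected.
    if elastic_ep_backend is not None and enable_eplb:
        if eplb_algorithm == "auto":
            eplb_algorithm = "elasticity_aware"
        assert eplb_algorithm == "elasticity_aware"
    return eplb_algorithm

assert resolve_eplb_algorithm("mooncake", True, "auto") == "elasticity_aware"
```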
@@ -1046,6 +1396,24 @@ class ServerArgs:
                 "Page first direct layout only support direct io backend"
             )
 
+        if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
+            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
+            if self.decode_attention_backend is None:
+                if not self.use_mla_backend():
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+                else:
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_sm100_supported() else "triton"
+                    )
+            elif self.decode_attention_backend == "fa3":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
+                    "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
+                )
+
     def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
             self.speculative_algorithm = "EAGLE"
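The HiCache hunk resolves a single incompatibility: FlashAttention3 decoding cannot be combined with the `kernel` IO backend, so either a non-fa3 decode backend is auto-selected or the IO backend is downgraded to `direct`. A compact sketch of the same decision table (function and parameter names are ours; the helper booleans stand in for `use_mla_backend()`, `is_flashinfer_available()`, and `is_sm100_supported()`):

```python
def resolve_hicache_decode(decode_backend, use_mla, flashinfer_ok, sm100):
    # Mirrors the new HiCache "kernel" compatibility rules above.
    io_backend = "kernel"
    if decode_backend is None:
        if not use_mla:
            decode_backend = "flashinfer" if flashinfer_ok else "triton"
        else:
            decode_backend = "flashinfer" if sm100 else "triton"
    elif decode_backend == "fa3":
        io_backend = "direct"  # fa3 decode is incompatible with kernel IO
    return decode_backend, io_backend

assert resolve_hicache_decode("fa3", False, True, False) == ("fa3", "direct")
assert resolve_hicache_decode(None, True, True, True) == ("flashinfer", "kernel")
```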
@@ -1056,13 +1424,28 @@ class ServerArgs:
             raise ValueError(
                 "Currently standalone speculative decoding does not support dp attention."
             )
+
         if self.max_running_requests is None:
             self.max_running_requests = 48
-
-
-
-
-
+            logger.warning(
+                "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
+            )
+
+        if (
+            self.speculative_algorithm == "EAGLE"
+            and envs.SGLANG_ENABLE_SPEC_V2.get()
+        ):
+            self.disable_overlap_schedule = False
+            logger.warning(
+                "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
+            )
+
+        if not envs.SGLANG_ENABLE_SPEC_V2.get():
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
+            )
+
         if self.enable_mixed_chunk:
             self.enable_mixed_chunk = False
             logger.warning(
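The overlap-scheduler toggling above hinges on the `SGLANG_ENABLE_SPEC_V2` environment flag: EAGLE with SPEC_V2 re-enables the overlap scheduler, while any speculative run without SPEC_V2 forces it off. Restated as a pure function (name ours), which also makes the precedence explicit:

```python
def resolve_overlap_schedule(algorithm, spec_v2_enabled, disable_overlap):
    # EAGLE + SGLANG_ENABLE_SPEC_V2 turns the overlap scheduler on;
    # without SPEC_V2, speculative decoding forces it off.
    if algorithm == "EAGLE" and spec_v2_enabled:
        disable_overlap = False
    if not spec_v2_enabled:
        disable_overlap = True
    return disable_overlap

assert resolve_overlap_schedule("EAGLE", False, False) is True
assert resolve_overlap_schedule("EAGLE", True, True) is False
```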
@@ -1129,8 +1512,13 @@ class ServerArgs:
             raise ValueError(
                 "Ngram speculative decoding only supports CUDA device."
             )
+
         if self.max_running_requests is None:
             self.max_running_requests = 48
+            logger.warning(
+                "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
+            )
+
         self.disable_overlap_schedule = True
         self.enable_mixed_chunk = False
         self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1216,6 +1604,26 @@ class ServerArgs:
                 "Please choose one tokenizer batching approach."
             )
 
+        if self.skip_tokenizer_init:
+            if self.tokenizer_worker_num != 1:
+                logger.warning(
+                    "skip_tokenizer_init=True disables tokenizer workers; forcing tokenizer_worker_num=1 "
+                    f"(requested {self.tokenizer_worker_num})."
+                )
+                self.tokenizer_worker_num = 1
+
+            if self.enable_tokenizer_batch_encode:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-tokenizer-batch-encode; disabling it."
+                )
+                self.enable_tokenizer_batch_encode = False
+
+            if self.enable_dynamic_batch_tokenizer:
+                logger.warning(
+                    "skip_tokenizer_init=True ignores --enable-dynamic-batch-tokenizer; disabling it."
+                )
+                self.enable_dynamic_batch_tokenizer = False
+
     def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
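All three warnings above implement one normalization: with `skip_tokenizer_init=True` there is no tokenizer, so every tokenizer-side option is reset to its neutral value. A one-function sketch of the effective result (name ours):

```python
def normalize_tokenizer_flags(skip_init, worker_num, batch_encode, dynamic):
    # skip_tokenizer_init=True makes all tokenizer-side options moot.
    if skip_init:
        worker_num, batch_encode, dynamic = 1, False, False
    return worker_num, batch_encode, dynamic

assert normalize_tokenizer_flags(True, 4, True, True) == (1, False, False)
assert normalize_tokenizer_flags(False, 4, True, True) == (4, True, True)
```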
@@ -1253,21 +1661,65 @@ class ServerArgs:
             )
 
     def _handle_deterministic_inference(self):
+        if self.rl_on_policy_target is not None:
+            logger.warning(
+                "Enable deterministic inference because of rl_on_policy_target."
+            )
+            self.enable_deterministic_inference = True
+            # TODO remove this environment variable as a whole
+            os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
+
         if self.enable_deterministic_inference:
             # Check sampling backend
             self.sampling_backend = "pytorch"
             logger.warning(
                 "Sampling backend is set to pytorch for deterministic inference."
             )
+            is_deepseek_model = False
+            if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+                try:
+                    hf_config = self.get_hf_config()
+                    model_arch = hf_config.architectures[0]
+                    is_deepseek_model = model_arch in [
+                        "DeepseekV2ForCausalLM",
+                        "DeepseekV3ForCausalLM",
+                        "DeepseekV32ForCausalLM",
+                    ]
+                except Exception:
+                    pass
 
             # Check attention backend
-            if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+            if self.attention_backend is None:
+                # User didn't specify attention backend, fallback based on GPU architecture
+                if is_sm100_supported() or is_sm120_supported():
+                    # Blackwell and newer architectures
+                    if is_deepseek_model:
+                        # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
+                        self.attention_backend = "triton"
+                    else:
+                        # fallback to flashinfer on Blackwell for non-DeepSeek models
+                        self.attention_backend = "flashinfer"
+                else:
+                    # Hopper (SM90) and older architectures
+                    self.attention_backend = "fa3"
+                logger.warning(
+                    f"Attention backend not specified. Falling back to '{self.attention_backend}' for deterministic inference. "
+                    f"You can explicitly set --attention-backend to one of {DETERMINISTIC_ATTENTION_BACKEND_CHOICES}."
+                )
+            elif self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+                # User explicitly specified an incompatible attention backend
                 raise ValueError(
-                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference, "
+                    f"but you explicitly specified '{self.attention_backend}'."
                 )
 
-
-
+            if self.attention_backend not in ["fa3", "triton"]:
+                if is_deepseek_model:
+                    raise ValueError(
+                        f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
+                    )
+
+                # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
                 self.disable_radix_cache = True
                 logger.warning(
                     f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
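The fallback chain for deterministic inference is worth spelling out, since it now depends on both GPU generation and model family: an unspecified backend resolves to `triton` for DeepSeek models on Blackwell-class GPUs, `flashinfer` for other models on Blackwell, and `fa3` on Hopper and older. A standalone restatement (function name ours; `blackwell_or_newer` stands in for `is_sm100_supported() or is_sm120_supported()`):

```python
def pick_deterministic_backend(user_backend, blackwell_or_newer, is_deepseek):
    # Explicit choices are validated against
    # DETERMINISTIC_ATTENTION_BACKEND_CHOICES elsewhere; unspecified
    # backends are resolved by GPU generation and model family.
    if user_backend is not None:
        return user_backend
    if blackwell_or_newer:
        return "triton" if is_deepseek else "flashinfer"
    return "fa3"  # Hopper (SM90) and older

assert pick_deterministic_backend(None, True, True) == "triton"
assert pick_deterministic_backend(None, True, False) == "flashinfer"
assert pick_deterministic_backend(None, False, False) == "fa3"
```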
@@ -1286,6 +1738,7 @@ class ServerArgs:
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+
         # Model and tokenizer
         parser.add_argument(
             "--model-path",
@@ -1405,6 +1858,11 @@ class ServerArgs:
             default=ServerArgs.port,
             help="The port of the HTTP server.",
         )
+        parser.add_argument(
+            "--grpc-mode",
+            action="store_true",
+            help="If set, use gRPC server instead of HTTP server.",
+        )
         parser.add_argument(
             "--skip-server-warmup",
             action="store_true",
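Because these are ordinary argparse flags, the quickest way to sanity-check the new `--grpc-mode` switch is to feed `add_cli_args` a fresh parser. A sketch, assuming the usual `sglang.srt.server_args` module path (not shown in this hunk) and an illustrative model path:

```python
import argparse

from sglang.srt.server_args import ServerArgs  # assumed module path

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    ["--model-path", "Qwen/Qwen2.5-7B-Instruct", "--grpc-mode"]
)
assert args.grpc_mode is True  # store_true flag introduced in this diff
```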
@@ -1423,6 +1881,12 @@ class ServerArgs:
             default=ServerArgs.nccl_port,
             help="The port for NCCL distributed environment setup. Defaults to a random port.",
         )
+        parser.add_argument(
+            "--checkpoint-engine-wait-weights-before-ready",
+            action="store_true",
+            help="If set, the server will wait for initial weights to be loaded via checkpoint-engine or other update methods "
+            "before serving inference requests.",
+        )
 
         # Quantization and data type
         parser.add_argument(
@@ -1459,14 +1923,53 @@ class ServerArgs:
             "--kv-cache-dtype",
             type=str,
             default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
         )
         parser.add_argument(
             "--enable-fp32-lm-head",
             action="store_true",
             help="If set, the LM head outputs (logits) are in FP32.",
         )
+        parser.add_argument(
+            "--modelopt-quant",
+            type=str,
+            default=ServerArgs.modelopt_quant,
+            help="The ModelOpt quantization configuration. "
+            "Supported values: 'fp8', 'int4_awq', 'w4a8_awq', 'nvfp4', 'nvfp4_awq'. "
+            "This requires the NVIDIA Model Optimizer library to be installed: pip install nvidia-modelopt",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-restore-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_restore_path,
+            help="Path to restore a previously saved ModelOpt quantized checkpoint. "
+            "If provided, the quantization process will be skipped and the model "
+            "will be loaded from this checkpoint.",
+        )
+        parser.add_argument(
+            "--modelopt-checkpoint-save-path",
+            type=str,
+            default=ServerArgs.modelopt_checkpoint_save_path,
+            help="Path to save the ModelOpt quantized checkpoint after quantization. "
+            "This allows reusing the quantized model in future runs.",
+        )
+        parser.add_argument(
+            "--modelopt-export-path",
+            type=str,
+            default=ServerArgs.modelopt_export_path,
+            help="Path to export the quantized model in HuggingFace format after ModelOpt quantization. "
+            "The exported model can then be used directly with SGLang for inference. "
+            "If not provided, the model will not be exported.",
+        )
+        parser.add_argument(
+            "--quantize-and-serve",
+            action="store_true",
+            default=ServerArgs.quantize_and_serve,
+            help="Quantize the model with ModelOpt and immediately serve it without exporting. "
+            "This is useful for development and prototyping. For production, it's recommended "
+            "to use separate quantization and deployment steps.",
+        )
 
         # Memory and scheduling
         parser.add_argument(
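Taken together, the new ModelOpt flags describe a quantize, save/restore, then optionally export pipeline, and `--kv-cache-dtype` now accepts the BF16 spellings. A parse-level sketch of combining them (module path assumed as above; model and checkpoint paths are illustrative values only):

```python
import argparse

from sglang.srt.server_args import ServerArgs  # assumed module path

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args([
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",
    "--kv-cache-dtype", "bf16",
    "--modelopt-quant", "fp8",
    "--modelopt-checkpoint-save-path", "/tmp/llama31-fp8-ckpt",
    "--quantize-and-serve",
])
assert args.kv_cache_dtype == "bf16" and args.quantize_and_serve
```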
@@ -1519,6 +2022,12 @@ class ServerArgs:
             default=ServerArgs.enable_priority_scheduling,
             help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
         )
+        parser.add_argument(
+            "--abort-on-priority-when-disabled",
+            action="store_true",
+            default=ServerArgs.abort_on_priority_when_disabled,
+            help="If set, abort requests that specify a priority when priority scheduling is disabled.",
+        )
         parser.add_argument(
             "--schedule-low-priority-values-first",
             action="store_true",
@@ -1565,7 +2074,14 @@ class ServerArgs:
         parser.add_argument(
             "--disable-hybrid-swa-memory",
             action="store_true",
-            help="Disable the hybrid SWA memory.",
+            help="Disable the hybrid SWA memory pool.",
+        )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
         )
 
         # Runtime options
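The two policies the new `--radix-eviction-policy` flag selects between are the textbook ones; a toy illustration of the difference (the real eviction lives in sglang's radix tree implementation, not in these few lines):

```python
from collections import Counter, OrderedDict

def evict_lru(access_order: "OrderedDict[str, None]") -> str:
    return next(iter(access_order))  # least recently used = oldest entry

def evict_lfu(freq: "Counter[str]") -> str:
    return min(freq, key=freq.get)  # least frequently used = smallest count

order = OrderedDict([("a", None), ("b", None)])
order.move_to_end("a")  # "a" was just accessed, so "b" is now LRU
assert evict_lru(order) == "b"
assert evict_lfu(Counter({"a": 3, "b": 1})) == "b"
```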
@@ -1590,9 +2106,9 @@ class ServerArgs:
             help="The pipeline parallelism size.",
         )
         parser.add_argument(
-            "--max-micro-batch-size",
+            "--pp-max-micro-batch-size",
             type=int,
-            default=ServerArgs.max_micro_batch_size,
+            default=ServerArgs.pp_max_micro_batch_size,
             help="The maximum micro batch size in pipeline parallelism.",
         )
         parser.add_argument(
@@ -1616,7 +2132,12 @@ class ServerArgs:
             "--constrained-json-whitespace-pattern",
             type=str,
             default=ServerArgs.constrained_json_whitespace_pattern,
-            help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+            help="(outlines and llguidance backends only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+        )
+        parser.add_argument(
+            "--constrained-json-disable-any-whitespace",
+            action="store_true",
+            help="(xgrammar and llguidance backends only) Enforce compact representation in JSON constrained output.",
         )
         parser.add_argument(
             "--watchdog-timeout",
@@ -1863,6 +2384,16 @@ class ServerArgs:
             default=None,
             help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
         )
+        parser.add_argument(
+            "--sampling-defaults",
+            type=str,
+            choices=["openai", "model"],
+            default=ServerArgs.sampling_defaults,
+            help="Where to get default sampling parameters. "
+            "'openai' uses SGLang/OpenAI defaults (temperature=1.0, top_p=1.0, etc.). "
+            "'model' uses the model's generation_config.json to get the recommended "
+            "sampling parameters if available. Default is 'model'.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -1966,6 +2497,13 @@ class ServerArgs:
             default=ServerArgs.max_loaded_loras,
             help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
         )
+        parser.add_argument(
+            "--lora-eviction-policy",
+            type=str,
+            default=ServerArgs.lora_eviction_policy,
+            choices=["lru", "fifo"],
+            help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -2025,14 +2563,14 @@ class ServerArgs:
             help="Set multimodal attention backend.",
         )
         parser.add_argument(
-            "--nsa-prefill",
-            default=ServerArgs.nsa_prefill,
+            "--nsa-prefill-backend",
+            default=ServerArgs.nsa_prefill_backend,
             type=str,
             choices=NSA_CHOICES,
         )
         parser.add_argument(
-            "--nsa-decode",
-            default=ServerArgs.nsa_decode,
+            "--nsa-decode-backend",
+            default=ServerArgs.nsa_decode_backend,
             type=str,
             choices=NSA_CHOICES,
         )
@@ -2058,6 +2596,15 @@ class ServerArgs:
             "name, a tag name, or a commit id. If unspecified, will use "
             "the default version.",
         )
+        parser.add_argument(
+            "--speculative-draft-load-format",
+            type=str,
+            default=ServerArgs.speculative_draft_load_format,
+            choices=LOAD_FORMAT_CHOICES,
+            help="The format of the draft model weights to load. "
+            "If not specified, will use the same format as --load-format. "
+            "Use 'dummy' to initialize draft model weights with random values for profiling.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -2158,22 +2705,14 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["none", "deepep"],
+            choices=["none", "deepep", "mooncake"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--moe-runner-backend",
             type=str,
-            choices=[
-                "auto",
-                "triton",
-                "triton_kernel",
-                "flashinfer_trtllm",
-                "flashinfer_cutlass",
-                "flashinfer_mxfp4",
-                "flashinfer_cutedsl",
-            ],
+            choices=MOE_RUNNER_BACKEND_CHOICES,
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
         )
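Replacing the inline list with `MOE_RUNNER_BACKEND_CHOICES` gives the parser and any programmatic validation a single source of truth. A minimal sketch of the pattern, assuming the constant carries the same seven entries the old inline list had (the real definition lives elsewhere in sglang):

```python
import argparse

# Assumed contents, mirroring the old inline list; the real constant is defined in sglang.
MOE_RUNNER_BACKEND_CHOICES = (
    "auto", "triton", "triton_kernel", "flashinfer_trtllm",
    "flashinfer_cutlass", "flashinfer_mxfp4", "flashinfer_cutedsl",
)

parser = argparse.ArgumentParser()
parser.add_argument("--moe-runner-backend", choices=MOE_RUNNER_BACKEND_CHOICES, default="auto")

def validate_backend(name: str) -> str:
    # Non-CLI entry points (config files, Python API) can check the same tuple.
    if name not in MOE_RUNNER_BACKEND_CHOICES:
        raise ValueError(f"unknown MoE runner backend: {name!r}")
    return name

args = parser.parse_args(["--moe-runner-backend", "triton"])
print(validate_backend(args.moe_runner_backend))  # triton
```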
@@ -2272,6 +2811,21 @@ class ServerArgs:
             default=ServerArgs.moe_dense_tp_size,
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
+        parser.add_argument(
+            "--elastic-ep-backend",
+            type=str,
+            default=ServerArgs.elastic_ep_backend,
+            choices=["none", "mooncake"],
+            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
+        )
+        parser.add_argument(
+            "--mooncake-ib-device",
+            type=str,
+            default=ServerArgs.mooncake_ib_device,
+            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
+            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
+        )
 
         # Mamba Cache
         parser.add_argument(
@@ -2287,6 +2841,12 @@ class ServerArgs:
             choices=["float32", "bfloat16"],
             help="The data type of the SSM states in mamba cache.",
         )
+        parser.add_argument(
+            "--mamba-full-memory-ratio",
+            type=float,
+            default=ServerArgs.mamba_full_memory_ratio,
+            help="The ratio of mamba state memory to full kv cache memory.",
+        )
 
         # Hierarchical cache
         parser.add_argument(
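The help text defines the flag as a ratio of Mamba state memory to full KV-cache memory. One plausible reading, shown purely as illustrative arithmetic (the actual pool sizing lives in sglang's memory-pool code and may differ):

```python
# Illustrative arithmetic only, assuming mamba_bytes = ratio * kv_bytes and a fixed
# combined budget; this is an interpretation, not sglang's allocator.
def split_cache_budget(total_bytes: int, mamba_full_memory_ratio: float) -> tuple:
    kv_bytes = int(total_bytes / (1.0 + mamba_full_memory_ratio))
    mamba_bytes = total_bytes - kv_bytes
    return mamba_bytes, kv_bytes

# With a 64 GiB budget and ratio 0.25, the KV cache keeps ~51.2 GiB.
mamba, kv = split_cache_budget(64 * 2**30, 0.25)
print(mamba / 2**30, kv / 2**30)
```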
@@ -2313,13 +2873,6 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
-        parser.add_argument(
-            "--radix-eviction-policy",
-            type=str,
-            choices=RADIX_EVICTION_POLICY_CHOICES,
-            default=ServerArgs.radix_eviction_policy,
-            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
-        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -2364,6 +2917,35 @@ class ServerArgs:
             help="Using LMCache as an alternative hierarchical cache solution",
         )
 
+        # Ktransformer server args
+        parser.add_argument(
+            "--kt-amx-weight-path",
+            type=str,
+            help="[ktransformers parameter] The path of the quantized expert weights for amx kernel. A local folder.",
+        )
+        parser.add_argument(
+            "--kt-amx-method",
+            type=str,
+            default="AMXINT4",
+            help="[ktransformers parameter] Quantization formats for CPU execution.",
+        )
+        parser.add_argument(
+            "--kt-cpuinfer",
+            type=int,
+            help="[ktransformers parameter] The number of CPUInfer threads.",
+        )
+        parser.add_argument(
+            "--kt-threadpool-count",
+            type=int,
+            default=2,
+            help="[ktransformers parameter] One-to-one with the number of NUMA nodes (one thread pool per NUMA).",
+        )
+        parser.add_argument(
+            "--kt-num-gpu-experts",
+            type=int,
+            help="[ktransformers parameter] The number of GPU experts.",
+        )
+
         # Double Sparsity
         parser.add_argument(
             "--enable-double-sparsity",
@@ -2398,7 +2980,7 @@ class ServerArgs:
             "--ds-sparse-decode-threshold",
             type=int,
             default=ServerArgs.ds_sparse_decode_threshold,
-            help="The
+            help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
         )
 
         # Offloading
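The rewritten help pins down the threshold's role: sequences shorter than it stay on the dense path. As a one-line predicate (illustrative; the real dispatch sits in the double-sparsity attention backend):

```python
# Illustrative predicate, not the sglang dispatch code.
def use_sparse_decode(seq_len: int, ds_sparse_decode_threshold: int) -> bool:
    # Sparse decoding only pays off once there are enough KV entries to skip.
    return seq_len >= ds_sparse_decode_threshold

print(use_sparse_decode(256, 4096))   # False: short sequence, dense fallback
print(use_sparse_decode(8192, 4096))  # True: sparse decode kernel
```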
@@ -2433,6 +3015,14 @@ class ServerArgs:
             help="Mode of offloading.",
         )
 
+        # Args for multi-item-scoring
+        parser.add_argument(
+            "--multi-item-scoring-delimiter",
+            type=int,
+            default=ServerArgs.multi_item_scoring_delimiter,
+            help="Delimiter token ID for multi-item scoring. Used to combine Query and Items into a single sequence: Query<delimiter>Item1<delimiter>Item2<delimiter>... This enables efficient batch processing of multiple items against a single query.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
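The help text spells out the packing scheme, which a short sketch makes concrete (token IDs are made up):

```python
from typing import List

# Builds Query<delim>Item1<delim>Item2<delim>... exactly as the help describes,
# so one prefill scores every item against the shared query.
def pack_multi_item_sequence(
    query_ids: List[int], items: List[List[int]], delimiter: int
) -> List[int]:
    seq = list(query_ids) + [delimiter]
    for item_ids in items:
        seq.extend(item_ids)
        seq.append(delimiter)
    return seq

print(pack_multi_item_sequence([11, 12], [[21], [31, 32]], delimiter=0))
# [11, 12, 0, 21, 0, 31, 32, 0]
```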
@@ -2491,6 +3081,11 @@ class ServerArgs:
             action="store_true",
             help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
         )
+        parser.add_argument(
+            "--disable-tokenizer-batch-decode",
+            action="store_true",
+            help="Disable batch decoding when decoding multiple completions.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -2552,12 +3147,36 @@ class ServerArgs:
             action="store_true",
             help="Optimize the model with torch.compile. Experimental feature.",
         )
+        parser.add_argument(
+            "--enable-piecewise-cuda-graph",
+            action="store_true",
+            help="Optimize the model with piecewise cuda graph for extend/prefill only. Experimental feature.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-tokens",
+            type=json_list_type,
+            default=ServerArgs.piecewise_cuda_graph_tokens,
+            help="Set the list of tokens when using piecewise cuda graph.",
+        )
+        parser.add_argument(
+            "--piecewise-cuda-graph-compiler",
+            type=str,
+            default=ServerArgs.piecewise_cuda_graph_compiler,
+            help="Set the compiler for piecewise cuda graph. Choices are: eager, inductor.",
+            choices=["eager", "inductor"],
+        )
         parser.add_argument(
             "--torch-compile-max-bs",
             type=int,
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
+        parser.add_argument(
+            "--piecewise-cuda-graph-max-tokens",
+            type=int,
+            default=ServerArgs.piecewise_cuda_graph_max_tokens,
+            help="Set the maximum tokens when using piecewise cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
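`--piecewise-cuda-graph-tokens` supplies the token counts at which graphs are captured, and `--piecewise-cuda-graph-max-tokens` caps replay. A sketch of the bucket lookup this implies, assuming the usual pad-up-to-nearest-captured-size dispatch for fixed-shape CUDA graphs (the actual machinery is in sglang's compilation backend, not shown in this hunk):

```python
import bisect

# Assumed dispatch rule: pad the token count up to the nearest captured size,
# falling back to eager execution past the cap.
def pick_graph_bucket(num_tokens: int, captured_sizes: list, max_tokens: int):
    if num_tokens > max_tokens:
        return None  # eager fallback
    i = bisect.bisect_left(captured_sizes, num_tokens)  # sizes must be sorted
    return captured_sizes[i] if i < len(captured_sizes) else None

print(pick_graph_bucket(300, [128, 256, 512, 1024], max_tokens=1024))  # 512
```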
@@ -2667,31 +3286,20 @@ class ServerArgs:
             nargs="+",
             help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
         )
-
-        # Debug tensor dumps
         parser.add_argument(
-            "--debug-tensor-dump-output-folder",
-            type=str,
-            default=ServerArgs.debug_tensor_dump_output_folder,
-            help="The output folder for dumping tensors.",
-        )
-        parser.add_argument(
-            "--debug-tensor-dump-input-file",
-            type=str,
-            default=ServerArgs.debug_tensor_dump_input_file,
-            help="The input filename for dumping tensors",
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
         )
         parser.add_argument(
-            "--debug-tensor-dump-inject",
+            "--rl-on-policy-target",
             type=str,
-            default=ServerArgs.debug_tensor_dump_inject,
-            help="Inject the outputs from jax as the input of every layer.",
-        )
-        parser.add_argument(
-            "--debug-tensor-dump-prefill-only",
-            action="store_true",
-            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+            default=ServerArgs.rl_on_policy_target,
+            choices=["fsdp"],
+            help="The training system that SGLang needs to match for true on-policy.",
         )
+
+        # Dynamic batch tokenizer
         parser.add_argument(
             "--enable-dynamic-batch-tokenizer",
             action="store_true",
@@ -2710,6 +3318,26 @@ class ServerArgs:
             help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
         )
 
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
@@ -2813,7 +3441,12 @@ class ServerArgs:
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
-
+        parser.add_argument(
+            "--pdmux-config-path",
+            type=str,
+            default=None,
+            help="The path of the PD-Multiplexing config file.",
+        )
         parser.add_argument(
             "--sm-group-num",
             type=int,
@@ -2821,50 +3454,6 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )
 
-        # For deterministic inference
-        parser.add_argument(
-            "--enable-deterministic-inference",
-            action="store_true",
-            help="Enable deterministic inference mode with batch invariant ops.",
-        )
-
-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-ep-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
-        )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutlass-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutedsl-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-trtllm-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
-        )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
-        )
-
         # Configuration file support
         parser.add_argument(
             "--config",
@@ -2894,11 +3483,39 @@ class ServerArgs:
             self.model_path,
             trust_remote_code=self.trust_remote_code,
             revision=self.revision,
-            model_override_args=json.loads(self.json_model_override_args),
+            model_override_args=orjson.loads(self.json_model_override_args),
             **kwargs,
         )
         return hf_config
 
+    def get_model_config(self):
+        # Lazy init to avoid circular import
+        from sglang.srt.configs.model_config import ModelConfig
+
+        if hasattr(self, "model_config"):
+            return self.model_config
+        self.model_config = ModelConfig.from_server_args(self)
+        return self.model_config
+
+    def get_attention_backends(self):
+        prefill_attention_backend_str = (
+            self.prefill_attention_backend
+            if self.prefill_attention_backend
+            else self.attention_backend
+        )
+        decode_attention_backend_str = (
+            self.decode_attention_backend
+            if self.decode_attention_backend
+            else self.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
+
+    def use_mla_backend(self):
+        from sglang.srt.configs.model_config import AttentionArch
+
+        model_config = self.get_model_config()
+        return model_config.attention_arch == AttentionArch.MLA
+
     def check_server_args(self):
         # Check parallel size constraints
         assert (
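`get_model_config` builds the `ModelConfig` once and caches it on the instance, so `use_mla_backend` and repeated callers share one object. A usage sketch (the model path is a placeholder, and constructing `ServerArgs` this minimally assumes its defaults resolve in your environment):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder path
cfg = args.get_model_config()           # built once via ModelConfig.from_server_args
assert cfg is args.get_model_config()   # second call returns the cached object
prefill, decode = args.get_attention_backends()
print(prefill, decode, args.use_mla_backend())
```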
@@ -2941,7 +3558,34 @@ class ServerArgs:
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"
 
-        # Check
+        # Check pdmux
+        if self.enable_pdmux:
+            assert (
+                self.pp_size == 1
+            ), "PD-Multiplexing is only supported with pipeline parallelism disabled (pp_size=1)."
+            assert (
+                self.chunked_prefill_size == -1
+            ), "PD-Multiplexing is not compatible with chunked prefill."
+            assert (
+                self.disaggregation_mode == "null"
+            ), "PD-Multiplexing is not compatible with disaggregation mode."
+            assert (
+                self.disable_overlap_schedule
+            ), "PD-Multiplexing is not compatible with overlap schedule."
+
+            # NOTE: CUDA Green Context may encounter potential issues with CudaGraph on torch 2.7.x – 2.8.x, leading to performance degradation.
+            import torch
+
+            parts = torch.__version__.split("+", 1)[0].split(".")
+            major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
+            minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
+            if (major, minor) > (2, 6):
+                logger.warning(
+                    "WARNING: PD-Multiplexing may experience performance degradation with torch versions > 2.6.x.\n"
+                    f"  Current torch version is {torch.__version__}.\n"
+                    "  Please manually install torch 2.6.x."
+                )
+
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
         self.validate_buckets_rule(
             "--prompt-tokens-buckets", self.prompt_tokens_buckets
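The version gate strips any local build suffix before comparing, so a wheel tag like `2.8.0+cu128` still parses as `(2, 8)`. The parsing logic, reproduced standalone:

```python
# Mirrors the parsing in check_server_args; runnable without torch installed.
def torch_major_minor(version: str) -> tuple:
    parts = version.split("+", 1)[0].split(".")
    major = int(parts[0]) if len(parts) > 0 and parts[0].isdigit() else 0
    minor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
    return major, minor

print(torch_major_minor("2.8.0+cu128"))  # (2, 8): triggers the pdmux warning
print(torch_major_minor("2.6.3"))        # (2, 6): no warning
```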
@@ -2957,6 +3601,17 @@ class ServerArgs:
                 "lof",
             ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
 
+        # Check multi-item scoring
+        if self.multi_item_scoring_delimiter is not None:
+            assert self.disable_radix_cache, (
+                "Multi-item scoring requires radix cache to be disabled. "
+                "Please set --disable-radix-cache when using --multi-item-scoring-delimiter."
+            )
+            assert self.chunked_prefill_size == -1, (
+                "Multi-item scoring requires chunked prefill to be disabled. "
+                "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
+            )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
 
@@ -3141,6 +3796,22 @@ class ServerArgs:
         )
 
 
+# NOTE: This is a global variable to hold the server args for scheduler.
+_global_server_args: Optional[ServerArgs] = None
+
+
+def set_global_server_args_for_scheduler(server_args: ServerArgs):
+    global _global_server_args
+    _global_server_args = server_args
+
+
+def get_global_server_args() -> ServerArgs:
+    if _global_server_args is None:
+        raise ValueError("Global server args is not set yet!")
+
+    return _global_server_args
+
+
 def prepare_server_args(argv: List[str]) -> ServerArgs:
     """
     Prepare the server arguments from the command line arguments.
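The accessor pair lets scheduler internals read `ServerArgs` without threading it through every call site; `get_global_server_args` raises if nothing was registered. A usage sketch (placeholder model path):

```python
from sglang.srt.server_args import (
    ServerArgs,
    get_global_server_args,
    set_global_server_args_for_scheduler,
)

args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder
set_global_server_args_for_scheduler(args)  # done once during scheduler startup

# Deep inside scheduler code, no ServerArgs parameter is needed:
assert get_global_server_args() is args
```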
@@ -3175,11 +3846,12 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     raw_args = parser.parse_args(argv)
-
-    return
+
+    return ServerArgs.from_cli_args(raw_args)
 
 
 ZMQ_TCP_PORT_DELTA = 233
+DP_ATTENTION_HANDSHAKE_PORT_DELTA = 5
 
 
 @dataclasses.dataclass
@@ -3204,7 +3876,11 @@ class PortArgs:
     tokenizer_worker_ipc_name: Optional[str]
 
     @staticmethod
-    def init_new(server_args, dp_rank: Optional[int] = None) -> PortArgs:
+    def init_new(
+        server_args: ServerArgs,
+        dp_rank: Optional[int] = None,
+        worker_ports: Optional[List[int]] = None,
+    ) -> PortArgs:
         if server_args.nccl_port is None:
             nccl_port = server_args.port + random.randint(100, 1000)
             while True:
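With the widened signature, a launcher that pre-allocates one scheduler-input port per data-parallel rank can hand the list straight to `init_new`. A call sketch; the ports are placeholders, and a real `ServerArgs` for DP attention needs more fields than shown:

```python
from sglang.srt.server_args import PortArgs, ServerArgs

# Placeholder setup; real DP-attention configs carry more fields.
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",
    enable_dp_attention=True,
    dp_size=2,
    tp_size=2,
)
worker_ports = [31001, 31002]  # one pre-allocated port per DP rank

port_args = PortArgs.init_new(args, dp_rank=0, worker_ports=worker_ports)
print(port_args.scheduler_input_ipc_name)  # tcp://<host>:31001
```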
@@ -3217,6 +3893,13 @@ class PortArgs:
         else:
             nccl_port = server_args.nccl_port
 
+        if server_args.tokenizer_worker_num > 1:
+            tokenizer_worker_ipc_name = (
+                f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+            )
+        else:
+            tokenizer_worker_ipc_name = None
+
         if not server_args.enable_dp_attention:
             # Normal case, use IPC within a single node
             return PortArgs(
@@ -3226,7 +3909,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
-                tokenizer_worker_ipc_name=
+                tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3251,8 +3934,8 @@ class PortArgs:
                 # TokenizerManager to DataParallelController
                 scheduler_input_port = port_base + 4
             else:
-
-
+                assert worker_ports is not None
+                scheduler_input_port = worker_ports[dp_rank]
             return PortArgs(
                 tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
                 scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
@@ -3260,7 +3943,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
-                tokenizer_worker_ipc_name=
+                tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
             )
 