sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
 - sglang/bench_serving.py +56 -12
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/model_config.py +5 -5
 - sglang/srt/distributed/parallel_state.py +0 -7
 - sglang/srt/entrypoints/engine.py +18 -15
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +75 -94
 - sglang/srt/environ.py +16 -2
 - sglang/srt/eplb/expert_distribution.py +30 -0
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/flashattention_backend.py +12 -2
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
 - sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +1 -0
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +19 -4
 - sglang/srt/layers/logits_processor.py +5 -0
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +79 -272
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +4 -4
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/quantization/__init__.py +3 -5
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +13 -1
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/managers/io_struct.py +3 -0
 - sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
 - sglang/srt/managers/scheduler.py +21 -15
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/tokenizer_manager.py +11 -19
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +82 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/model_executor/forward_batch_info.py +44 -3
 - sglang/srt/model_executor/model_runner.py +1 -149
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/models/deepseek_v2.py +147 -44
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v_moe.py +29 -196
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +2 -4
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/server_args.py +365 -186
 - sglang/srt/single_batch_overlap.py +2 -7
 - sglang/srt/utils/common.py +87 -42
 - sglang/srt/utils/hf_transformers_utils.py +7 -3
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
 - sglang/srt/models/vila.py +0 -306
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
    
sglang/srt/server_args.py CHANGED

@@ -27,19 +27,24 @@ from typing import Dict, List, Literal, Optional, Union
 import orjson
 
 from sglang.srt.connector import ConnectorType
+from sglang.srt.environ import envs
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.srt.utils import (
+from sglang.srt.utils.common import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
     configure_ipv6,
+    cpu_has_amx_support,
     get_device,
     get_device_memory_capacity,
     get_device_sm,
     is_cuda,
+    is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
+    is_hopper_with_cuda_12_3,
+    is_no_spec_infer_or_topk_one,
     is_npu,
     is_port_available,
     is_remote_url,
@@ -51,6 +56,7 @@ from sglang.srt.utils import (
     json_list_type,
     nullable_str,
     parse_connector_type,
+    xpu_has_xmx_support,
 )
 from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
 from sglang.utils import is_in_ci
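
For downstream code that imported these helpers, the module move above is the visible change. A minimal before/after sketch (assuming an sglang install; whether `sglang.srt.utils` still re-exports these names in 0.5.4.post1 is not confirmed by this diff):

    # 0.5.4.post1
    from sglang.srt.utils.common import is_cuda, is_hip, is_npu
    # 0.5.4
    # from sglang.srt.utils import is_cuda, is_hip, is_npu
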
@@ -127,8 +133,6 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
 
 DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
 
-DEFAULT_LORA_EVICTION_POLICY = "lru"
-
 NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
 
 RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
@@ -181,6 +185,15 @@ def add_radix_eviction_policy_choices(choices):
 
 @dataclasses.dataclass
 class ServerArgs:
+    """
+    The arguments of the server.
+
+    NOTE: When you add new arguments, please make sure the order
+    in this class definition the same as the order in the the function
+    `ServerArgs.add_cli_args`.
+    Please follow the existing style to group the new arguments into related groups or create new groups.
+    """
+
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
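
The new docstring codifies a convention (field order mirrors `ServerArgs.add_cli_args`) rather than behavior. As an illustration only, a hypothetical miniature of the pattern, including the tokenizer-path default the real class applies in `_handle_missing_default_values`:

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass
    class MiniServerArgs:
        """Miniature stand-in for ServerArgs (illustration, not sglang API)."""

        # Model and tokenizer
        model_path: str
        tokenizer_path: Optional[str] = None
        # Runtime options
        tp_size: int = 1

        def __post_init__(self):
            # Mirrors ServerArgs._handle_missing_default_values.
            if self.tokenizer_path is None:
                self.tokenizer_path = self.model_path

    args = MiniServerArgs(model_path="my-org/my-model")
    assert args.tokenizer_path == "my-org/my-model"
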
@@ -190,11 +203,6 @@ class ServerArgs:
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
-    modelopt_quant: Optional[Union[str, Dict]] = None
-    modelopt_checkpoint_restore_path: Optional[str] = None
-    modelopt_checkpoint_save_path: Optional[str] = None
-    modelopt_export_path: Optional[str] = None
-    quantize_and_serve: bool = False
     context_length: Optional[int] = None
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
@@ -216,6 +224,11 @@ class ServerArgs:
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
     enable_fp32_lm_head: bool = False
+    modelopt_quant: Optional[Union[str, Dict]] = None
+    modelopt_checkpoint_restore_path: Optional[str] = None
+    modelopt_checkpoint_save_path: Optional[str] = None
+    modelopt_export_path: Optional[str] = None
+    quantize_and_serve: bool = False
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
@@ -238,8 +251,6 @@ class ServerArgs:
 
     # Runtime options
     device: Optional[str] = None
-    elastic_ep_backend: Literal[None, "mooncake"] = None
-    mooncake_ib_device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
     pp_max_micro_batch_size: Optional[int] = None
@@ -272,10 +283,10 @@ class ServerArgs:
     collect_tokens_histogram: bool = False
     prompt_tokens_buckets: Optional[List[str]] = None
     generation_tokens_buckets: Optional[List[str]] = None
+    gc_warning_threshold_secs: float = 0.0
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
-    gc_warning_threshold_secs: float = 0.0
     enable_trace: bool = False
     oltp_traces_endpoint: str = "localhost:4317"
 
@@ -317,7 +328,7 @@ class ServerArgs:
     ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
-    lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
+    lora_eviction_policy: str = "lru"
     lora_backend: str = "triton"
     max_lora_chunk_size: Optional[int] = 16
 
@@ -332,7 +343,6 @@ class ServerArgs:
     nsa_decode_backend: str = "fa3"
 
     # Speculative decoding
-    enable_beta_spec: bool = False
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
     speculative_draft_model_revision: Optional[str] = None
@@ -375,6 +385,8 @@ class ServerArgs:
     enable_expert_distribution_metrics: bool = False
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
+    elastic_ep_backend: Literal[None, "mooncake"] = None
+    mooncake_ib_device: Optional[str] = None
 
     # Mamba cache
     max_mamba_cache_size: Optional[int] = None
@@ -473,6 +485,7 @@ class ServerArgs:
     scheduler_recv_interval: int = 1
     numa_node: Optional[List[int]] = None
     enable_deterministic_inference: bool = False
+    rl_on_policy_target: Optional[str] = None
 
     # Dynamic batch tokenizer
     enable_dynamic_batch_tokenizer: bool = False
@@ -509,19 +522,6 @@ class ServerArgs:
     pdmux_config_path: Optional[str] = None
     sm_group_num: int = 8
 
-    def get_attention_backends(server_args):
-        prefill_attention_backend_str = (
-            server_args.prefill_attention_backend
-            if server_args.prefill_attention_backend
-            else server_args.attention_backend
-        )
-        decode_attention_backend_str = (
-            server_args.decode_attention_backend
-            if server_args.decode_attention_backend
-            else server_args.attention_backend
-        )
-        return prefill_attention_backend_str, decode_attention_backend_str
-
     def __post_init__(self):
         """
         Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
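
Callers that relied on the removed `get_attention_backends` helper can inline its fallback logic. A minimal sketch of the equivalent (the `or` form assumes the per-phase fields are either `None` or a non-empty string):

    def get_attention_backends(server_args):
        # Each phase falls back to the shared attention_backend setting.
        prefill = server_args.prefill_attention_backend or server_args.attention_backend
        decode = server_args.decode_attention_backend or server_args.attention_backend
        return prefill, decode
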
@@ -550,6 +550,9 @@ class ServerArgs:
         # Apply model-specific adjustments.
         self._handle_model_specific_adjustments()
 
+        # Handle Hicache settings.
+        self._handle_hicache()
+
         # Set kernel backends.
         self._handle_sampling_backend()
         self._handle_attention_backend_compatibility()
@@ -572,9 +575,6 @@ class ServerArgs:
         # Handle pipeline parallelism.
         self._handle_pipeline_parallelism()
 
-        # Handle Hicache settings.
-        self._handle_hicache()
-
         # Handle speculative decoding logic.
         self._handle_speculative_decoding()
 
@@ -614,22 +614,6 @@ class ServerArgs:
             )
             self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
 
-    def _handle_ktransformers_configs(self):
-        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
-            CompressedTensorsWNA16AMXEPMoEMethod,
-            override_config,
-        )
-
-        override_config(
-            CompressedTensorsWNA16AMXEPMoEMethod,
-            self.kt_num_gpu_experts,
-            self.kt_cpuinfer,
-            self.kt_threadpool_count,
-            self.kt_amx_weight_path,
-            self.kt_amx_method,
-            self.chunked_prefill_size,
-        )
-
     def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -684,7 +668,7 @@ class ServerArgs:
                 self.cuda_graph_max_bs = 64
         elif gpu_mem < 35 * 1024:
             # A10, 4090, 5090
-            # (chunked_prefill_size 2k, cuda_graph_max_bs
+            # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
             if self.chunked_prefill_size is None:
                 self.chunked_prefill_size = 2048
             if self.cuda_graph_max_bs is None:
@@ -692,7 +676,7 @@ class ServerArgs:
                 # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
                 # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
                 if self.tp_size < 4:
-                    self.cuda_graph_max_bs =
+                    self.cuda_graph_max_bs = 24
                 else:
                     self.cuda_graph_max_bs = 80
         elif gpu_mem < 60 * 1024:
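
For 24 GB-class GPUs this tier now resolves to chunked_prefill_size 2048 and cuda_graph_max_bs 24 below TP4 (80 at TP4 and above). A standalone paraphrase of just this tier (not sglang API; gpu_mem is in MiB, the other memory tiers are elided):

    def default_cuda_graph_max_bs(gpu_mem_mib: int, tp_size: int) -> int:
        if gpu_mem_mib < 35 * 1024:  # A10 / 4090 / 5090 class
            return 24 if tp_size < 4 else 80
        raise NotImplementedError("tiers above 35 GiB elided in this sketch")

    assert default_cuda_graph_max_bs(24 * 1024, tp_size=1) == 24
    assert default_cuda_graph_max_bs(24 * 1024, tp_size=4) == 80
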
@@ -800,11 +784,9 @@ class ServerArgs:
                 else 0.88
             )
 
-            #
-            #
-
-
-            model_config = ModelConfig.from_server_args(self)
+            # Multimodal models need more memory for the image processing,
+            # so we adjust the mem_fraction_static accordingly.
+            model_config = self.get_model_config()
             if model_config.is_multimodal:
                 self.adjust_mem_fraction_for_vlm(model_config)
 
@@ -892,14 +874,19 @@ class ServerArgs:
                 logger.info(
                     "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
                 )
-            if (
-                self.quantization == "modelopt_fp4"
-                and self.moe_runner_backend == "auto"
-            ):
+            if self.moe_runner_backend == "auto":
                 self.moe_runner_backend = "flashinfer_trtllm"
                 logger.info(
-                    "Use flashinfer_trtllm as
+                    "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
                 )
+                if self.quantization is None:
+                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
+                    # Because we need this condition for an assertion in
+                    # flashinfer_trtllm MoE runner backend.
+                    self.quantization = "fp8"
+                    logger.info(
+                        "Quantization not specified, default to fp8 for DeepSeek on sm100"
+                    )
 
         elif model_arch in ["GptOssForCausalLM"]:
             if (
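
Net effect of the sm100 DeepSeek branch: the flashinfer_trtllm default no longer requires `modelopt_fp4`, and an unset quantization now defaults to fp8. A paraphrase (not sglang API):

    def apply_sm100_deepseek_defaults(moe_runner_backend, quantization):
        # "auto" resolves to flashinfer_trtllm; a missing quantization
        # setting then defaults to DeepSeek's native fp8.
        if moe_runner_backend == "auto":
            moe_runner_backend = "flashinfer_trtllm"
            if quantization is None:
                quantization = "fp8"
        return moe_runner_backend, quantization

    assert apply_sm100_deepseek_defaults("auto", None) == ("flashinfer_trtllm", "fp8")
    assert apply_sm100_deepseek_defaults("auto", "modelopt_fp4")[1] == "modelopt_fp4"
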
@@ -966,7 +953,13 @@ class ServerArgs:
             "fa3",
             "aiter",
             "triton",
-        }, "fa3, aiter, or triton is required for Llama4 model"
+            "trtllm_mha",
+        }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
+        if is_sm100_supported() and self.attention_backend is None:
+            self.attention_backend = "trtllm_mha"
+            logger.warning(
+                "Use trtllm_mha as attention backend on sm100 for Llama4 model"
+            )
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -1052,6 +1045,67 @@ class ServerArgs:
             )
 
     def _handle_attention_backend_compatibility(self):
+        model_config = self.get_model_config()
+        use_mla_backend = self.use_mla_backend()
+
+        if self.prefill_attention_backend is not None and (
+            self.prefill_attention_backend == self.decode_attention_backend
+        ):  # override the default attention backend
+            self.attention_backend = self.prefill_attention_backend
+
+        # Pick the default attention backend if not specified
+        if self.attention_backend is None:
+            """
+            Auto select the fastest attention backend.
+
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
+            """
+
+            if not use_mla_backend:
+                # MHA architecture
+                if (
+                    is_hopper_with_cuda_12_3()
+                    and is_no_spec_infer_or_topk_one(self)
+                    and is_fa3_default_architecture(self.model_config.hf_config)
+                ):
+                    self.attention_backend = "fa3"
+                elif is_hip():
+                    self.attention_backend = "aiter"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+            else:
+                # MLA architecture
+                if is_hopper_with_cuda_12_3():
+                    self.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    self.attention_backend = "flashinfer"
+                elif is_hip():
+                    head_num = model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if head_num == 128 or head_num == 16:
+                        self.attention_backend = "aiter"
+                    else:
+                        self.attention_backend = "triton"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = "triton"
+
+            logger.warning(
+                f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
+            )
+
+        # Torch native and flex attention backends
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
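
The CUDA path of the new default selection can be condensed as below (a paraphrase, not sglang API; the boolean parameters stand in for the platform probes, and the HIP/NPU branches are elided):

    def pick_default_backend(use_mla: bool, hopper_cuda123: bool,
                             fa3_arch_ok: bool, sm100: bool,
                             has_flashinfer: bool) -> str:
        if not use_mla:
            # MHA models (e.g. Llama, Qwen): FA3 on Hopper when the
            # architecture and spec-decode settings allow it.
            if hopper_cuda123 and fa3_arch_ok:
                return "fa3"
            return "flashinfer" if has_flashinfer else "triton"
        # MLA models: FA3 on Hopper, flashinfer on Blackwell, else triton.
        if hopper_cuda123:
            return "fa3"
        if sm100:
            return "flashinfer"
        return "triton"

    assert pick_default_backend(True, False, False, True, True) == "flashinfer"
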
@@ -1067,12 +1121,7 @@ class ServerArgs:
             self.speculative_algorithm is None
         ), "Speculative decoding is currently not supported with Flex Attention backend"
 
-        if self.attention_backend == "ascend":
-            logger.warning(
-                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
-            )
-            self.page_size = 128
-
+        # Major NVIDIA platforms backends
         if (
             self.attention_backend == "flashmla"
             or self.decode_attention_backend == "flashmla"
@@ -1127,19 +1176,13 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.attention_backend == "dual_chunk_flash_attn":
+        if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
             logger.warning(
-                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
+                "FlashAttention3 only supports fp8_e4m3 if using FP8; "
+                "Setting attention backend to triton."
             )
-            self.enable_mixed_chunk = False
-            self.disable_radix_cache = True
+            self.attention_backend = "triton"
 
-        if self.attention_backend == "intel_xpu":
-            if self.page_size not in [32, 64, 128]:
-                logger.warning(
-                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
-                )
-                self.page_size = 128
         if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
             raise ValueError(
                 "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
         @@ -1150,6 +1193,66 @@ class ServerArgs: 
     | 
|
| 
       1150 
1193 
     | 
    
         
             
                        )
         
     | 
| 
       1151 
1194 
     | 
    
         
             
                        self.page_size = 128
         
     | 
| 
       1152 
1195 
     | 
    
         | 
| 
      
 1196 
     | 
    
         
            +
                    # AMD platforms backends
         
     | 
| 
      
 1197 
     | 
    
         
            +
                    if self.attention_backend == "aiter":
         
     | 
| 
      
 1198 
     | 
    
         
            +
                        if model_config.context_len > 8192:
         
     | 
| 
      
 1199 
     | 
    
         
            +
                            self.mem_fraction_static *= 0.90
         
     | 
| 
      
 1200 
     | 
    
         
            +
             
     | 
| 
      
 1201 
     | 
    
         
            +
                    # NPU platforms backends
         
     | 
| 
      
 1202 
     | 
    
         
            +
                    if is_npu() and self.attention_backend in ["ascend"]:
         
     | 
| 
      
 1203 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1204 
     | 
    
         
            +
                            "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
         
     | 
| 
      
 1205 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1206 
     | 
    
         
            +
                        self.page_size = 128
         
     | 
| 
      
 1207 
     | 
    
         
            +
             
     | 
| 
      
 1208 
     | 
    
         
            +
                    # Other platforms backends
         
     | 
| 
      
 1209 
     | 
    
         
            +
                    if (
         
     | 
| 
      
 1210 
     | 
    
         
            +
                        self.attention_backend == "intel_amx"
         
     | 
| 
      
 1211 
     | 
    
         
            +
                        and self.device == "cpu"
         
     | 
| 
      
 1212 
     | 
    
         
            +
                        and not cpu_has_amx_support()
         
     | 
| 
      
 1213 
     | 
    
         
            +
                    ):
         
     | 
| 
      
 1214 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1215 
     | 
    
         
            +
                            "The current platform does not support Intel AMX, will fallback to torch_native backend."
         
     | 
| 
      
 1216 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1217 
     | 
    
         
            +
                        self.attention_backend = "torch_native"
         
     | 
| 
      
 1218 
     | 
    
         
            +
             
     | 
| 
      
 1219 
     | 
    
         
            +
                    if (
         
     | 
| 
      
 1220 
     | 
    
         
            +
                        self.attention_backend == "intel_xpu"
         
     | 
| 
      
 1221 
     | 
    
         
            +
                        and self.device == "xpu"
         
     | 
| 
      
 1222 
     | 
    
         
            +
                        and not xpu_has_xmx_support()
         
     | 
| 
      
 1223 
     | 
    
         
+        ):
+            logger.warning(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            self.attention_backend = "triton"
+
+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
+
+        # Dual chunk flash attention backend
+        if (
+            getattr(model_config.hf_config, "dual_chunk_attention_config", None)
+            is not None
+        ):
+            if self.attention_backend is None:
+                self.attention_backend = "dual_chunk_flash_attn"
+                logger.info("Dual chunk attention is turned on by default.")
+            elif self.attention_backend != "dual_chunk_flash_attn":
+                raise ValueError(
+                    "Dual chunk attention is enabled, but attention backend is set to "
+                    f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
+                )
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_radix_cache = True
+
     def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1
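The intel_xpu branch above clamps an unsupported page_size and warns rather than raising. A minimal standalone sketch of the same clamp-and-warn pattern (the constant name SUPPORTED_XPU_PAGE_SIZES is hypothetical, not from the diff):

```python
import logging

logger = logging.getLogger(__name__)

SUPPORTED_XPU_PAGE_SIZES = (32, 64, 128)  # hypothetical constant name

def normalize_xpu_page_size(page_size: int) -> int:
    """Clamp page_size to a value the Intel XPU attention backend supports."""
    if page_size not in SUPPORTED_XPU_PAGE_SIZES:
        logger.warning(
            "Intel XPU attention backend only supports page_size of 32, 64 or 128, "
            "changing page_size from %s to 128.", page_size,
        )
        return 128
    return page_size

assert normalize_xpu_page_size(64) == 64
assert normalize_xpu_page_size(16) == 128
```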
@@ -1162,6 +1265,22 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+    def _handle_ktransformers_configs(self):
+        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            override_config,
+        )
+
+        override_config(
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            self.kt_num_gpu_experts,
+            self.kt_cpuinfer,
+            self.kt_threadpool_count,
+            self.kt_amx_weight_path,
+            self.kt_amx_method,
+            self.chunked_prefill_size,
+        )
+
     def _handle_data_parallelism(self):
         if self.dp_size == 1:
             self.enable_dp_attention = False
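`override_config` itself is not part of this diff; its internals live in compressed_tensors_moe. As a hedged sketch only, assuming it simply stashes the ktransformers CLI settings on the MoE method class so later instances can read them (the real implementation may differ):

```python
# Hypothetical sketch; the real override_config in compressed_tensors_moe
# may store these settings differently.
class CompressedTensorsWNA16AMXEPMoEMethodSketch:
    num_gpu_experts = None
    cpuinfer = None
    threadpool_count = None
    amx_weight_path = None
    amx_method = None
    chunked_prefill_size = None

def override_config(cls, num_gpu_experts, cpuinfer, threadpool_count,
                    amx_weight_path, amx_method, chunked_prefill_size):
    # Attach the CLI-provided ktransformers settings to the method class
    # so every instance created afterwards sees them.
    cls.num_gpu_experts = num_gpu_experts
    cls.cpuinfer = cpuinfer
    cls.threadpool_count = threadpool_count
    cls.amx_weight_path = amx_weight_path
    cls.amx_method = amx_method
    cls.chunked_prefill_size = chunked_prefill_size

override_config(CompressedTensorsWNA16AMXEPMoEMethodSketch, 8, 16, 2,
                "/path/to/amx_weights", "AMXINT4", 8192)
```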
@@ -1277,6 +1396,24 @@ class ServerArgs:
                 "Page first direct layout only support direct io backend"
             )
 
+        if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
+            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
+            if self.decode_attention_backend is None:
+                if not self.use_mla_backend():
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+                else:
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_sm100_supported() else "triton"
+                    )
+            elif self.decode_attention_backend == "fa3":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
+                    "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
+                )
+
     def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
             self.speculative_algorithm = "EAGLE"
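The hierarchical-cache fix above picks a decode backend that avoids FA3, which conflicts with the HiCache kernel I/O path. The same decision table as a pure function, directly mirroring the diff's logic (the boolean parameters stand in for the real capability checks imported elsewhere in server_args.py):

```python
def pick_decode_backend(use_mla: bool, flashinfer_available: bool,
                        sm100: bool) -> str:
    """Mirror of the fallback logic above: FA3 is never chosen here
    because it is incompatible with the HiCache kernel backend."""
    if not use_mla:
        return "flashinfer" if flashinfer_available else "triton"
    return "flashinfer" if sm100 else "triton"

assert pick_decode_backend(use_mla=False, flashinfer_available=True, sm100=False) == "flashinfer"
assert pick_decode_backend(use_mla=True, flashinfer_available=True, sm100=False) == "triton"
```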
@@ -1287,22 +1424,26 @@ class ServerArgs:
                 raise ValueError(
                     "Currently standalone speculative decoding does not support dp attention."
                 )
+
             if self.max_running_requests is None:
                 self.max_running_requests = 48
                 logger.warning(
-                    "Max running requests is reset to 48 for speculative decoding."
+                    "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
                 )
 
-            if
+            if (
+                self.speculative_algorithm == "EAGLE"
+                and envs.SGLANG_ENABLE_SPEC_V2.get()
+            ):
                 self.disable_overlap_schedule = False
                 logger.warning(
                     "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
                 )
 
-            if not
+            if not envs.SGLANG_ENABLE_SPEC_V2.get():
                 self.disable_overlap_schedule = True
                 logger.warning(
-                    "Overlap scheduler is disabled because of using eagle3
+                    "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
                 )
 
         if self.enable_mixed_chunk:
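`envs.SGLANG_ENABLE_SPEC_V2.get()` now gates whether the overlap scheduler stays on under EAGLE speculative decoding; the `envs` helper itself is outside this diff. A minimal sketch of such a typed boolean environment flag, assuming the usual truthy-string convention:

```python
import os

class EnvBool:
    """Sketch of a boolean env flag like envs.SGLANG_ENABLE_SPEC_V2 (assumed shape)."""
    def __init__(self, name: str, default: bool = False):
        self.name = name
        self.default = default

    def get(self) -> bool:
        raw = os.environ.get(self.name)
        if raw is None:
            return self.default
        return raw.lower() in ("1", "true", "yes")

SGLANG_ENABLE_SPEC_V2 = EnvBool("SGLANG_ENABLE_SPEC_V2")
print(SGLANG_ENABLE_SPEC_V2.get())  # False unless the variable is set
```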
@@ -1371,8 +1512,13 @@ class ServerArgs:
                 raise ValueError(
                     "Ngram speculative decoding only supports CUDA device."
                 )
+
             if self.max_running_requests is None:
                 self.max_running_requests = 48
+                logger.warning(
+                    "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
+                )
+
             self.disable_overlap_schedule = True
             self.enable_mixed_chunk = False
             self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1515,19 +1661,44 @@ class ServerArgs:
         )
 
     def _handle_deterministic_inference(self):
+        if self.rl_on_policy_target is not None:
+            logger.warning(
+                "Enable deterministic inference because of rl_on_policy_target."
+            )
+            self.enable_deterministic_inference = True
+            # TODO remove this environment variable as a whole
+            os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
+
         if self.enable_deterministic_inference:
             # Check sampling backend
             self.sampling_backend = "pytorch"
             logger.warning(
                 "Sampling backend is set to pytorch for deterministic inference."
             )
+            is_deepseek_model = False
+            if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+                try:
+                    hf_config = self.get_hf_config()
+                    model_arch = hf_config.architectures[0]
+                    is_deepseek_model = model_arch in [
+                        "DeepseekV2ForCausalLM",
+                        "DeepseekV3ForCausalLM",
+                        "DeepseekV32ForCausalLM",
+                    ]
+                except Exception:
+                    pass
 
             # Check attention backend
             if self.attention_backend is None:
                 # User didn't specify attention backend, fallback based on GPU architecture
                 if is_sm100_supported() or is_sm120_supported():
                     # Blackwell and newer architectures
-                    self.attention_backend = "flashinfer"
+                    if is_deepseek_model:
+                        # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
+                        self.attention_backend = "triton"
+                    else:
+                        # fallback to flashinfer on Blackwell for non-DeepSeek models
+                        self.attention_backend = "flashinfer"
                 else:
                     # Hopper (SM90) and older architectures
                     self.attention_backend = "fa3"
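The deterministic-inference path now sniffs the model architecture from the HF config before choosing a backend. A standalone reproduction using transformers directly (ServerArgs.get_hf_config wraps something similar; the exact AutoConfig call here is an assumption):

```python
from transformers import AutoConfig

DEEPSEEK_ARCHS = {
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
    "DeepseekV32ForCausalLM",
}

def is_deepseek(model_path: str) -> bool:
    try:
        hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        return hf_config.architectures[0] in DEEPSEEK_ARCHS
    except Exception:
        # Mirror the diff: any failure to read the config is treated as
        # "not a DeepSeek model" rather than an error.
        return False
```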
@@ -1542,8 +1713,13 @@ class ServerArgs:
                 f"but you explicitly specified '{self.attention_backend}'."
             )
 
-        # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
         if self.attention_backend not in ["fa3", "triton"]:
+            if is_deepseek_model:
+                raise ValueError(
+                    f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
+                )
+
+            # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
             self.disable_radix_cache = True
             logger.warning(
                 f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1743,6 +1919,18 @@ class ServerArgs:
             "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
             "default to 1.0, which may cause accuracy issues. ",
         )
+        parser.add_argument(
+            "--kv-cache-dtype",
+            type=str,
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+        )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
+        )
         parser.add_argument(
             "--modelopt-quant",
             type=str,
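This hunk and the next one together relocate --kv-cache-dtype and --enable-fp32-lm-head within the argument builder; the flags themselves are unchanged. A quick standalone check of how argparse enforces the choices list (the default "auto" matches the help text):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--kv-cache-dtype",
    type=str,
    default="auto",
    choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
)
args = parser.parse_args(["--kv-cache-dtype", "fp8_e5m2"])
assert args.kv_cache_dtype == "fp8_e5m2"
# parser.parse_args(["--kv-cache-dtype", "int8"]) would exit with an
# error, since "int8" is not in the choices list.
```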
@@ -1782,18 +1970,6 @@ class ServerArgs:
             "This is useful for development and prototyping. For production, it's recommended "
             "to use separate quantization and deployment steps.",
         )
-        parser.add_argument(
-            "--kv-cache-dtype",
-            type=str,
-            default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
-            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
-        )
-        parser.add_argument(
-            "--enable-fp32-lm-head",
-            action="store_true",
-            help="If set, the LM head outputs (logits) are in FP32.",
-        )
 
         # Memory and scheduling
         parser.add_argument(
@@ -1898,7 +2074,14 @@ class ServerArgs:
         parser.add_argument(
             "--disable-hybrid-swa-memory",
             action="store_true",
-            help="Disable the hybrid SWA memory.",
+            help="Disable the hybrid SWA memory pool.",
+        )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
         )
 
         # Runtime options
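--radix-eviction-policy (moved here from the hierarchical-cache group later in the diff) chooses between LRU and LFU eviction for radix-tree nodes. The two orderings differ as in this toy cache, which is only an illustration of the policies, not SGLang's radix tree:

```python
from collections import Counter, OrderedDict

class ToyCache:
    def __init__(self, capacity: int, policy: str = "lru"):
        self.capacity, self.policy = capacity, policy
        self.data = OrderedDict()   # recency order for LRU
        self.freq = Counter()       # access counts for LFU

    def get(self, key):
        if key in self.data:
            self.data.move_to_end(key)  # refresh recency
            self.freq[key] += 1
        return self.data.get(key)

    def put(self, key, value):
        if key not in self.data and len(self.data) >= self.capacity:
            if self.policy == "lru":
                victim = next(iter(self.data))  # least recently used
            else:
                victim = min(self.data, key=lambda k: self.freq[k])  # least frequently used
            del self.data[victim]
        self.data[key] = value
        self.data.move_to_end(key)
        self.freq[key] += 1
```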
@@ -1908,21 +2091,6 @@ class ServerArgs:
             default=ServerArgs.device,
             help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
-        parser.add_argument(
-            "--elastic-ep-backend",
-            type=str,
-            default=ServerArgs.elastic_ep_backend,
-            choices=["none", "mooncake"],
-            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
-        )
-        parser.add_argument(
-            "--mooncake-ib-device",
-            type=str,
-            default=ServerArgs.mooncake_ib_device,
-            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
-            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
-            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
-        )
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -2210,6 +2378,12 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )
         parser.add_argument(
             "--sampling-defaults",
             type=str,
@@ -2220,12 +2394,6 @@ class ServerArgs:
             "'model' uses the model's generation_config.json to get the recommended "
             "sampling parameters if available. Default is 'model'.",
         )
-        parser.add_argument(
-            "--tool-server",
-            type=str,
-            default=None,
-            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
-        )
 
         # Data parallelism
         parser.add_argument(
@@ -2332,7 +2500,7 @@ class ServerArgs:
         parser.add_argument(
             "--lora-eviction-policy",
             type=str,
-            default=
+            default=ServerArgs.lora_eviction_policy,
             choices=["lru", "fifo"],
             help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
         )
@@ -2408,7 +2576,6 @@ class ServerArgs:
         )
 
         # Speculative decoding
-        parser.add_argument("--enable-beta-spec", action="store_true")
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
@@ -2644,6 +2811,21 @@ class ServerArgs:
             default=ServerArgs.moe_dense_tp_size,
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
+        parser.add_argument(
+            "--elastic-ep-backend",
+            type=str,
+            default=ServerArgs.elastic_ep_backend,
+            choices=["none", "mooncake"],
+            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
+        )
+        parser.add_argument(
+            "--mooncake-ib-device",
+            type=str,
+            default=ServerArgs.mooncake_ib_device,
+            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
+            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
+        )
 
         # Mamba Cache
         parser.add_argument(
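The --elastic-ep-backend and --mooncake-ib-device flags are moved here (into the MoE group) from the runtime-options group. --mooncake-ib-device accepts a comma-separated device list; a sketch of the split-or-autodetect handling its help text describes (the autodetect helper below is hypothetical):

```python
from typing import List, Optional

def resolve_ib_devices(mooncake_ib_device: Optional[str]) -> List[str]:
    if mooncake_ib_device is None:
        return autodetect_ib_devices()  # hypothetical: probe RDMA devices
    return [d.strip() for d in mooncake_ib_device.split(",") if d.strip()]

def autodetect_ib_devices() -> List[str]:
    # Placeholder: a real detection would enumerate /sys/class/infiniband.
    return []

assert resolve_ib_devices("mlx5_0,mlx5_1") == ["mlx5_0", "mlx5_1"]
```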
@@ -2691,13 +2873,6 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
-        parser.add_argument(
-            "--radix-eviction-policy",
-            type=str,
-            choices=RADIX_EVICTION_POLICY_CHOICES,
-            default=ServerArgs.radix_eviction_policy,
-            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
-        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -2805,7 +2980,7 @@ class ServerArgs:
             "--ds-sparse-decode-threshold",
             type=int,
             default=ServerArgs.ds_sparse_decode_threshold,
-            help="The
+            help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
         )
 
         # Offloading
@@ -3111,26 +3286,20 @@ class ServerArgs:
             nargs="+",
             help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
         )
-
-        # Debug tensor dumps
-        parser.add_argument(
-            "--debug-tensor-dump-output-folder",
-            type=str,
-            default=ServerArgs.debug_tensor_dump_output_folder,
-            help="The output folder for dumping tensors.",
-        )
         parser.add_argument(
-            "--debug-tensor-dump-input-file",
-            type=str,
-            default=ServerArgs.debug_tensor_dump_input_file,
-            help="The input filename for dumping tensors",
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
         )
         parser.add_argument(
-            "--debug-tensor-dump-inject",
+            "--rl-on-policy-target",
             type=str,
-            default=ServerArgs.debug_tensor_dump_inject,
-            help="Inject the outputs from jax as the input of every layer.",
+            default=ServerArgs.rl_on_policy_target,
+            choices=["fsdp"],
+            help="The training system that SGLang needs to match for true on-policy.",
         )
+
+        # Dynamic batch tokenizer
         parser.add_argument(
             "--enable-dynamic-batch-tokenizer",
             action="store_true",
@@ -3149,6 +3318,26 @@ class ServerArgs:
             help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
         )
 
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
@@ -3258,7 +3447,6 @@ class ServerArgs:
             default=None,
             help="The path of the PD-Multiplexing config file.",
         )
-
         parser.add_argument(
             "--sm-group-num",
             type=int,
@@ -3266,50 +3454,6 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )
 
-        # For deterministic inference
-        parser.add_argument(
-            "--enable-deterministic-inference",
-            action="store_true",
-            help="Enable deterministic inference mode with batch invariant ops.",
-        )
-
-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-ep-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
-        )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutlass-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutedsl-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-trtllm-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
-        )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
-        )
-
         # Configuration file support
         parser.add_argument(
             "--config",
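The deprecated --enable-*-moe shims above are deleted outright in this release. For reference, a minimal sketch of the DeprecatedAction pattern they relied on (SGLang defines its own class elsewhere, which may differ from this sketch):

```python
import argparse

class DeprecatedAction(argparse.Action):
    """Sketch: surface the migration hint instead of storing a value."""
    def __init__(self, option_strings, dest, nargs=0, **kwargs):
        super().__init__(option_strings, dest, nargs=nargs, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        raise ValueError(self.help)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-ep-moe",
    action=DeprecatedAction,
    help="--enable-ep-moe is deprecated; set --ep-size to --tp-size instead.",
)
# parser.parse_args(["--enable-ep-moe"]) now raises with the hint above.
```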
@@ -3344,6 +3488,34 @@ class ServerArgs:
         )
         return hf_config

+    def get_model_config(self):
+        # Lazy init to avoid circular import
+        from sglang.srt.configs.model_config import ModelConfig
+
+        if hasattr(self, "model_config"):
+            return self.model_config
+        self.model_config = ModelConfig.from_server_args(self)
+        return self.model_config
+
+    def get_attention_backends(self):
+        prefill_attention_backend_str = (
+            self.prefill_attention_backend
+            if self.prefill_attention_backend
+            else self.attention_backend
+        )
+        decode_attention_backend_str = (
+            self.decode_attention_backend
+            if self.decode_attention_backend
+            else self.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
+
+    def use_mla_backend(self):
+        from sglang.srt.configs.model_config import AttentionArch
+
+        model_config = self.get_model_config()
+        return model_config.attention_arch == AttentionArch.MLA
+
     def check_server_args(self):
         # Check parallel size constraints
         assert (
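Two small patterns in the new helpers are worth calling out: `get_model_config` caches its result on the instance so the `ModelConfig` is built at most once (and defers the import to dodge a circular dependency), while `get_attention_backends` lets a phase-specific override win and otherwise falls back to the shared `--attention-backend`. A standalone sketch of the same patterns, with stand-in names since `ModelConfig.from_server_args` is not reproduced in this diff:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Args:
    # Illustrative stand-ins for the ServerArgs fields the new helpers read.
    attention_backend: str = "flashinfer"
    prefill_attention_backend: Optional[str] = None
    decode_attention_backend: Optional[str] = None

    def get_model_config(self):
        # Lazy init: build once, cache on the instance, reuse afterwards.
        if hasattr(self, "model_config"):
            return self.model_config
        self.model_config = {"built_from": id(self)}  # placeholder for ModelConfig
        return self.model_config

    def get_attention_backends(self):
        # A phase-specific override wins; otherwise use the shared backend.
        prefill = self.prefill_attention_backend or self.attention_backend
        decode = self.decode_attention_backend or self.attention_backend
        return prefill, decode


args = Args(decode_attention_backend="fa3")
assert args.get_attention_backends() == ("flashinfer", "fa3")
assert args.get_model_config() is args.get_model_config()  # cached
```

`use_mla_backend` then builds on the cached config, so callers can query the attention architecture repeatedly without re-parsing the Hugging Face config.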
@@ -3721,6 +3893,13 @@ class PortArgs:
         else:
             nccl_port = server_args.nccl_port

+        if server_args.tokenizer_worker_num > 1:
+            tokenizer_worker_ipc_name = (
+                f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+            )
+        else:
+            tokenizer_worker_ipc_name = None
+
         if not server_args.enable_dp_attention:
             # Normal case, use IPC within a single node
             return PortArgs(
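`tempfile.NamedTemporaryFile(delete=False)` is used here only to mint a unique filesystem path for a ZeroMQ `ipc://` endpoint, mirroring the other `*_ipc_name` fields. A minimal sketch of that trick, assuming pyzmq and a PUSH/PULL pairing chosen purely for illustration (sglang's actual socket types on this channel may differ):

```python
import os
import tempfile

import zmq

# Mint a unique path, then drop the placeholder file so the bind below
# creates a fresh unix-domain socket at that location.
path = tempfile.NamedTemporaryFile(delete=False).name
os.unlink(path)
ipc_name = f"ipc://{path}"

ctx = zmq.Context()
receiver = ctx.socket(zmq.PULL)
receiver.bind(ipc_name)

sender = ctx.socket(zmq.PUSH)
sender.connect(ipc_name)

sender.send(b"ping")
assert receiver.recv() == b"ping"
```

Computing the name up front, and only when `tokenizer_worker_num > 1`, means single-tokenizer deployments pay no cost and both branches below receive the same value.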
@@ -3730,7 +3909,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
-            tokenizer_worker_ipc_name=None,
+            tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
         )
     else:
         # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3764,7 +3943,7 @@ class PortArgs:
             nccl_port=nccl_port,
             rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
             metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
-            tokenizer_worker_ipc_name=None,
+            tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
         )
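Both `return PortArgs(...)` call sites now receive the same precomputed `tokenizer_worker_ipc_name`, so the IPC and TCP branches cannot drift apart. Reduced to a toy (all names below are hypothetical, not sglang's), the shape of the refactor is:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Ports:
    endpoint: str
    tokenizer_worker_ipc_name: Optional[str]


def init_new(tokenizer_worker_num: int, use_tcp: bool) -> Ports:
    # Compute the shared field once, before branching, as the new code does.
    tokenizer_worker_ipc_name = (
        "ipc:///tmp/tokenizer-worker" if tokenizer_worker_num > 1 else None
    )
    if not use_tcp:
        return Ports("ipc:///tmp/main", tokenizer_worker_ipc_name)
    return Ports("tcp://10.0.0.1:1234", tokenizer_worker_ipc_name)


assert init_new(1, use_tcp=False).tokenizer_worker_ipc_name is None
assert init_new(4, use_tcp=True).tokenizer_worker_ipc_name is not None
```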