sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
 - sglang/bench_one_batch.py +149 -34
 - sglang/bench_serving.py +73 -14
 - sglang/compile_deep_gemm.py +13 -7
 - sglang/launch_server.py +2 -0
 - sglang/srt/batch_invariant_ops/__init__.py +2 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
 - sglang/srt/checkpoint_engine/__init__.py +9 -0
 - sglang/srt/checkpoint_engine/update.py +317 -0
 - sglang/srt/compilation/backend.py +1 -1
 - sglang/srt/configs/__init__.py +2 -0
 - sglang/srt/configs/deepseek_ocr.py +542 -10
 - sglang/srt/configs/deepseekvl2.py +95 -194
 - sglang/srt/configs/kimi_linear.py +160 -0
 - sglang/srt/configs/mamba_utils.py +66 -0
 - sglang/srt/configs/model_config.py +30 -7
 - sglang/srt/constants.py +7 -0
 - sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
 - sglang/srt/disaggregation/decode.py +34 -6
 - sglang/srt/disaggregation/nixl/conn.py +2 -2
 - sglang/srt/disaggregation/prefill.py +25 -3
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
 - sglang/srt/distributed/parallel_state.py +9 -12
 - sglang/srt/entrypoints/engine.py +31 -20
 - sglang/srt/entrypoints/grpc_server.py +0 -1
 - sglang/srt/entrypoints/http_server.py +94 -94
 - sglang/srt/entrypoints/openai/protocol.py +7 -1
 - sglang/srt/entrypoints/openai/serving_chat.py +42 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +10 -0
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/environ.py +23 -2
 - sglang/srt/eplb/expert_distribution.py +64 -1
 - sglang/srt/eplb/expert_location.py +106 -36
 - sglang/srt/function_call/function_call_parser.py +2 -0
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/grpc/compile_proto.py +3 -0
 - sglang/srt/layers/activation.py +6 -0
 - sglang/srt/layers/attention/ascend_backend.py +233 -5
 - sglang/srt/layers/attention/attention_registry.py +3 -0
 - sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
 - sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
 - sglang/srt/layers/attention/fla/kda.py +1359 -0
 - sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
 - sglang/srt/layers/attention/flashattention_backend.py +19 -8
 - sglang/srt/layers/attention/flashinfer_backend.py +10 -1
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
 - sglang/srt/layers/attention/flashmla_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
 - sglang/srt/layers/attention/mamba/mamba.py +20 -11
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
 - sglang/srt/layers/attention/nsa/transform_index.py +1 -1
 - sglang/srt/layers/attention/nsa_backend.py +157 -23
 - sglang/srt/layers/attention/triton_backend.py +4 -1
 - sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
 - sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
 - sglang/srt/layers/attention/utils.py +78 -0
 - sglang/srt/layers/communicator.py +24 -1
 - sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
 - sglang/srt/layers/layernorm.py +35 -6
 - sglang/srt/layers/logits_processor.py +9 -20
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
 - sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
 - sglang/srt/layers/moe/ep_moe/layer.py +78 -289
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
 - sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
 - sglang/srt/layers/moe/topk.py +35 -10
 - sglang/srt/layers/moe/utils.py +3 -4
 - sglang/srt/layers/pooler.py +21 -2
 - sglang/srt/layers/quantization/__init__.py +13 -84
 - sglang/srt/layers/quantization/auto_round.py +394 -0
 - sglang/srt/layers/quantization/awq.py +0 -3
 - sglang/srt/layers/quantization/base_config.py +7 -0
 - sglang/srt/layers/quantization/fp8.py +68 -63
 - sglang/srt/layers/quantization/fp8_kernel.py +1 -1
 - sglang/srt/layers/quantization/fp8_utils.py +2 -2
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +168 -11
 - sglang/srt/layers/quantization/mxfp4.py +30 -38
 - sglang/srt/layers/quantization/unquant.py +23 -45
 - sglang/srt/layers/quantization/w4afp8.py +38 -2
 - sglang/srt/layers/radix_attention.py +5 -2
 - sglang/srt/layers/rotary_embedding.py +130 -46
 - sglang/srt/layers/sampler.py +12 -1
 - sglang/srt/lora/lora_registry.py +9 -0
 - sglang/srt/managers/async_mm_data_processor.py +122 -0
 - sglang/srt/managers/data_parallel_controller.py +30 -3
 - sglang/srt/managers/detokenizer_manager.py +3 -0
 - sglang/srt/managers/io_struct.py +29 -4
 - sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
 - sglang/srt/managers/schedule_batch.py +74 -15
 - sglang/srt/managers/scheduler.py +185 -144
 - sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
 - sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
 - sglang/srt/managers/scheduler_pp_mixin.py +7 -2
 - sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
 - sglang/srt/managers/session_controller.py +6 -5
 - sglang/srt/managers/tokenizer_manager.py +165 -78
 - sglang/srt/managers/tp_worker.py +24 -1
 - sglang/srt/mem_cache/base_prefix_cache.py +23 -4
 - sglang/srt/mem_cache/common.py +1 -0
 - sglang/srt/mem_cache/hicache_storage.py +7 -1
 - sglang/srt/mem_cache/memory_pool.py +253 -57
 - sglang/srt/mem_cache/memory_pool_host.py +12 -5
 - sglang/srt/mem_cache/radix_cache.py +4 -0
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
 - sglang/srt/metrics/collector.py +46 -3
 - sglang/srt/model_executor/cuda_graph_runner.py +15 -3
 - sglang/srt/model_executor/forward_batch_info.py +55 -14
 - sglang/srt/model_executor/model_runner.py +77 -170
 - sglang/srt/model_executor/npu_graph_runner.py +7 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
 - sglang/srt/model_loader/weight_utils.py +1 -1
 - sglang/srt/models/bailing_moe.py +9 -2
 - sglang/srt/models/deepseek_nextn.py +11 -2
 - sglang/srt/models/deepseek_v2.py +296 -78
 - sglang/srt/models/glm4.py +391 -77
 - sglang/srt/models/glm4_moe.py +322 -354
 - sglang/srt/models/glm4_moe_nextn.py +4 -14
 - sglang/srt/models/glm4v.py +196 -55
 - sglang/srt/models/glm4v_moe.py +29 -197
 - sglang/srt/models/gpt_oss.py +1 -10
 - sglang/srt/models/kimi_linear.py +678 -0
 - sglang/srt/models/llama4.py +1 -1
 - sglang/srt/models/llama_eagle3.py +11 -1
 - sglang/srt/models/longcat_flash.py +2 -2
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/qwen2.py +23 -2
 - sglang/srt/models/qwen2_moe.py +30 -15
 - sglang/srt/models/qwen3.py +35 -5
 - sglang/srt/models/qwen3_moe.py +18 -12
 - sglang/srt/models/qwen3_next.py +7 -0
 - sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
 - sglang/srt/multimodal/processors/base_processor.py +1 -0
 - sglang/srt/multimodal/processors/glm4v.py +1 -1
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
 - sglang/srt/multiplex/multiplexing_mixin.py +209 -0
 - sglang/srt/multiplex/pdmux_context.py +164 -0
 - sglang/srt/parser/conversation.py +7 -1
 - sglang/srt/parser/reasoning_parser.py +28 -1
 - sglang/srt/sampling/custom_logit_processor.py +67 -1
 - sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
 - sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
 - sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
 - sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
 - sglang/srt/server_args.py +459 -199
 - sglang/srt/single_batch_overlap.py +2 -4
 - sglang/srt/speculative/draft_utils.py +16 -0
 - sglang/srt/speculative/eagle_info.py +42 -36
 - sglang/srt/speculative/eagle_info_v2.py +68 -25
 - sglang/srt/speculative/eagle_utils.py +261 -16
 - sglang/srt/speculative/eagle_worker.py +11 -3
 - sglang/srt/speculative/eagle_worker_v2.py +15 -9
 - sglang/srt/speculative/spec_info.py +305 -31
 - sglang/srt/speculative/spec_utils.py +44 -8
 - sglang/srt/tracing/trace.py +121 -12
 - sglang/srt/utils/common.py +142 -74
 - sglang/srt/utils/hf_transformers_utils.py +38 -12
 - sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
 - sglang/test/kits/radix_cache_server_kit.py +50 -0
 - sglang/test/runners.py +31 -7
 - sglang/test/simple_eval_common.py +5 -3
 - sglang/test/simple_eval_humaneval.py +1 -0
 - sglang/test/simple_eval_math.py +1 -0
 - sglang/test/simple_eval_mmlu.py +1 -0
 - sglang/test/simple_eval_mmmu_vlm.py +1 -0
 - sglang/test/test_deterministic.py +235 -12
 - sglang/test/test_deterministic_utils.py +2 -1
 - sglang/test/test_utils.py +7 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
 - sglang/srt/models/vila.py +0 -306
 - /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
 
    
        sglang/srt/server_args.py
    CHANGED
    
    | 
         @@ -27,19 +27,25 @@ from typing import Dict, List, Literal, Optional, Union 
     | 
|
| 
       27 
27 
     | 
    
         
             
            import orjson
         
     | 
| 
       28 
28 
     | 
    
         | 
| 
       29 
29 
     | 
    
         
             
            from sglang.srt.connector import ConnectorType
         
     | 
| 
      
 30 
     | 
    
         
            +
            from sglang.srt.environ import envs
         
     | 
| 
       30 
31 
     | 
    
         
             
            from sglang.srt.function_call.function_call_parser import FunctionCallParser
         
     | 
| 
       31 
32 
     | 
    
         
             
            from sglang.srt.lora.lora_registry import LoRARef
         
     | 
| 
       32 
33 
     | 
    
         
             
            from sglang.srt.parser.reasoning_parser import ReasoningParser
         
     | 
| 
       33 
     | 
    
         
            -
            from sglang.srt.utils import (
         
     | 
| 
      
 34 
     | 
    
         
            +
            from sglang.srt.utils.common import (
         
     | 
| 
       34 
35 
     | 
    
         
             
                LORA_TARGET_ALL_MODULES,
         
     | 
| 
       35 
36 
     | 
    
         
             
                SUPPORTED_LORA_TARGET_MODULES,
         
     | 
| 
       36 
37 
     | 
    
         
             
                configure_ipv6,
         
     | 
| 
      
 38 
     | 
    
         
            +
                cpu_has_amx_support,
         
     | 
| 
       37 
39 
     | 
    
         
             
                get_device,
         
     | 
| 
       38 
40 
     | 
    
         
             
                get_device_memory_capacity,
         
     | 
| 
       39 
41 
     | 
    
         
             
                get_device_sm,
         
     | 
| 
      
 42 
     | 
    
         
            +
                is_blackwell_supported,
         
     | 
| 
       40 
43 
     | 
    
         
             
                is_cuda,
         
     | 
| 
      
 44 
     | 
    
         
            +
                is_fa3_default_architecture,
         
     | 
| 
       41 
45 
     | 
    
         
             
                is_flashinfer_available,
         
     | 
| 
       42 
46 
     | 
    
         
             
                is_hip,
         
     | 
| 
      
 47 
     | 
    
         
            +
                is_hopper_with_cuda_12_3,
         
     | 
| 
      
 48 
     | 
    
         
            +
                is_no_spec_infer_or_topk_one,
         
     | 
| 
       43 
49 
     | 
    
         
             
                is_npu,
         
     | 
| 
       44 
50 
     | 
    
         
             
                is_port_available,
         
     | 
| 
       45 
51 
     | 
    
         
             
                is_remote_url,
         
     | 
| 
         @@ -51,6 +57,7 @@ from sglang.srt.utils import ( 
     | 
|
| 
       51 
57 
     | 
    
         
             
                json_list_type,
         
     | 
| 
       52 
58 
     | 
    
         
             
                nullable_str,
         
     | 
| 
       53 
59 
     | 
    
         
             
                parse_connector_type,
         
     | 
| 
      
 60 
     | 
    
         
            +
                xpu_has_xmx_support,
         
     | 
| 
       54 
61 
     | 
    
         
             
            )
         
     | 
| 
       55 
62 
     | 
    
         
             
            from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
         
     | 
| 
       56 
63 
     | 
    
         
             
            from sglang.utils import is_in_ci
         
     | 
| 
         @@ -92,6 +99,7 @@ QUANTIZATION_CHOICES = [ 
     | 
|
| 
       92 
99 
     | 
    
         
             
                "qoq",
         
     | 
| 
       93 
100 
     | 
    
         
             
                "w4afp8",
         
     | 
| 
       94 
101 
     | 
    
         
             
                "mxfp4",
         
     | 
| 
      
 102 
     | 
    
         
            +
                "auto-round",
         
     | 
| 
       95 
103 
     | 
    
         
             
                "compressed-tensors",  # for Ktransformers
         
     | 
| 
       96 
104 
     | 
    
         
             
            ]
         
     | 
| 
       97 
105 
     | 
    
         | 
| 
         @@ -127,9 +135,18 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"] 
     | 
|
| 
       127 
135 
     | 
    
         | 
| 
       128 
136 
     | 
    
         
             
            DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
         
     | 
| 
       129 
137 
     | 
    
         | 
| 
      
 138 
     | 
    
         
            +
            RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND = ["fa3", "triton"]
         
     | 
| 
      
 139 
     | 
    
         
            +
             
     | 
| 
       130 
140 
     | 
    
         
             
            DEFAULT_LORA_EVICTION_POLICY = "lru"
         
     | 
| 
       131 
141 
     | 
    
         | 
| 
       132 
     | 
    
         
            -
            NSA_CHOICES = [ 
     | 
| 
      
 142 
     | 
    
         
            +
            NSA_CHOICES = [
         
     | 
| 
      
 143 
     | 
    
         
            +
                "flashmla_sparse",
         
     | 
| 
      
 144 
     | 
    
         
            +
                "flashmla_kv",
         
     | 
| 
      
 145 
     | 
    
         
            +
                "flashmla_auto",
         
     | 
| 
      
 146 
     | 
    
         
            +
                "fa3",
         
     | 
| 
      
 147 
     | 
    
         
            +
                "tilelang",
         
     | 
| 
      
 148 
     | 
    
         
            +
                "aiter",
         
     | 
| 
      
 149 
     | 
    
         
            +
            ]
         
     | 
| 
       133 
150 
     | 
    
         | 
| 
       134 
151 
     | 
    
         
             
            RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
         
     | 
| 
       135 
152 
     | 
    
         | 
| 
         @@ -175,12 +192,25 @@ def add_deterministic_attention_backend_choices(choices): 
     | 
|
| 
       175 
192 
     | 
    
         
             
                DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
         
     | 
| 
       176 
193 
     | 
    
         | 
| 
       177 
194 
     | 
    
         | 
| 
      
 195 
     | 
    
         
            +
            def add_radix_supported_deterministic_attention_backend_choices(choices):
         
     | 
| 
      
 196 
     | 
    
         
            +
                RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND.extend(choices)
         
     | 
| 
      
 197 
     | 
    
         
            +
             
     | 
| 
      
 198 
     | 
    
         
            +
             
     | 
| 
       178 
199 
     | 
    
         
             
            def add_radix_eviction_policy_choices(choices):
         
     | 
| 
       179 
200 
     | 
    
         
             
                RADIX_EVICTION_POLICY_CHOICES.extend(choices)
         
     | 
| 
       180 
201 
     | 
    
         | 
| 
       181 
202 
     | 
    
         | 
| 
       182 
203 
     | 
    
         
             
            @dataclasses.dataclass
         
     | 
| 
       183 
204 
     | 
    
         
             
            class ServerArgs:
         
     | 
| 
      
 205 
     | 
    
         
            +
                """
         
     | 
| 
      
 206 
     | 
    
         
            +
                The arguments of the server.
         
     | 
| 
      
 207 
     | 
    
         
            +
             
     | 
| 
      
 208 
     | 
    
         
            +
                NOTE: When you add new arguments, please make sure the order
         
     | 
| 
      
 209 
     | 
    
         
            +
                in this class definition the same as the order in the the function
         
     | 
| 
      
 210 
     | 
    
         
            +
                `ServerArgs.add_cli_args`.
         
     | 
| 
      
 211 
     | 
    
         
            +
                Please follow the existing style to group the new arguments into related groups or create new groups.
         
     | 
| 
      
 212 
     | 
    
         
            +
                """
         
     | 
| 
      
 213 
     | 
    
         
            +
             
     | 
| 
       184 
214 
     | 
    
         
             
                # Model and tokenizer
         
     | 
| 
       185 
215 
     | 
    
         
             
                model_path: str
         
     | 
| 
       186 
216 
     | 
    
         
             
                tokenizer_path: Optional[str] = None
         
     | 
| 
         @@ -190,11 +220,6 @@ class ServerArgs: 
     | 
|
| 
       190 
220 
     | 
    
         
             
                load_format: str = "auto"
         
     | 
| 
       191 
221 
     | 
    
         
             
                model_loader_extra_config: str = "{}"
         
     | 
| 
       192 
222 
     | 
    
         
             
                trust_remote_code: bool = False
         
     | 
| 
       193 
     | 
    
         
            -
                modelopt_quant: Optional[Union[str, Dict]] = None
         
     | 
| 
       194 
     | 
    
         
            -
                modelopt_checkpoint_restore_path: Optional[str] = None
         
     | 
| 
       195 
     | 
    
         
            -
                modelopt_checkpoint_save_path: Optional[str] = None
         
     | 
| 
       196 
     | 
    
         
            -
                modelopt_export_path: Optional[str] = None
         
     | 
| 
       197 
     | 
    
         
            -
                quantize_and_serve: bool = False
         
     | 
| 
       198 
223 
     | 
    
         
             
                context_length: Optional[int] = None
         
     | 
| 
       199 
224 
     | 
    
         
             
                is_embedding: bool = False
         
     | 
| 
       200 
225 
     | 
    
         
             
                enable_multimodal: Optional[bool] = None
         
     | 
| 
         @@ -216,6 +241,11 @@ class ServerArgs: 
     | 
|
| 
       216 
241 
     | 
    
         
             
                quantization_param_path: Optional[str] = None
         
     | 
| 
       217 
242 
     | 
    
         
             
                kv_cache_dtype: str = "auto"
         
     | 
| 
       218 
243 
     | 
    
         
             
                enable_fp32_lm_head: bool = False
         
     | 
| 
      
 244 
     | 
    
         
            +
                modelopt_quant: Optional[Union[str, Dict]] = None
         
     | 
| 
      
 245 
     | 
    
         
            +
                modelopt_checkpoint_restore_path: Optional[str] = None
         
     | 
| 
      
 246 
     | 
    
         
            +
                modelopt_checkpoint_save_path: Optional[str] = None
         
     | 
| 
      
 247 
     | 
    
         
            +
                modelopt_export_path: Optional[str] = None
         
     | 
| 
      
 248 
     | 
    
         
            +
                quantize_and_serve: bool = False
         
     | 
| 
       219 
249 
     | 
    
         | 
| 
       220 
250 
     | 
    
         
             
                # Memory and scheduling
         
     | 
| 
       221 
251 
     | 
    
         
             
                mem_fraction_static: Optional[float] = None
         
     | 
| 
         @@ -238,8 +268,6 @@ class ServerArgs: 
     | 
|
| 
       238 
268 
     | 
    
         | 
| 
       239 
269 
     | 
    
         
             
                # Runtime options
         
     | 
| 
       240 
270 
     | 
    
         
             
                device: Optional[str] = None
         
     | 
| 
       241 
     | 
    
         
            -
                elastic_ep_backend: Literal[None, "mooncake"] = None
         
     | 
| 
       242 
     | 
    
         
            -
                mooncake_ib_device: Optional[str] = None
         
     | 
| 
       243 
271 
     | 
    
         
             
                tp_size: int = 1
         
     | 
| 
       244 
272 
     | 
    
         
             
                pp_size: int = 1
         
     | 
| 
       245 
273 
     | 
    
         
             
                pp_max_micro_batch_size: Optional[int] = None
         
     | 
| 
         @@ -272,12 +300,12 @@ class ServerArgs: 
     | 
|
| 
       272 
300 
     | 
    
         
             
                collect_tokens_histogram: bool = False
         
     | 
| 
       273 
301 
     | 
    
         
             
                prompt_tokens_buckets: Optional[List[str]] = None
         
     | 
| 
       274 
302 
     | 
    
         
             
                generation_tokens_buckets: Optional[List[str]] = None
         
     | 
| 
      
 303 
     | 
    
         
            +
                gc_warning_threshold_secs: float = 0.0
         
     | 
| 
       275 
304 
     | 
    
         
             
                decode_log_interval: int = 40
         
     | 
| 
       276 
305 
     | 
    
         
             
                enable_request_time_stats_logging: bool = False
         
     | 
| 
       277 
306 
     | 
    
         
             
                kv_events_config: Optional[str] = None
         
     | 
| 
       278 
     | 
    
         
            -
                gc_warning_threshold_secs: float = 0.0
         
     | 
| 
       279 
307 
     | 
    
         
             
                enable_trace: bool = False
         
     | 
| 
       280 
     | 
    
         
            -
                 
     | 
| 
      
 308 
     | 
    
         
            +
                otlp_traces_endpoint: str = "localhost:4317"
         
     | 
| 
       281 
309 
     | 
    
         | 
| 
       282 
310 
     | 
    
         
             
                # API related
         
     | 
| 
       283 
311 
     | 
    
         
             
                api_key: Optional[str] = None
         
     | 
| 
         @@ -317,8 +345,8 @@ class ServerArgs: 
     | 
|
| 
       317 
345 
     | 
    
         
             
                ] = None
         
     | 
| 
       318 
346 
     | 
    
         
             
                max_loaded_loras: Optional[int] = None
         
     | 
| 
       319 
347 
     | 
    
         
             
                max_loras_per_batch: int = 8
         
     | 
| 
       320 
     | 
    
         
            -
                lora_eviction_policy: str =  
     | 
| 
       321 
     | 
    
         
            -
                lora_backend: str = " 
     | 
| 
      
 348 
     | 
    
         
            +
                lora_eviction_policy: str = "lru"
         
     | 
| 
      
 349 
     | 
    
         
            +
                lora_backend: str = "csgmv"
         
     | 
| 
       322 
350 
     | 
    
         
             
                max_lora_chunk_size: Optional[int] = 16
         
     | 
| 
       323 
351 
     | 
    
         | 
| 
       324 
352 
     | 
    
         
             
                # Kernel backend
         
     | 
| 
         @@ -332,7 +360,6 @@ class ServerArgs: 
     | 
|
| 
       332 
360 
     | 
    
         
             
                nsa_decode_backend: str = "fa3"
         
     | 
| 
       333 
361 
     | 
    
         | 
| 
       334 
362 
     | 
    
         
             
                # Speculative decoding
         
     | 
| 
       335 
     | 
    
         
            -
                enable_beta_spec: bool = False
         
     | 
| 
       336 
363 
     | 
    
         
             
                speculative_algorithm: Optional[str] = None
         
     | 
| 
       337 
364 
     | 
    
         
             
                speculative_draft_model_path: Optional[str] = None
         
     | 
| 
       338 
365 
     | 
    
         
             
                speculative_draft_model_revision: Optional[str] = None
         
     | 
| 
         @@ -375,6 +402,8 @@ class ServerArgs: 
     | 
|
| 
       375 
402 
     | 
    
         
             
                enable_expert_distribution_metrics: bool = False
         
     | 
| 
       376 
403 
     | 
    
         
             
                deepep_config: Optional[str] = None
         
     | 
| 
       377 
404 
     | 
    
         
             
                moe_dense_tp_size: Optional[int] = None
         
     | 
| 
      
 405 
     | 
    
         
            +
                elastic_ep_backend: Literal[None, "mooncake"] = None
         
     | 
| 
      
 406 
     | 
    
         
            +
                mooncake_ib_device: Optional[str] = None
         
     | 
| 
       378 
407 
     | 
    
         | 
| 
       379 
408 
     | 
    
         
             
                # Mamba cache
         
     | 
| 
       380 
409 
     | 
    
         
             
                max_mamba_cache_size: Optional[int] = None
         
     | 
| 
         @@ -473,6 +502,7 @@ class ServerArgs: 
     | 
|
| 
       473 
502 
     | 
    
         
             
                scheduler_recv_interval: int = 1
         
     | 
| 
       474 
503 
     | 
    
         
             
                numa_node: Optional[List[int]] = None
         
     | 
| 
       475 
504 
     | 
    
         
             
                enable_deterministic_inference: bool = False
         
     | 
| 
      
 505 
     | 
    
         
            +
                rl_on_policy_target: Optional[str] = None
         
     | 
| 
       476 
506 
     | 
    
         | 
| 
       477 
507 
     | 
    
         
             
                # Dynamic batch tokenizer
         
     | 
| 
       478 
508 
     | 
    
         
             
                enable_dynamic_batch_tokenizer: bool = False
         
     | 
| 
         @@ -481,6 +511,9 @@ class ServerArgs: 
     | 
|
| 
       481 
511 
     | 
    
         | 
| 
       482 
512 
     | 
    
         
             
                # Debug tensor dumps
         
     | 
| 
       483 
513 
     | 
    
         
             
                debug_tensor_dump_output_folder: Optional[str] = None
         
     | 
| 
      
 514 
     | 
    
         
            +
                # -1 mean dump all layers.
         
     | 
| 
      
 515 
     | 
    
         
            +
                debug_tensor_dump_layers: int = -1
         
     | 
| 
      
 516 
     | 
    
         
            +
                # TODO(guoyuhong): clean the old dumper code.
         
     | 
| 
       484 
517 
     | 
    
         
             
                debug_tensor_dump_input_file: Optional[str] = None
         
     | 
| 
       485 
518 
     | 
    
         
             
                debug_tensor_dump_inject: bool = False
         
     | 
| 
       486 
519 
     | 
    
         | 
| 
         @@ -509,18 +542,9 @@ class ServerArgs: 
     | 
|
| 
       509 
542 
     | 
    
         
             
                pdmux_config_path: Optional[str] = None
         
     | 
| 
       510 
543 
     | 
    
         
             
                sm_group_num: int = 8
         
     | 
| 
       511 
544 
     | 
    
         | 
| 
       512 
     | 
    
         
            -
                 
     | 
| 
       513 
     | 
    
         
            -
             
     | 
| 
       514 
     | 
    
         
            -
             
     | 
| 
       515 
     | 
    
         
            -
                        if server_args.prefill_attention_backend
         
     | 
| 
       516 
     | 
    
         
            -
                        else server_args.attention_backend
         
     | 
| 
       517 
     | 
    
         
            -
                    )
         
     | 
| 
       518 
     | 
    
         
            -
                    decode_attention_backend_str = (
         
     | 
| 
       519 
     | 
    
         
            -
                        server_args.decode_attention_backend
         
     | 
| 
       520 
     | 
    
         
            -
                        if server_args.decode_attention_backend
         
     | 
| 
       521 
     | 
    
         
            -
                        else server_args.attention_backend
         
     | 
| 
       522 
     | 
    
         
            -
                    )
         
     | 
| 
       523 
     | 
    
         
            -
                    return prefill_attention_backend_str, decode_attention_backend_str
         
     | 
| 
      
 545 
     | 
    
         
            +
                # For Multi-Modal
         
     | 
| 
      
 546 
     | 
    
         
            +
                mm_max_concurrent_calls: int = 32
         
     | 
| 
      
 547 
     | 
    
         
            +
                mm_per_request_timeout: float = 10.0
         
     | 
| 
       524 
548 
     | 
    
         | 
| 
       525 
549 
     | 
    
         
             
                def __post_init__(self):
         
     | 
| 
       526 
550 
     | 
    
         
             
                    """
         
     | 
| 
         @@ -550,6 +574,9 @@ class ServerArgs: 
     | 
|
| 
       550 
574 
     | 
    
         
             
                    # Apply model-specific adjustments.
         
     | 
| 
       551 
575 
     | 
    
         
             
                    self._handle_model_specific_adjustments()
         
     | 
| 
       552 
576 
     | 
    
         | 
| 
      
 577 
     | 
    
         
            +
                    # Handle Hicache settings.
         
     | 
| 
      
 578 
     | 
    
         
            +
                    self._handle_hicache()
         
     | 
| 
      
 579 
     | 
    
         
            +
             
     | 
| 
       553 
580 
     | 
    
         
             
                    # Set kernel backends.
         
     | 
| 
       554 
581 
     | 
    
         
             
                    self._handle_sampling_backend()
         
     | 
| 
       555 
582 
     | 
    
         
             
                    self._handle_attention_backend_compatibility()
         
     | 
| 
         @@ -572,9 +599,6 @@ class ServerArgs: 
     | 
|
| 
       572 
599 
     | 
    
         
             
                    # Handle pipeline parallelism.
         
     | 
| 
       573 
600 
     | 
    
         
             
                    self._handle_pipeline_parallelism()
         
     | 
| 
       574 
601 
     | 
    
         | 
| 
       575 
     | 
    
         
            -
                    # Handle Hicache settings.
         
     | 
| 
       576 
     | 
    
         
            -
                    self._handle_hicache()
         
     | 
| 
       577 
     | 
    
         
            -
             
     | 
| 
       578 
602 
     | 
    
         
             
                    # Handle speculative decoding logic.
         
     | 
| 
       579 
603 
     | 
    
         
             
                    self._handle_speculative_decoding()
         
     | 
| 
       580 
604 
     | 
    
         | 
| 
         @@ -614,22 +638,6 @@ class ServerArgs: 
     | 
|
| 
       614 
638 
     | 
    
         
             
                        )
         
     | 
| 
       615 
639 
     | 
    
         
             
                        self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
         
     | 
| 
       616 
640 
     | 
    
         | 
| 
       617 
     | 
    
         
            -
                def _handle_ktransformers_configs(self):
         
     | 
| 
       618 
     | 
    
         
            -
                    from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
         
     | 
| 
       619 
     | 
    
         
            -
                        CompressedTensorsWNA16AMXEPMoEMethod,
         
     | 
| 
       620 
     | 
    
         
            -
                        override_config,
         
     | 
| 
       621 
     | 
    
         
            -
                    )
         
     | 
| 
       622 
     | 
    
         
            -
             
     | 
| 
       623 
     | 
    
         
            -
                    override_config(
         
     | 
| 
       624 
     | 
    
         
            -
                        CompressedTensorsWNA16AMXEPMoEMethod,
         
     | 
| 
       625 
     | 
    
         
            -
                        self.kt_num_gpu_experts,
         
     | 
| 
       626 
     | 
    
         
            -
                        self.kt_cpuinfer,
         
     | 
| 
       627 
     | 
    
         
            -
                        self.kt_threadpool_count,
         
     | 
| 
       628 
     | 
    
         
            -
                        self.kt_amx_weight_path,
         
     | 
| 
       629 
     | 
    
         
            -
                        self.kt_amx_method,
         
     | 
| 
       630 
     | 
    
         
            -
                        self.chunked_prefill_size,
         
     | 
| 
       631 
     | 
    
         
            -
                    )
         
     | 
| 
       632 
     | 
    
         
            -
             
     | 
| 
       633 
641 
     | 
    
         
             
                def _handle_missing_default_values(self):
         
     | 
| 
       634 
642 
     | 
    
         
             
                    if self.tokenizer_path is None:
         
     | 
| 
       635 
643 
     | 
    
         
             
                        self.tokenizer_path = self.model_path
         
     | 
| 
         @@ -684,7 +692,7 @@ class ServerArgs: 
     | 
|
| 
       684 
692 
     | 
    
         
             
                                    self.cuda_graph_max_bs = 64
         
     | 
| 
       685 
693 
     | 
    
         
             
                        elif gpu_mem < 35 * 1024:
         
     | 
| 
       686 
694 
     | 
    
         
             
                            # A10, 4090, 5090
         
     | 
| 
       687 
     | 
    
         
            -
                            # (chunked_prefill_size 2k, cuda_graph_max_bs  
     | 
| 
      
 695 
     | 
    
         
            +
                            # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
         
     | 
| 
       688 
696 
     | 
    
         
             
                            if self.chunked_prefill_size is None:
         
     | 
| 
       689 
697 
     | 
    
         
             
                                self.chunked_prefill_size = 2048
         
     | 
| 
       690 
698 
     | 
    
         
             
                            if self.cuda_graph_max_bs is None:
         
     | 
| 
         @@ -692,7 +700,7 @@ class ServerArgs: 
     | 
|
| 
       692 
700 
     | 
    
         
             
                                # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
         
     | 
| 
       693 
701 
     | 
    
         
             
                                # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
         
     | 
| 
       694 
702 
     | 
    
         
             
                                if self.tp_size < 4:
         
     | 
| 
       695 
     | 
    
         
            -
                                    self.cuda_graph_max_bs =  
     | 
| 
      
 703 
     | 
    
         
            +
                                    self.cuda_graph_max_bs = 24
         
     | 
| 
       696 
704 
     | 
    
         
             
                                else:
         
     | 
| 
       697 
705 
     | 
    
         
             
                                    self.cuda_graph_max_bs = 80
         
     | 
| 
       698 
706 
     | 
    
         
             
                        elif gpu_mem < 60 * 1024:
         
     | 
| 
         @@ -800,11 +808,9 @@ class ServerArgs: 
     | 
|
| 
       800 
808 
     | 
    
         
             
                            else 0.88
         
     | 
| 
       801 
809 
     | 
    
         
             
                        )
         
     | 
| 
       802 
810 
     | 
    
         | 
| 
       803 
     | 
    
         
            -
                        #  
     | 
| 
       804 
     | 
    
         
            -
                        #  
     | 
| 
       805 
     | 
    
         
            -
                         
     | 
| 
       806 
     | 
    
         
            -
             
     | 
| 
       807 
     | 
    
         
            -
                        model_config = ModelConfig.from_server_args(self)
         
     | 
| 
      
 811 
     | 
    
         
            +
                        # Multimodal models need more memory for the image processing,
         
     | 
| 
      
 812 
     | 
    
         
            +
                        # so we adjust the mem_fraction_static accordingly.
         
     | 
| 
      
 813 
     | 
    
         
            +
                        model_config = self.get_model_config()
         
     | 
| 
       808 
814 
     | 
    
         
             
                        if model_config.is_multimodal:
         
     | 
| 
       809 
815 
     | 
    
         
             
                            self.adjust_mem_fraction_for_vlm(model_config)
         
     | 
| 
       810 
816 
     | 
    
         | 
| 
         @@ -829,7 +835,7 @@ class ServerArgs: 
     | 
|
| 
       829 
835 
     | 
    
         
             
                        capture_bs = (
         
     | 
| 
       830 
836 
     | 
    
         
             
                            list(range(1, 9, 1))
         
     | 
| 
       831 
837 
     | 
    
         
             
                            + list(range(10, 33, 2))
         
     | 
| 
       832 
     | 
    
         
            -
                            + list(range(40,  
     | 
| 
      
 838 
     | 
    
         
            +
                            + list(range(40, 65, 4))
         
     | 
| 
       833 
839 
     | 
    
         
             
                            + list(range(72, 257, 8))
         
     | 
| 
       834 
840 
     | 
    
         
             
                            + list(range(272, self.cuda_graph_max_bs + 1, 16))
         
     | 
| 
       835 
841 
     | 
    
         
             
                        )
         
     | 
| 
         @@ -892,14 +898,19 @@ class ServerArgs: 
     | 
|
| 
       892 
898 
     | 
    
         
             
                                logger.info(
         
     | 
| 
       893 
899 
     | 
    
         
             
                                    "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
         
     | 
| 
       894 
900 
     | 
    
         
             
                                )
         
     | 
| 
       895 
     | 
    
         
            -
                            if  
     | 
| 
       896 
     | 
    
         
            -
                                self.quantization == "modelopt_fp4"
         
     | 
| 
       897 
     | 
    
         
            -
                                and self.moe_runner_backend == "auto"
         
     | 
| 
       898 
     | 
    
         
            -
                            ):
         
     | 
| 
      
 901 
     | 
    
         
            +
                            if self.moe_a2a_backend == "none" and self.moe_runner_backend == "auto":
         
     | 
| 
       899 
902 
     | 
    
         
             
                                self.moe_runner_backend = "flashinfer_trtllm"
         
     | 
| 
       900 
903 
     | 
    
         
             
                                logger.info(
         
     | 
| 
       901 
     | 
    
         
            -
                                    "Use flashinfer_trtllm as  
     | 
| 
      
 904 
     | 
    
         
            +
                                    "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
         
     | 
| 
       902 
905 
     | 
    
         
             
                                )
         
     | 
| 
      
 906 
     | 
    
         
            +
                                if self.quantization is None:
         
     | 
| 
      
 907 
     | 
    
         
            +
                                    # Default DeepSeek V3/R1 native FP8 when not explicitly set,
         
     | 
| 
      
 908 
     | 
    
         
            +
                                    # Because we need this condition for an assertion in
         
     | 
| 
      
 909 
     | 
    
         
            +
                                    # flashinfer_trtllm MoE runner backend.
         
     | 
| 
      
 910 
     | 
    
         
            +
                                    self.quantization = "fp8"
         
     | 
| 
      
 911 
     | 
    
         
            +
                                    logger.info(
         
     | 
| 
      
 912 
     | 
    
         
            +
                                        "Quantization not specified, default to fp8 for DeepSeek on sm100"
         
     | 
| 
      
 913 
     | 
    
         
            +
                                    )
         
     | 
| 
       903 
914 
     | 
    
         | 
| 
       904 
915 
     | 
    
         
             
                    elif model_arch in ["GptOssForCausalLM"]:
         
     | 
| 
       905 
916 
     | 
    
         
             
                        if (
         
     | 
| 
         @@ -925,7 +936,7 @@ class ServerArgs: 
     | 
|
| 
       925 
936 
     | 
    
         
             
                            f"- Decode: {decode_attn_backend}\n"
         
     | 
| 
       926 
937 
     | 
    
         
             
                        )
         
     | 
| 
       927 
938 
     | 
    
         | 
| 
       928 
     | 
    
         
            -
                        if  
     | 
| 
      
 939 
     | 
    
         
            +
                        if is_blackwell_supported():
         
     | 
| 
       929 
940 
     | 
    
         
             
                            if not self.enable_dp_attention:
         
     | 
| 
       930 
941 
     | 
    
         
             
                                self.enable_flashinfer_allreduce_fusion = True
         
     | 
| 
       931 
942 
     | 
    
         
             
                                logger.info(
         
     | 
| 
         @@ -937,7 +948,7 @@ class ServerArgs: 
     | 
|
| 
       937 
948 
     | 
    
         
             
                            and quantization_config.get("quant_method") == "mxfp4"
         
     | 
| 
       938 
949 
     | 
    
         
             
                        )
         
     | 
| 
       939 
950 
     | 
    
         | 
| 
       940 
     | 
    
         
            -
                        if  
     | 
| 
      
 951 
     | 
    
         
            +
                        if is_blackwell_supported() and is_mxfp4_quant_format:
         
     | 
| 
       941 
952 
     | 
    
         
             
                            self.moe_runner_backend = "flashinfer_mxfp4"
         
     | 
| 
       942 
953 
     | 
    
         
             
                            logger.warning(
         
     | 
| 
       943 
954 
     | 
    
         
             
                                "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
         
     | 
| 
         @@ -966,7 +977,19 @@ class ServerArgs: 
     | 
|
| 
       966 
977 
     | 
    
         
             
                            "fa3",
         
     | 
| 
       967 
978 
     | 
    
         
             
                            "aiter",
         
     | 
| 
       968 
979 
     | 
    
         
             
                            "triton",
         
     | 
| 
       969 
     | 
    
         
            -
             
     | 
| 
      
 980 
     | 
    
         
            +
                            "trtllm_mha",
         
     | 
| 
      
 981 
     | 
    
         
            +
                        }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
         
     | 
| 
      
 982 
     | 
    
         
            +
                        if is_sm100_supported() and self.attention_backend is None:
         
     | 
| 
      
 983 
     | 
    
         
            +
                            self.attention_backend = "trtllm_mha"
         
     | 
| 
      
 984 
     | 
    
         
            +
                            logger.warning(
         
     | 
| 
      
 985 
     | 
    
         
            +
                                "Use trtllm_mha as attention backend on sm100 for Llama4 model"
         
     | 
| 
      
 986 
     | 
    
         
            +
                            )
         
     | 
| 
      
 987 
     | 
    
         
            +
                        if is_sm100_supported() and self.moe_runner_backend == "auto":
         
     | 
| 
      
 988 
     | 
    
         
            +
                            if self.quantization in {"fp8", "modelopt_fp8"}:
         
     | 
| 
      
 989 
     | 
    
         
            +
                                self.moe_runner_backend = "flashinfer_trtllm"
         
     | 
| 
      
 990 
     | 
    
         
            +
                                logger.info(
         
     | 
| 
      
 991 
     | 
    
         
            +
                                    "Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
         
     | 
| 
      
 992 
     | 
    
         
            +
                                )
         
     | 
| 
       970 
993 
     | 
    
         
             
                    elif model_arch in [
         
     | 
| 
       971 
994 
     | 
    
         
             
                        "Gemma2ForCausalLM",
         
     | 
| 
       972 
995 
     | 
    
         
             
                        "Gemma3ForCausalLM",
         
     | 
| 
         @@ -1005,6 +1028,11 @@ class ServerArgs: 
     | 
|
| 
       1005 
1028 
     | 
    
         
             
                        logger.info(
         
     | 
| 
       1006 
1029 
     | 
    
         
             
                            f"Using {self.attention_backend} as attention backend for {model_arch}."
         
     | 
| 
       1007 
1030 
     | 
    
         
             
                        )
         
     | 
| 
      
 1031 
     | 
    
         
            +
                    elif model_arch in ["KimiLinearForCausalLM"]:
         
     | 
| 
      
 1032 
     | 
    
         
            +
                        logger.warning(
         
     | 
| 
      
 1033 
     | 
    
         
            +
                            f"Disabling Radix Cache for {model_arch} as it is not yet supported."
         
     | 
| 
      
 1034 
     | 
    
         
            +
                        )
         
     | 
| 
      
 1035 
     | 
    
         
            +
                        self.disable_radix_cache = True
         
     | 
| 
       1008 
1036 
     | 
    
         | 
| 
       1009 
1037 
     | 
    
         
             
                    if is_deepseek_nsa(hf_config):
         
     | 
| 
       1010 
1038 
     | 
    
         
             
                        if (
         
     | 
| 
         @@ -1027,16 +1055,30 @@ class ServerArgs: 
     | 
|
| 
       1027 
1055 
     | 
    
         
             
                            import torch
         
     | 
| 
       1028 
1056 
     | 
    
         | 
| 
       1029 
1057 
     | 
    
         
             
                            major, _ = torch.cuda.get_device_capability()
         
     | 
| 
       1030 
     | 
    
         
            -
                            if  
     | 
| 
       1031 
     | 
    
         
            -
                                self.kv_cache_dtype = "fp8_e4m3"
         
     | 
| 
       1032 
     | 
    
         
            -
                                logger.warning( 
     | 
| 
      
 1058 
     | 
    
         
            +
                            if self.kv_cache_dtype == "auto":
         
     | 
| 
      
 1059 
     | 
    
         
            +
                                self.kv_cache_dtype = "fp8_e4m3" if major >= 10 else "bfloat16"
         
     | 
| 
      
 1060 
     | 
    
         
            +
                                logger.warning(
         
     | 
| 
      
 1061 
     | 
    
         
            +
                                    f"Setting KV cache dtype to {self.kv_cache_dtype} for DeepSeek NSA."
         
     | 
| 
      
 1062 
     | 
    
         
            +
                                )
         
     | 
| 
      
 1063 
     | 
    
         
            +
                            if self.kv_cache_dtype == "bf16":
         
     | 
| 
      
 1064 
     | 
    
         
            +
                                self.kv_cache_dtype = "bfloat16"
         
     | 
| 
      
 1065 
     | 
    
         
            +
                            assert self.kv_cache_dtype in [
         
     | 
| 
      
 1066 
     | 
    
         
            +
                                "bfloat16",
         
     | 
| 
      
 1067 
     | 
    
         
            +
                                "fp8_e4m3",
         
     | 
| 
      
 1068 
     | 
    
         
            +
                            ], "DeepSeek NSA only supports bf16/bfloat16 or fp8_e4m3 kv_cache_dtype"
         
     | 
| 
       1033 
1069 
     | 
    
         | 
| 
       1034 
1070 
     | 
    
         
             
                            if self.kv_cache_dtype == "fp8_e4m3":
         
     | 
| 
       1035 
     | 
    
         
            -
                                 
     | 
| 
      
 1071 
     | 
    
         
            +
                                # flashmla_auto dispatches to flashmla_sparse/flashmla_kv based on hardware and heuristics
         
     | 
| 
      
 1072 
     | 
    
         
            +
                                self.nsa_prefill_backend = "flashmla_auto"
         
     | 
| 
       1036 
1073 
     | 
    
         
             
                                self.nsa_decode_backend = "flashmla_kv"
         
     | 
| 
       1037 
1074 
     | 
    
         
             
                                logger.warning(
         
     | 
| 
       1038 
     | 
    
         
            -
                                    "Setting NSA backend to flashmla_kv for FP8 KV Cache."
         
     | 
| 
      
 1075 
     | 
    
         
            +
                                    "Setting NSA backend to flashmla_auto for prefill and flashmla_kv for decode for FP8 KV Cache."
         
     | 
| 
       1039 
1076 
     | 
    
         
             
                                )
         
     | 
| 
      
 1077 
     | 
    
         
            +
                            else:
         
     | 
| 
      
 1078 
     | 
    
         
            +
                                # set prefill/decode backends for Blackwell. The default settings are for Hopper.
         
     | 
| 
      
 1079 
     | 
    
         
            +
                                if major >= 10:
         
     | 
| 
      
 1080 
     | 
    
         
            +
                                    self.nsa_prefill_backend = "flashmla_sparse"
         
     | 
| 
      
 1081 
     | 
    
         
            +
                                    self.nsa_decode_backend = "flashmla_sparse"
         
     | 
| 
       1040 
1082 
     | 
    
         | 
| 
       1041 
1083 
     | 
    
         
             
                            # Logging env vars for NSA
         
     | 
| 
       1042 
1084 
     | 
    
         
             
                            from sglang.srt.layers.attention.nsa.utils import (
         
     | 
| 
@@ -1052,6 +1094,67 @@ class ServerArgs:
         )
 
     def _handle_attention_backend_compatibility(self):
+        model_config = self.get_model_config()
+        use_mla_backend = self.use_mla_backend()
+
+        if self.prefill_attention_backend is not None and (
+            self.prefill_attention_backend == self.decode_attention_backend
+        ):  # override the default attention backend
+            self.attention_backend = self.prefill_attention_backend
+
+        # Pick the default attention backend if not specified
+        if self.attention_backend is None:
+            """
+            Auto select the fastest attention backend.
+
+            1. Models with MHA Architecture (e.g: Llama, QWen)
+                1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
+                1.2 In other cases, we will use flashinfer if available, otherwise use triton.
+            2. Models with MLA Architecture and using FA3
+                2.1 We will use FA3 backend on hopper.
+                2.2 We will use Flashinfer backend on blackwell.
+                2.3 Otherwise, we will use triton backend.
+            """
+
+            if not use_mla_backend:
+                # MHA architecture
+                if (
+                    is_hopper_with_cuda_12_3()
+                    and is_no_spec_infer_or_topk_one(self)
+                    and is_fa3_default_architecture(self.model_config.hf_config)
+                ):
+                    self.attention_backend = "fa3"
+                elif is_hip():
+                    self.attention_backend = "aiter"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+            else:
+                # MLA architecture
+                if is_hopper_with_cuda_12_3():
+                    self.attention_backend = "fa3"
+                elif is_sm100_supported():
+                    self.attention_backend = "flashinfer"
+                elif is_hip():
+                    head_num = model_config.get_num_kv_heads(self.tp_size)
+                    # TODO current aiter only support head number 16 or 128 head number
+                    if head_num == 128 or head_num == 16:
+                        self.attention_backend = "aiter"
+                    else:
+                        self.attention_backend = "triton"
+                elif is_npu():
+                    self.attention_backend = "ascend"
+                else:
+                    self.attention_backend = "triton"
+
+            logger.warning(
+                f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
+            )
+
+        # Torch native and flex attention backends
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
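The auto-selection logic added here is a pure decision table over platform capabilities. A runnable sketch with the capability checks turned into explicit boolean parameters; the packaging is ours, the branch order follows the hunk:

```python
# Decision table from _handle_attention_backend_compatibility, reduced to a
# pure function. The flags mirror the helpers used above
# (is_hopper_with_cuda_12_3, is_hip, is_npu, is_sm100_supported, ...).
def default_attention_backend(
    use_mla: bool,
    hopper_cuda123: bool = False,
    hip: bool = False,
    npu: bool = False,
    sm100: bool = False,
    flashinfer_available: bool = False,
    fa3_friendly_arch: bool = False,
    no_spec_or_topk1: bool = True,
    kv_head_num: int = 8,
) -> str:
    if not use_mla:  # MHA models (Llama, Qwen, ...)
        if hopper_cuda123 and no_spec_or_topk1 and fa3_friendly_arch:
            return "fa3"
        if hip:
            return "aiter"
        if npu:
            return "ascend"
        return "flashinfer" if flashinfer_available else "triton"
    # MLA models (DeepSeek family)
    if hopper_cuda123:
        return "fa3"
    if sm100:
        return "flashinfer"
    if hip:
        # aiter currently only supports 16 or 128 KV heads
        return "aiter" if kv_head_num in (16, 128) else "triton"
    if npu:
        return "ascend"
    return "triton"


# e.g. an MLA model on Hopper with CUDA 12.3+ picks FA3:
assert default_attention_backend(use_mla=True, hopper_cuda123=True) == "fa3"
```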
@@ -1067,12 +1170,7 @@ class ServerArgs:
             self.speculative_algorithm is None
         ), "Speculative decoding is currently not supported with Flex Attention backend"
 
-        if self.attention_backend == "ascend":
-            logger.warning(
-                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
-            )
-            self.page_size = 128
-
+        # Major NVIDIA platforms backends
         if (
             self.attention_backend == "flashmla"
             or self.decode_attention_backend == "flashmla"
@@ -1095,7 +1193,7 @@ class ServerArgs:
             self.attention_backend == "trtllm_mla"
             or self.decode_attention_backend == "trtllm_mla"
         ):
-            if not is_sm100_supported():
+            if not is_blackwell_supported():
                 raise ValueError(
                     "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
                 )
@@ -1127,19 +1225,13 @@ class ServerArgs:
             )
             self.page_size = 64
 
-        if self.attention_backend == "dual_chunk_flash_attn":
+        if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
             logger.warning(
-                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
+                "FlashAttention3 only supports fp8_e4m3 if using FP8; "
+                "Setting attention backend to triton."
             )
-            self.enable_mixed_chunk = False
-            self.disable_radix_cache = True
+            self.attention_backend = "triton"
 
-        if self.attention_backend == "intel_xpu":
-            if self.page_size not in [32, 64, 128]:
-                logger.warning(
-                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
-                )
-                self.page_size = 128
         if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
             raise ValueError(
                 "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
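Note that the `fa3` + `fp8_e5m2` case above falls back to `triton` rather than raising. A standalone version of that check; the function name is illustrative:

```python
def normalize_fa3_kv_dtype(attention_backend: str, kv_cache_dtype: str) -> str:
    # FA3 supports fp8_e4m3 but not fp8_e5m2, so the combination falls back
    # to the triton backend instead of failing at startup.
    if attention_backend == "fa3" and kv_cache_dtype == "fp8_e5m2":
        return "triton"
    return attention_backend


assert normalize_fa3_kv_dtype("fa3", "fp8_e5m2") == "triton"
assert normalize_fa3_kv_dtype("fa3", "fp8_e4m3") == "fa3"
```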
@@ -1150,6 +1242,66 @@ class ServerArgs:
             )
             self.page_size = 128
 
+        # AMD platforms backends
+        if self.attention_backend == "aiter":
+            if model_config.context_len > 8192:
+                self.mem_fraction_static *= 0.85
+
+        # NPU platforms backends
+        if is_npu() and self.attention_backend in ["ascend"]:
+            logger.warning(
+                "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
+        # Other platforms backends
+        if (
+            self.attention_backend == "intel_amx"
+            and self.device == "cpu"
+            and not cpu_has_amx_support()
+        ):
+            logger.warning(
+                "The current platform does not support Intel AMX, will fallback to torch_native backend."
+            )
+            self.attention_backend = "torch_native"
+
+        if (
+            self.attention_backend == "intel_xpu"
+            and self.device == "xpu"
+            and not xpu_has_xmx_support()
+        ):
+            logger.warning(
+                "The current platform does not support Intel XMX, will fallback to triton backend."
+            )
+            self.attention_backend = "triton"
+
+        if self.attention_backend == "intel_xpu":
+            if self.page_size not in [32, 64, 128]:
+                logger.warning(
+                    f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
+                )
+                self.page_size = 128
+
+        # Dual chunk flash attention backend
+        if (
+            getattr(model_config.hf_config, "dual_chunk_attention_config", None)
+            is not None
+        ):
+            if self.attention_backend is None:
+                self.attention_backend = "dual_chunk_flash_attn"
+                logger.info("Dual chunk attention is turned on by default.")
+            elif self.attention_backend != "dual_chunk_flash_attn":
+                raise ValueError(
+                    "Dual chunk attention is enabled, but attention backend is set to "
+                    f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
+                )
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_radix_cache = True
+
     def _handle_page_size(self):
         if self.page_size is None:
             self.page_size = 1
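The dual-chunk rules added above both default the backend and reject conflicting explicit choices. A compact, runnable restatement; the `hf_has_dual_chunk` flag stands in for the `getattr(...)` probe, and the names are ours:

```python
def resolve_dual_chunk(attention_backend, hf_has_dual_chunk: bool):
    """Mirror the dual-chunk handling in the hunk above."""
    if hf_has_dual_chunk:
        if attention_backend is None:
            attention_backend = "dual_chunk_flash_attn"  # turned on by default
        elif attention_backend != "dual_chunk_flash_attn":
            raise ValueError(
                f"Dual chunk attention requires 'dual_chunk_flash_attn', got {attention_backend}."
            )
    # the backend in turn forces mixed chunk off and disables the radix cache
    disable_radix = attention_backend == "dual_chunk_flash_attn"
    return attention_backend, disable_radix


assert resolve_dual_chunk(None, True) == ("dual_chunk_flash_attn", True)
```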
@@ -1162,6 +1314,22 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+    def _handle_ktransformers_configs(self):
+        from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            override_config,
+        )
+
+        override_config(
+            CompressedTensorsWNA16AMXEPMoEMethod,
+            self.kt_num_gpu_experts,
+            self.kt_cpuinfer,
+            self.kt_threadpool_count,
+            self.kt_amx_weight_path,
+            self.kt_amx_method,
+            self.chunked_prefill_size,
+        )
+
     def _handle_data_parallelism(self):
         if self.dp_size == 1:
             self.enable_dp_attention = False
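`_handle_ktransformers_configs` pushes the `kt_*` server args into the MoE method class before any instance exists. A rough sketch of that class-level override pattern; the class and attribute names below are invented for illustration, and only the call shape mirrors the hunk:

```python
# Rough sketch: CLI values are stashed on the quantization method class
# before instantiation, so every later instance sees them.
class AMXEPMoEMethodSketch:
    num_gpu_experts = None
    cpuinfer = None
    threadpool_count = None


def override_config_sketch(cls, num_gpu_experts, cpuinfer, threadpool_count):
    # class-level assignment, not instance-level
    cls.num_gpu_experts = num_gpu_experts
    cls.cpuinfer = cpuinfer
    cls.threadpool_count = threadpool_count


override_config_sketch(AMXEPMoEMethodSketch, 8, 32, 2)
assert AMXEPMoEMethodSketch.num_gpu_experts == 8
```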
@@ -1192,8 +1360,10 @@ class ServerArgs:
 
         if self.moe_runner_backend == "flashinfer_trtllm":
             assert (
-                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
-            ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
+                self.quantization == "modelopt_fp4"
+                or self.quantization == "modelopt_fp8"
+                or self.quantization == "fp8"
+            ), "modelopt_fp4, modelopt_fp8 or fp8 quantization is required for Flashinfer TRTLLM MoE"
             self.disable_shared_experts_fusion = True
             logger.warning(
                 "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
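The widened assert above now admits three quantization modes for the FlashInfer TRTLLM MoE runner. The same gate as a reusable check; the names are ours:

```python
ALLOWED_TRTLLM_MOE_QUANT = {"modelopt_fp4", "modelopt_fp8", "fp8"}


def check_trtllm_moe_quant(quantization: str) -> None:
    # Mirrors the assert above: fp8 and modelopt_fp8 are now accepted
    # alongside modelopt_fp4.
    if quantization not in ALLOWED_TRTLLM_MOE_QUANT:
        raise AssertionError(
            "modelopt_fp4, modelopt_fp8 or fp8 quantization is required "
            "for Flashinfer TRTLLM MoE"
        )


check_trtllm_moe_quant("fp8")  # passes; e.g. "awq" would raise
```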
@@ -1277,6 +1447,24 @@ class ServerArgs:
                 "Page first direct layout only support direct io backend"
             )
 
+        if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
+            # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
+            if self.decode_attention_backend is None:
+                if not self.use_mla_backend():
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_flashinfer_available() else "triton"
+                    )
+                else:
+                    self.decode_attention_backend = (
+                        "flashinfer" if is_sm100_supported() else "triton"
+                    )
+            elif self.decode_attention_backend == "fa3":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "FlashAttention3 decode backend is not compatible with hierarchical cache. "
+                    "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
+                )
+
     def _handle_speculative_decoding(self):
         if self.speculative_algorithm == "NEXTN":
             self.speculative_algorithm = "EAGLE"
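The HiCache fix above either picks a kernel-compatible decode backend or downgrades the IO backend when FA3 decode was explicitly requested. A pure-function sketch with the capability probes passed in as booleans, an assumption made so the sketch runs anywhere:

```python
def fix_hicache_kernel_io(decode_backend, use_mla, flashinfer_ok, sm100):
    """Return (decode_backend, hicache_io_backend) as the hunk above would."""
    hicache_io_backend = "kernel"
    if decode_backend is None:
        # choose a decode backend known to work with the kernel IO path
        if not use_mla:
            decode_backend = "flashinfer" if flashinfer_ok else "triton"
        else:
            decode_backend = "flashinfer" if sm100 else "triton"
    elif decode_backend == "fa3":
        # FA3 decode cannot use the kernel backend; fall back to direct IO
        # at a possible performance cost for small page sizes
        hicache_io_backend = "direct"
    return decode_backend, hicache_io_backend


assert fix_hicache_kernel_io("fa3", True, True, False) == ("fa3", "direct")
```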
@@ -1287,22 +1475,26 @@ class ServerArgs:
                 raise ValueError(
                     "Currently standalone speculative decoding does not support dp attention."
                 )
+
             if self.max_running_requests is None:
                 self.max_running_requests = 48
                 logger.warning(
-                    "Max running requests is reset to 48 for speculative decoding."
+                    "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
                 )
 
-            if …:
+            if (
+                self.speculative_algorithm == "EAGLE"
+                and envs.SGLANG_ENABLE_SPEC_V2.get()
+            ):
                 self.disable_overlap_schedule = False
                 logger.warning(
                     "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
                 )
 
-            if not …:
+            if not envs.SGLANG_ENABLE_SPEC_V2.get():
                 self.disable_overlap_schedule = True
                 logger.warning(
-                    "Overlap scheduler is disabled because of using eagle3 speculative decoding."
+                    "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
                 )
 
             if self.enable_mixed_chunk:
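The overlap-schedule toggles above hinge on the `SGLANG_ENABLE_SPEC_V2` flag. A minimal reproduction of the decision; the real code reads the flag through sglang's `envs` registry, and the string-to-bool parsing below is this sketch's assumption:

```python
import os


def overlap_schedule_disabled(speculative_algorithm: str) -> bool:
    # How sglang's envs registry parses the value is not shown in the diff;
    # "1"/"true" is this sketch's guess.
    spec_v2 = os.environ.get("SGLANG_ENABLE_SPEC_V2", "0").lower() in ("1", "true")
    if speculative_algorithm == "EAGLE" and spec_v2:
        return False  # beta spec path keeps the overlap scheduler on
    return not spec_v2  # eagle3/standalone without SPEC_V2 turn it off


os.environ["SGLANG_ENABLE_SPEC_V2"] = "1"
assert overlap_schedule_disabled("EAGLE") is False
```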
@@ -1371,8 +1563,13 @@ class ServerArgs:
                 raise ValueError(
                     "Ngram speculative decoding only supports CUDA device."
                 )
+
             if self.max_running_requests is None:
                 self.max_running_requests = 48
+                logger.warning(
+                    "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
+                )
+
             self.disable_overlap_schedule = True
             self.enable_mixed_chunk = False
             self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1515,19 +1712,44 @@ class ServerArgs:
         )
 
     def _handle_deterministic_inference(self):
+        if self.rl_on_policy_target is not None:
+            logger.warning(
+                "Enable deterministic inference because of rl_on_policy_target."
+            )
+            self.enable_deterministic_inference = True
+            # TODO remove this environment variable as a whole
+            os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
+
         if self.enable_deterministic_inference:
             # Check sampling backend
             self.sampling_backend = "pytorch"
             logger.warning(
                 "Sampling backend is set to pytorch for deterministic inference."
             )
+            is_deepseek_model = False
+            if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+                try:
+                    hf_config = self.get_hf_config()
+                    model_arch = hf_config.architectures[0]
+                    is_deepseek_model = model_arch in [
+                        "DeepseekV2ForCausalLM",
+                        "DeepseekV3ForCausalLM",
+                        "DeepseekV32ForCausalLM",
+                    ]
+                except Exception:
+                    pass
 
             # Check attention backend
             if self.attention_backend is None:
                 # User didn't specify attention backend, fallback based on GPU architecture
                 if is_sm100_supported() or is_sm120_supported():
                     # Blackwell and newer architectures
-                    self.attention_backend = "flashinfer"
+                    if is_deepseek_model:
+                        # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
+                        self.attention_backend = "triton"
+                    else:
+                        # fallback to flashinfer on Blackwell for non-DeepSeek models
+                        self.attention_backend = "flashinfer"
                 else:
                     # Hopper (SM90) and older architectures
                     self.attention_backend = "fa3"
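The deterministic-inference fallback above special-cases the three DeepSeek architectures. Reduced to a pure function; the parameter packaging is ours:

```python
DEEPSEEK_ARCHS = {
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
    "DeepseekV32ForCausalLM",
}


def deterministic_default_backend(model_arch: str, blackwell_or_newer: bool) -> str:
    # Mirrors the hunk: DeepSeek models go to triton on Blackwell because
    # flashinfer is not yet deterministic for them; other models keep
    # flashinfer, and pre-Blackwell hardware uses fa3.
    if not blackwell_or_newer:
        return "fa3"
    return "triton" if model_arch in DEEPSEEK_ARCHS else "flashinfer"


assert deterministic_default_backend("DeepseekV3ForCausalLM", True) == "triton"
assert deterministic_default_backend("LlamaForCausalLM", True) == "flashinfer"
```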
@@ -1542,8 +1764,17 @@ class ServerArgs:
                     f"but you explicitly specified '{self.attention_backend}'."
                 )
 
-            if …:
-                # …
+            if is_deepseek_model:
+                if self.attention_backend not in ["fa3", "triton"]:
+                    raise ValueError(
+                        f"Currently only {RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND} attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
+                    )
+
+            if (
+                self.attention_backend
+                not in RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND
+            ):
+                # Currently, only certain backends support radix cache. Support for other backends is in progress
             self.disable_radix_cache = True
             logger.warning(
                 f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1558,7 +1789,13 @@ class ServerArgs:
                 )
 
     def _handle_other_validations(self):
-        pass
+        # Handle model inference tensor dump.
+        if self.debug_tensor_dump_output_folder is not None:
+            logger.warning(
+                "Cuda graph and server warmup are disabled because of using tensor dump mode"
+            )
+            self.disable_cuda_graph = True
+            self.skip_server_warmup = True
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -1743,6 +1980,18 @@ class ServerArgs:
             "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
             "default to 1.0, which may cause accuracy issues. ",
         )
+        parser.add_argument(
+            "--kv-cache-dtype",
+            type=str,
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
+            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
+        )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
+        )
         parser.add_argument(
             "--modelopt-quant",
             type=str,
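For reference, the relocated `--kv-cache-dtype` flag can be exercised in isolation with a bare `argparse` parser; the default is inlined here since `ServerArgs` is not imported in this sketch:

```python
import argparse

# Stand-alone reproduction of the flag above, showing the accepted values.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--kv-cache-dtype",
    type=str,
    default="auto",
    choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
)
args = parser.parse_args(["--kv-cache-dtype", "bf16"])
assert args.kv_cache_dtype == "bf16"  # later normalized to "bfloat16" for NSA
```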
@@ -1782,18 +2031,6 @@ class ServerArgs:
             "This is useful for development and prototyping. For production, it's recommended "
             "to use separate quantization and deployment steps.",
         )
-        parser.add_argument(
-            "--kv-cache-dtype",
-            type=str,
-            default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
-            help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
-        )
-        parser.add_argument(
-            "--enable-fp32-lm-head",
-            action="store_true",
-            help="If set, the LM head outputs (logits) are in FP32.",
-        )
 
         # Memory and scheduling
         parser.add_argument(
@@ -1898,7 +2135,14 @@ class ServerArgs:
         parser.add_argument(
             "--disable-hybrid-swa-memory",
             action="store_true",
-            help="Disable the hybrid SWA memory.",
+            help="Disable the hybrid SWA memory pool.",
+        )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
         )
 
         # Runtime options
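A toy contrast of the two `--radix-eviction-policy` choices; the real policy evicts radix-tree prefixes, this only shows which key each policy would evict first:

```python
from collections import OrderedDict

# "a" is hot historically but cold recently, so LRU and LFU disagree.
accesses = ["a", "a", "a", "b", "c", "b"]

order = OrderedDict()  # key -> recency order (oldest first)
freq = {}              # key -> access count
for key in accesses:
    order.pop(key, None)
    order[key] = True
    freq[key] = freq.get(key, 0) + 1

lru_victim = next(iter(order))        # least recently used: "a"
lfu_victim = min(freq, key=freq.get)  # least frequently used: "c"
assert (lru_victim, lfu_victim) == ("a", "c")
```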
@@ -1908,21 +2152,6 @@ class ServerArgs:
             default=ServerArgs.device,
             help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
         )
-        parser.add_argument(
-            "--elastic-ep-backend",
-            type=str,
-            default=ServerArgs.elastic_ep_backend,
-            choices=["none", "mooncake"],
-            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
-        )
-        parser.add_argument(
-            "--mooncake-ib-device",
-            type=str,
-            default=ServerArgs.mooncake_ib_device,
-            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
-            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
-            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
-        )
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -2147,7 +2376,7 @@ class ServerArgs:
             help="Enable opentelemetry trace",
         )
         parser.add_argument(
-            "--oltp-traces-endpoint",
+            "--otlp-traces-endpoint",
             type=str,
             default="localhost:4317",
             help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
@@ -2210,6 +2439,12 @@ class ServerArgs:
             default=ServerArgs.tool_call_parser,
             help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )
         parser.add_argument(
             "--sampling-defaults",
             type=str,
         @@ -2220,12 +2455,6 @@ class ServerArgs: 
     | 
|
| 
       2220 
2455 
     | 
    
         
             
                        "'model' uses the model's generation_config.json to get the recommended "
         
     | 
| 
       2221 
2456 
     | 
    
         
             
                        "sampling parameters if available. Default is 'model'.",
         
     | 
| 
       2222 
2457 
     | 
    
         
             
                    )
         
     | 
| 
       2223 
     | 
    
         
            -
                    parser.add_argument(
         
     | 
| 
       2224 
     | 
    
         
            -
                        "--tool-server",
         
     | 
| 
       2225 
     | 
    
         
            -
                        type=str,
         
     | 
| 
       2226 
     | 
    
         
            -
                        default=None,
         
     | 
| 
       2227 
     | 
    
         
            -
                        help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
         
     | 
| 
       2228 
     | 
    
         
            -
                    )
         
     | 
| 
       2229 
2458 
     | 
    
         | 
| 
       2230 
2459 
     | 
    
         
             
                    # Data parallelism
         
     | 
| 
       2231 
2460 
     | 
    
         
             
                    parser.add_argument(
         
     | 
| 
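The relocated `--tool-server` flag accepts either the literal string `'demo'` or a comma-separated list of URLs, so consuming code has to branch on the value. A minimal sketch of that contract; `parse_tool_server` is a hypothetical helper, not SGLang's implementation:

```python
# Interpret a --tool-server value: None, the demo server, or a URL list.
from typing import List, Optional, Union


def parse_tool_server(value: Optional[str]) -> Union[None, str, List[str]]:
    if value is None:
        return None  # no tool server configured
    if value == "demo":
        return "demo"  # built-in demo tool server
    # Otherwise: comma-separated tool server URLs.
    return [url.strip() for url in value.split(",") if url.strip()]


assert parse_tool_server(None) is None
assert parse_tool_server("http://a:1, http://b:2") == ["http://a:1", "http://b:2"]
```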
```diff
@@ -2332,7 +2561,7 @@ class ServerArgs:
         parser.add_argument(
             "--lora-eviction-policy",
             type=str,
-            default=
+            default=ServerArgs.lora_eviction_policy,
             choices=["lru", "fifo"],
             help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
         )
```
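The fix replaces a hard-coded default with the class attribute, so the dataclass field stays the single source of truth for both the Python API and the CLI. The same pattern in isolation, as a generic argparse sketch rather than SGLang code:

```python
# Keep argparse defaults in sync with dataclass field defaults.
import argparse
from dataclasses import dataclass


@dataclass
class Args:
    lora_eviction_policy: str = "lru"

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> None:
        parser.add_argument(
            "--lora-eviction-policy",
            type=str,
            default=Args.lora_eviction_policy,  # reads the dataclass default
            choices=["lru", "fifo"],
        )


parser = argparse.ArgumentParser()
Args.add_cli_args(parser)
assert parser.parse_args([]).lora_eviction_policy == "lru"
```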
```diff
@@ -2408,7 +2637,6 @@ class ServerArgs:
         )
 
         # Speculative decoding
-        parser.add_argument("--enable-beta-spec", action="store_true")
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
@@ -2644,6 +2872,21 @@ class ServerArgs:
             default=ServerArgs.moe_dense_tp_size,
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
+        parser.add_argument(
+            "--elastic-ep-backend",
+            type=str,
+            default=ServerArgs.elastic_ep_backend,
+            choices=["none", "mooncake"],
+            help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
+        )
+        parser.add_argument(
+            "--mooncake-ib-device",
+            type=str,
+            default=ServerArgs.mooncake_ib_device,
+            help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
+            "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
+            "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
+        )
 
         # Mamba Cache
         parser.add_argument(
```
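These two arguments are the ones removed near the top of this section; the hunk above is their new position. Since the device flag is a comma-separated string, the backend needs a split step before it sees a device list; a sketch under that assumption, with `resolve_ib_devices` as a hypothetical helper:

```python
# Expand a --mooncake-ib-device value into a device list; None defers to
# the backend's automatic detection, per the help text above.
from typing import List, Optional


def resolve_ib_devices(arg: Optional[str]) -> Optional[List[str]]:
    if arg is None:
        return None  # Mooncake auto-detects devices
    return [dev.strip() for dev in arg.split(",") if dev.strip()]


assert resolve_ib_devices("mlx5_0,mlx5_1") == ["mlx5_0", "mlx5_1"]
assert resolve_ib_devices(None) is None
```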
```diff
@@ -2691,13 +2934,6 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
-        parser.add_argument(
-            "--radix-eviction-policy",
-            type=str,
-            choices=RADIX_EVICTION_POLICY_CHOICES,
-            default=ServerArgs.radix_eviction_policy,
-            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
-        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -2805,7 +3041,7 @@ class ServerArgs:
             "--ds-sparse-decode-threshold",
             type=int,
             default=ServerArgs.ds_sparse_decode_threshold,
-            help="The
+            help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
         )
 
         # Offloading
```
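The rewritten help string makes the threshold's semantics explicit: decode stays on the dense fallback below the threshold and switches to the sparse kernel at or above it. Reduced to a one-line rule (a hypothetical helper that restates the help text, not the backend's actual dispatch code):

```python
# Gate between dense fallback and sparse decode by sequence length.
def use_sparse_decode(seq_len: int, ds_sparse_decode_threshold: int) -> bool:
    return seq_len >= ds_sparse_decode_threshold


assert not use_sparse_decode(64, 256)  # short sequence: dense fallback
assert use_sparse_decode(1024, 256)    # long sequence: sparse kernel
```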
```diff
@@ -3111,26 +3347,20 @@ class ServerArgs:
             nargs="+",
             help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
         )
-
-        # Debug tensor dumps
-        parser.add_argument(
-            "--debug-tensor-dump-output-folder",
-            type=str,
-            default=ServerArgs.debug_tensor_dump_output_folder,
-            help="The output folder for dumping tensors.",
-        )
         parser.add_argument(
-            "--
-
-
-            help="The input filename for dumping tensors",
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
         )
         parser.add_argument(
-            "--
+            "--rl-on-policy-target",
             type=str,
-            default=ServerArgs.
-
+            default=ServerArgs.rl_on_policy_target,
+            choices=["fsdp"],
+            help="The training system that SGLang needs to match for true on-policy.",
         )
+
+        # Dynamic batch tokenizer
         parser.add_argument(
             "--enable-dynamic-batch-tokenizer",
             action="store_true",
```
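`--enable-deterministic-inference` is described in terms of batch invariant ops. The underlying issue is that floating-point reduction is not associative, so the same request can produce slightly different logits depending on how a batch is grouped; batch-invariant kernels fix the reduction order. A self-contained PyTorch demonstration of the effect (not SGLang code):

```python
# The same data reduced with two groupings can differ in the last ulps.
import torch

torch.manual_seed(0)
x = torch.randn(4096, dtype=torch.float32)

full = x.sum()                              # one reduction over everything
chunked = sum(c.sum() for c in x.chunk(7))  # same data, different grouping

# Mathematically equal, numerically often not; batch-invariant ops are
# written so grouping cannot change the result.
print((full - chunked).abs().item())
```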
```diff
@@ -3149,6 +3379,32 @@ class ServerArgs:
             help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
         )
 
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-layers",
+            type=int,
+            default=-1,
+            help="The layer number for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
```
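The debug tensor dump flags reappear here with two additions (`--debug-tensor-dump-layers` and `--debug-tensor-dump-inject`). Flags like these are usually wired to forward hooks that save each module's output; the following is a generic PyTorch sketch of that mechanism, not SGLang's dump implementation:

```python
# Save every submodule's forward output to a dump folder.
import os
import tempfile

import torch
from torch import nn


def attach_dump_hooks(model: nn.Module, output_folder: str) -> None:
    os.makedirs(output_folder, exist_ok=True)

    def make_hook(name: str):
        def hook(module, inputs, output):
            if isinstance(output, torch.Tensor):
                torch.save(
                    output.detach().cpu(), os.path.join(output_folder, f"{name}.pt")
                )

        return hook

    for name, module in model.named_modules():
        if name:  # skip the root module itself
            module.register_forward_hook(make_hook(name))


model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
attach_dump_hooks(model, tempfile.mkdtemp(prefix="tensor_dumps_"))
_ = model(torch.randn(2, 4))  # writes 0.pt and 1.pt into the dump folder
```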
```diff
@@ -3258,7 +3514,6 @@ class ServerArgs:
             default=None,
             help="The path of the PD-Multiplexing config file.",
         )
-
         parser.add_argument(
             "--sm-group-num",
             type=int,
@@ -3266,55 +3521,25 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )
 
-        #
+        # Configuration file support
         parser.add_argument(
-            "--
-
-            help="
+            "--config",
+            type=str,
+            help="Read CLI options from a config file. Must be a YAML file with configuration options.",
         )
 
-        #
+        # For Multi-Modal
         parser.add_argument(
-            "--
-
-
-
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutlass-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-cutedsl-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-trtllm-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
-        )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action=DeprecatedAction,
-            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
+            "--mm-max-concurrent-calls",
+            type=int,
+            default=ServerArgs.mm_max_concurrent_calls,
+            help="The max concurrent calls for async mm data processing.",
         )
-
-        # Configuration file support
         parser.add_argument(
-            "--
-            type=
-
+            "--mm-per-request-timeout",
+            type=int,
+            default=ServerArgs.mm_per_request_timeout,
+            help="The timeout for each multi-modal request in seconds.",
         )
 
     @classmethod
```
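`--config` reads CLI options from a YAML file. A common implementation feeds the loaded YAML through `parser.set_defaults`, so flags given explicitly on the command line still win. A sketch under that assumption; `load_config_defaults` and the `32` default are hypothetical, not SGLang's actual merge logic:

```python
# Apply YAML values as argparse defaults; explicit CLI flags take precedence.
import argparse
import tempfile

import yaml  # pyyaml


def load_config_defaults(parser: argparse.ArgumentParser, path: str) -> None:
    with open(path) as f:
        config = yaml.safe_load(f) or {}
    # YAML keys may use dashes; argparse dests use underscores.
    parser.set_defaults(**{k.replace("-", "_"): v for k, v in config.items()})


parser = argparse.ArgumentParser()
parser.add_argument("--mm-max-concurrent-calls", type=int, default=32)

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    f.write("mm-max-concurrent-calls: 8\n")

load_config_defaults(parser, f.name)
assert parser.parse_args([]).mm_max_concurrent_calls == 8
```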
```diff
@@ -3344,6 +3569,34 @@ class ServerArgs:
         )
         return hf_config
 
+    def get_model_config(self):
+        # Lazy init to avoid circular import
+        from sglang.srt.configs.model_config import ModelConfig
+
+        if hasattr(self, "model_config"):
+            return self.model_config
+        self.model_config = ModelConfig.from_server_args(self)
+        return self.model_config
+
+    def get_attention_backends(self):
+        prefill_attention_backend_str = (
+            self.prefill_attention_backend
+            if self.prefill_attention_backend
+            else self.attention_backend
+        )
+        decode_attention_backend_str = (
+            self.decode_attention_backend
+            if self.decode_attention_backend
+            else self.attention_backend
+        )
+        return prefill_attention_backend_str, decode_attention_backend_str
+
+    def use_mla_backend(self):
+        from sglang.srt.configs.model_config import AttentionArch
+
+        model_config = self.get_model_config()
+        return model_config.attention_arch == AttentionArch.MLA
+
     def check_server_args(self):
         # Check parallel size constraints
         assert (
```
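`get_model_config` memoizes on the instance: compute once, stash the result as an attribute, and return the cached object on every later call. The pattern reduced to its core (a generic sketch; `object()` stands in for `ModelConfig.from_server_args`):

```python
# Compute-once caching via hasattr, as in get_model_config above.
class Example:
    def get_model_config(self):
        if hasattr(self, "model_config"):
            return self.model_config
        self.model_config = object()  # placeholder for the expensive build
        return self.model_config


e = Example()
assert e.get_model_config() is e.get_model_config()  # same cached instance
```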
```diff
@@ -3721,6 +3974,13 @@ class PortArgs:
         else:
             nccl_port = server_args.nccl_port
 
+        if server_args.tokenizer_worker_num > 1:
+            tokenizer_worker_ipc_name = (
+                f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+            )
+        else:
+            tokenizer_worker_ipc_name = None
+
         if not server_args.enable_dp_attention:
             # Normal case, use IPC within a single node
             return PortArgs(
```
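The new branch builds `tokenizer_worker_ipc_name` the same way the existing `rpc_ipc_name` and `metrics_ipc_name` lines do: a throwaway file path from `tempfile` becomes a ZeroMQ-style `ipc://` address. In isolation:

```python
# An ipc:// endpoint backed by a unique temporary file path.
import tempfile

ipc_name = f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
print(ipc_name)  # e.g. ipc:///tmp/tmpab12cd34
```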
```diff
@@ -3730,7 +3990,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
-                tokenizer_worker_ipc_name=
+                tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3764,7 +4024,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
-                tokenizer_worker_ipc_name=
+                tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
             )
```