sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/conversation.py +1 -1
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +49 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +2 -8
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +27 -4
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -4
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +10 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/topk.py +5 -13
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/modelopt_quant.py +8 -4
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +53 -6
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/schedule_batch.py +13 -3
- sglang/srt/managers/scheduler.py +13 -25
- sglang/srt/managers/tokenizer_manager.py +28 -25
- sglang/srt/managers/tp_worker.py +2 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +30 -16
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +41 -23
- sglang/srt/models/deepseek_v2.py +1 -2
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +0 -4
- sglang/srt/models/qwen3_moe.py +1 -6
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +76 -55
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +17 -68
- sglang/test/test_activation.py +50 -1
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +5 -5
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +75 -72
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
CHANGED
```diff
@@ -68,6 +68,7 @@ from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.managers.schedule_batch import (
     GLOBAL_SERVER_ARGS_KEYS,
     global_server_args_dict,
@@ -108,7 +109,6 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
@@ -275,6 +275,7 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()
 
+        # Check if the model is using hybrid SWA
         if (
             not self.server_args.disable_hybrid_swa_memory
             and self.sliding_window_size is not None
@@ -377,6 +378,7 @@ class ModelRunner:
                 is_hopper_with_cuda_12_3()
                 and is_no_spec_infer_or_topk_one(server_args)
                 and is_fa3_default_architecture(self.model_config.hf_config)
+                and (not server_args.enable_hierarchical_cache)
             ):
                 server_args.attention_backend = "fa3"
             elif _is_hip:
@@ -389,7 +391,9 @@ class ModelRunner:
                 )
             else:
                 # MLA architecture
-                if is_hopper_with_cuda_12_3():
+                if is_hopper_with_cuda_12_3() and (
+                    not server_args.enable_hierarchical_cache
+                ):
                     server_args.attention_backend = "fa3"
                 elif is_sm100_supported():
                     server_args.attention_backend = "flashinfer"
@@ -890,44 +894,38 @@ class ModelRunner:
             tp_rank=self.tp_rank,
             max_lora_rank=self.server_args.max_lora_rank,
             target_modules=self.server_args.lora_target_modules,
+            lora_paths=self.server_args.lora_paths,
         )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
-        if result.success:
-            logger.info(
-                f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
-            )
-        else:
-            raise RuntimeError(f"Failed to load LoRA adapters: {result.error_message}")
 
-    def load_lora_adapter(self,
+    def load_lora_adapter(self, lora_ref: LoRARef):
         """Load a new lora adapter from disk or huggingface."""
 
         logger.info(
-            f"LoRA adapter loading starts:
+            f"LoRA adapter loading starts: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
-        result = self.lora_manager.load_lora_adapter(
+        result = self.lora_manager.load_lora_adapter(lora_ref)
 
         logger.info(
-            f"LoRA adapter loading completes:
+            f"LoRA adapter loading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
         return result
 
-    def unload_lora_adapter(self,
+    def unload_lora_adapter(self, lora_ref: LoRARef):
         """Unload a lora adapter that was previously loaded during initialization or dynamic loading."""
 
         logger.info(
-            f"LoRA adapter unloading starts:
+            f"LoRA adapter unloading starts: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )
 
-        result = self.lora_manager.unload_lora_adapter(
+        result = self.lora_manager.unload_lora_adapter(lora_ref)
 
         logger.info(
-            f"LoRA adapter unloading completes:
+            f"LoRA adapter unloading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
@@ -1010,8 +1008,11 @@ class ModelRunner:
         try:
             layers = self.model.language_model.model.layers
         except:
-            self.is_hybrid = False
-            return
+            try:
+                layers = self.model.language_model.layers
+            except:
+                self.is_hybrid = False
+                return
 
         for layer in layers:
             if (
@@ -1462,9 +1463,13 @@ class ModelRunner:
         tensor_parallel(self.model, device_mesh)
 
     def forward_decode(
-        self,
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
     ) -> LogitsProcessorOutput:
-        self.attn_backend.init_forward_metadata(forward_batch)
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
         # FIXME: add pp_proxy_tensors arg to all models
         kwargs = {}
         if self.support_pp:
@@ -1576,8 +1581,18 @@ class ModelRunner:
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
-
-
+            return ret, can_run_cuda_graph
+
+        # For MLP sync
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.prepare_mlp_sync_batch(self)
+
+        if forward_batch.forward_mode.is_decode():
+            ret = self.forward_decode(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
         elif forward_batch.forward_mode.is_extend():
             ret = self.forward_extend(
                 forward_batch,
@@ -1595,6 +1610,9 @@ class ModelRunner:
         else:
             raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
 
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.post_forward_mlp_sync_batch(ret)
+
         return ret, can_run_cuda_graph
 
     def _preprocess_logits(
```
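For orientation, the dynamic LoRA entry points on `ModelRunner` now take a single `LoRARef` object instead of separate name/path arguments. A minimal sketch of the caller side, using only the constructor fields that appear in the server_args changes later in this diff (`lora_name`, `lora_path`); the `model_runner` instance is a hypothetical stand-in:

```python
# Sketch only: LoRARef fields come from the server_args changes in this release;
# the model_runner instance is assumed to be constructed elsewhere.
from sglang.srt.lora.lora_registry import LoRARef

adapter = LoRARef(lora_name="my-adapter", lora_path="/models/loras/my-adapter")

# Hypothetical call sites on an existing ModelRunner:
# model_runner.load_lora_adapter(adapter)
# model_runner.unload_lora_adapter(adapter)   # the same LoRARef identifies the adapter
```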
sglang/srt/models/deepseek_v2.py
CHANGED
```diff
@@ -550,9 +550,8 @@ class DeepseekV2MoE(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-        forward_mode = forward_batch.forward_mode
         shared_output = None
-        if
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
             shared_output = self._forward_shared_experts(hidden_states)
```
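The DeepEP path now gates router and shared-expert work on a plain shape check instead of a forward-mode helper. A small illustrative sketch of that guard (tensor shapes are made up):

```python
# Illustrative only: an idle data-parallel rank receives zero tokens, so the
# gate/shared-expert work is skipped entirely by the shape check.
import torch

hidden_states = torch.empty(0, 7168)   # zero tokens on this rank; hidden size arbitrary
shared_output = None
if hidden_states.shape[0] > 0:         # replaces the forward_mode-based idle/empty check
    ...                                # router + shared experts would run here
```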
sglang/srt/models/mllama4.py
CHANGED
```diff
@@ -23,6 +23,7 @@ from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.quant_config = quant_config
 
         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.
-        if not self.
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )
 
+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
@@ -269,7 +274,9 @@ class Llama4ForConditionalGeneration(nn.Module):
 
     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )
 
     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
```
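Net effect of the mllama4 change: the vision tower is only instantiated when the checkpoint actually ships vision weights and multimodal is enabled at the server level (via the `enable_multimodal` key in `global_server_args_dict`). A toy sketch of the gating with illustrative values:

```python
# Illustrative values; in the real model these come from checkpoint inspection
# and from global_server_args_dict["enable_multimodal"].
has_vision_weights = True        # checkpoint contains vision tensors
enable_multimodal = False        # e.g. server launched without multimodal enabled

has_vision = has_vision_weights and enable_multimodal
print(has_vision)                # False -> text-only; vision/projector weights are skipped at load
```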
sglang/srt/models/qwen2_moe.py
CHANGED
```diff
@@ -43,10 +43,6 @@ from sglang.srt.layers.communicator import (
     ScatterMode,
 )
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
```
sglang/srt/models/qwen3_moe.py
CHANGED
```diff
@@ -38,10 +38,6 @@ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
@@ -193,8 +189,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-
-        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits, _ = self.gate(hidden_states)
             topk_weights, topk_idx, _ = self.topk(
```
sglang/srt/reasoning_parser.py
CHANGED
```diff
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.
 
+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>
-
-
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use "qwen3-thinking" parser type for those models instead.
 
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """
 
     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
         super().__init__(
             "<think>",
             "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+      *(.*)</think>
+
+    These models always generate thinking content without <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
         "kimi": KimiDetector,
     }
 
```
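For reference, the new detector is selected through the `qwen3-thinking` key in `ReasoningParser.DetectorMap`. A hedged sketch of using it directly; only the constructor arguments are shown in the diff above, so the parse method name below is an assumption and is left commented out:

```python
# Sketch only: Qwen3-Thinking output carries no <think> start tag, so the detector
# is built with force_reasoning=True and splits on </think>.
from sglang.srt.reasoning_parser import Qwen3ThinkingDetector

detector = Qwen3ThinkingDetector(stream_reasoning=False)

text = "First I compare the two options...</think>The answer is 42."
# result = detector.detect_and_parse(text)   # hypothetical method name, for illustration
# expected split: reasoning_text="First I compare the two options...",
#                 normal_text="The answer is 42."
```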
sglang/srt/sampling/sampling_batch_info.py
CHANGED
```diff
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
             # Set the flag to True if any of the two has custom logit processor
             self.has_custom_logit_processor = True
 
+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
 
-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-
 
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
```
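The only functional change here is ordering: `logit_bias` is now merged before the temperatures tensor is concatenated, because `len(self)` and `len(other)` are backed by the temperatures tensor and must still reflect the pre-merge batch sizes. A stand-alone illustration of the hazard; the class below is a stand-in, not the real `SamplingBatchInfo`:

```python
# Stand-in class to show why merge order matters: __len__ reads the
# temperatures tensor, so lengths taken after the concat are already merged.
import torch

class FakeBatch:
    def __init__(self, temperatures: torch.Tensor):
        self.temperatures = temperatures

    def __len__(self) -> int:
        return self.temperatures.shape[0]

a = FakeBatch(torch.full((3, 1), 0.7))
b = FakeBatch(torch.full((2, 1), 1.0))

len_a, len_b = len(a), len(b)                  # 3 and 2: correct sizes for padding logit_bias
a.temperatures = torch.cat([a.temperatures, b.temperatures], dim=0)
assert (len(a), len_a, len_b) == (5, 3, 2)     # after the concat, len(a) no longer equals the old size
```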
sglang/srt/server_args.py
CHANGED
```diff
@@ -20,10 +20,10 @@ import logging
 import os
 import random
 import tempfile
-from token import OP
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int =
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -145,7 +145,7 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -266,31 +266,20 @@ class ServerArgs:
 
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
 
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
@@ -359,7 +348,6 @@ class ServerArgs:
             self.chunked_prefill_size = 16384
         else:
             self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -410,6 +398,14 @@ class ServerArgs:
             )
             self.page_size = 128
 
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
@@ -431,6 +427,13 @@ class ServerArgs:
             self.enable_dp_attention
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -502,14 +505,6 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
-            elif "Llama4" in model_arch:
-                # TODO: remove this after Llama4 supports in other backends
-                if self.attention_backend != "fa3":
-                    self.attention_backend = "fa3"
-                    logger.warning(
-                        "Llama4 requires using fa3 attention backend. "
-                        "Attention backend is automatically set to fa3."
-                    )
 
             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -542,12 +537,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
 
         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -572,6 +566,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -580,9 +575,6 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
        )
 
-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -1099,10 +1091,10 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
-                "
+                "qwen3_coder",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )
 
         # Data parallelism
```
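The `--tool-call-parser` choices gain `qwen3_coder` (matching the rename of `qwen3_detector.py` to `qwen3_coder_detector.py` in the file list above). A hedged sketch of selecting it; the construction is commented out because a real `ServerArgs` needs a resolvable model path and device, and the CLI form is assumed to follow the existing launch pattern:

```python
# Sketch: the new parser value as it is passed at launch time.
from sglang.srt.server_args import ServerArgs

# args = ServerArgs(model_path="/path/to/model", tool_call_parser="qwen3_coder")
# CLI equivalent (assumed unchanged from existing usage):
#   python -m sglang.launch_server --model-path /path/to/model --tool-call-parser qwen3_coder
print(ServerArgs.tool_call_parser)  # class-level default referenced by add_cli_args above
```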
```diff
@@ -1227,6 +1219,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1276,13 +1275,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1530,11 +1522,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1792,11 +1779,11 @@ class ServerArgs:
         return hf_config
 
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
 
-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1807,11 +1794,7 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1820,9 +1803,32 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"
 
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
         self.check_lora_server_args()
 
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
```
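The divisibility assertion between `chunked_prefill_size` and `page_size` moved from `__post_init__` (where `page_size` could still be `None`) into `check_server_args`, after both values have been resolved. Worked numbers, using defaults that appear earlier in this diff:

```python
# With the defaults set in __post_init__ above, page_size falls back to 1 and the
# check is trivially satisfied; a backend that forces 128-token pages also passes.
chunked_prefill_size, page_size = 16384, 128
assert chunked_prefill_size % page_size == 0   # 16384 / 128 = 128 pages per prefill chunk
```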
```diff
@@ -1843,9 +1849,24 @@ class ServerArgs:
             for lora_path in lora_paths:
                 if "=" in lora_path:
                     name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
+                    self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
                 else:
-                    self.lora_paths[lora_path] =
+                    self.lora_paths[lora_path] = LoRARef(
+                        lora_name=lora_path,
+                        lora_path=lora_path,
+                    )
+        elif isinstance(self.lora_paths, dict):
+            self.lora_paths = {
+                k: LoRARef(lora_name=k, lora_path=v)
+                for k, v in self.lora_paths.items()
+            }
+        elif self.lora_paths is None:
+            self.lora_paths = {}
+        else:
+            raise ValueError(
+                f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                "Expected a list or a dictionary."
+            )
 
         # Expand target modules
         if self.lora_target_modules:
```
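After this normalization, every accepted `--lora-paths` spelling ends up as a `dict[str, LoRARef]`. A small sketch of the input shapes and the resulting mapping; adapter names and paths are made up:

```python
# Input spellings (illustrative):
#   --lora-paths sql-adapter=/loras/sql /loras/chat   (list form, with and without "name=")
#   {"sql-adapter": "/loras/sql"}                     (dict form)
#   None                                              (no adapters -> {})
from sglang.srt.lora.lora_registry import LoRARef

normalized = {
    "sql-adapter": LoRARef(lora_name="sql-adapter", lora_path="/loras/sql"),
    "/loras/chat": LoRARef(lora_name="/loras/chat", lora_path="/loras/chat"),
}
```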