sglang 0.4.10__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +29 -68
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +57 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +43 -39
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +145 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -12
- sglang/srt/mem_cache/hiradix_cache.py +21 -4
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +350 -33
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/model_executor/cuda_graph_runner.py +42 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/deepseek_v2.py +28 -23
- sglang/srt/models/glm4_moe.py +85 -22
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -4
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +115 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +4 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +92 -81
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen3_moe.py
CHANGED
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty
 
 Qwen3MoeConfig = None
 
@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=
-                if global_server_args_dict["
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size =
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
    ) -> torch.Tensor:
 
-        if not global_server_args_dict["
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
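The substantive change here is that MoE all-to-all gating no longer reads a boolean flag: `global_server_args_dict["moe_a2a_backend"]` now holds an object with an `is_deepep()` method. A minimal sketch of such a wrapper, assuming an enum-based design (the real definition lands in the new `sglang/srt/layers/moe/utils.py` listed above; the name `MoeA2ABackend` and its member set are assumptions, not confirmed by this diff):

```python
# Illustrative sketch only: an enum-like wrapper whose is_deepep() replaces the
# boolean enable_deepep_moe lookups removed in this diff.
from enum import Enum
from typing import Optional


class MoeA2ABackend(Enum):
    NONE = "none"
    DEEPEP = "deepep"

    @classmethod
    def from_server_arg(cls, value: Optional[str]) -> "MoeA2ABackend":
        # ServerArgs.moe_a2a_backend is either None or "deepep".
        return cls.DEEPEP if value == "deepep" else cls.NONE

    def is_deepep(self) -> bool:
        return self is MoeA2ABackend.DEEPEP
```

Keeping the check behind a method means call sites like `forward()` above stay agnostic about how the backend was configured.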
sglang/srt/models/step3_vl.py
CHANGED
@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-        if global_server_args_dict["
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -868,7 +868,6 @@ class Step3VLForConditionalGeneration(nn.Module):
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        # TODO:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             (".qkv_proj", ".q_proj", 0),
@@ -901,9 +900,7 @@ class Step3VLForConditionalGeneration(nn.Module):
 
         for name, loaded_weight in weights:
             if "vision_model" in name:
-                # 1.It’s not great, but let’s leave it like this for now
                 name = name.replace("self_attn", "self_attn.attn")
-                # 2.
                 name = name.replace("out_proj", "proj")
 
             # TODO: support vision model
sglang/srt/multimodal/processors/base_processor.py
CHANGED
@@ -12,7 +12,6 @@ import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
-from sglang.srt.managers.mm_utils import TransportProxyTensor
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import load_audio, load_image, load_video, logger
 
@@ -218,8 +217,10 @@ class BaseMultimodalProcessor(ABC):
             kwargs["audio"] = audios
 
         processor = self._processor
-        if
-            processor
+        if (
+            hasattr(processor, "image_processor")
+            and isinstance(processor.image_processor, BaseImageProcessorFast)
+            and not self.server_args.disable_fast_image_processor
         ):
             kwargs["device"] = "cuda"
             result = processor.__call__(
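The rewritten condition only routes preprocessing to CUDA when the processor actually carries a torch-backed fast image processor and the user has not opted out via the `disable_fast_image_processor` server argument. The same gate as a standalone sketch (the helper name is invented for illustration):

```python
from transformers import BaseImageProcessorFast


def should_preprocess_on_cuda(processor, server_args) -> bool:
    # Mirrors the gate in BaseMultimodalProcessor above: require a fast
    # (torch-backed) image processor and no explicit opt-out.
    return (
        hasattr(processor, "image_processor")
        and isinstance(processor.image_processor, BaseImageProcessorFast)
        and not server_args.disable_fast_image_processor
    )
```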
sglang/srt/multimodal/processors/gemma3n.py
CHANGED
@@ -12,7 +12,6 @@
 # limitations under the License.
 # ==============================================================================
 
-import re
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.multimodal_processor import (
@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<image_soft_token>",
             image_token_id=hf_config.image_token_id,
-            image_token_regex=re.compile(
-                r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
-            ),
             audio_token="<audio_soft_token>",
             audio_token_id=hf_config.audio_token_id,
-            audio_token_regex=re.compile(
-                r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
-            ),
         ).build(_processor)
 
     async def process_mm_data_async(
|
|
4
4
|
import torch
|
5
5
|
|
6
6
|
from sglang.srt import operations
|
7
|
-
from sglang.srt.layers.moe.
|
7
|
+
from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
|
8
8
|
from sglang.srt.model_executor.forward_batch_info import ForwardMode
|
9
9
|
from sglang.srt.operations import Operation
|
10
10
|
|
sglang/srt/server_args.py
CHANGED
@@ -24,6 +24,7 @@ import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -148,6 +149,7 @@ class ServerArgs:
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
     lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -171,12 +173,11 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode:
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -197,7 +198,8 @@ class ServerArgs:
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
+    hicache_io_backend: str = "kernel"
+    hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
 
     # Double Sparsity
@@ -215,7 +217,9 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_profile_cuda_graph: bool = False
+    enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -269,7 +273,27 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
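The shim above keeps old launch scripts working by rewriting the deprecated flags onto their replacements inside `__post_init__`. A hedged sketch of the observable effect (constructing `ServerArgs` directly like this is illustrative; `__post_init__` also performs other environment-dependent checks, and the model path is a placeholder):

```python
from sglang.srt.server_args import ServerArgs

# Deprecated flags still parse, but are rewritten with a warning:
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tp_size=8,
    enable_ep_moe=True,      # deprecated
    enable_deepep_moe=True,  # deprecated
)
assert args.ep_size == args.tp_size      # set by the enable_ep_moe shim
assert args.moe_a2a_backend == "deepep"  # set by the enable_deepep_moe shim
```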
@@ -401,6 +425,22 @@ class ServerArgs:
         )
         self.page_size = 128
 
+        if self.attention_backend == "trtllm_mla":
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mla backend does not support speculative decoding yet."
+                )
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
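This validation block gives `trtllm_mla` three behaviors: it refuses to run on non-Blackwell hardware, coerces an unsupported page size to 64 with a warning instead of failing, and rejects speculative decoding outright. An illustrative check of the coercion, assuming SM100 hardware and an MLA-style model (the model path is a placeholder):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="deepseek-ai/DeepSeek-R1",  # placeholder MLA model
    attention_backend="trtllm_mla",
    page_size=16,                          # not in {32, 64}
)
# __post_init__ logs a warning and coerces the page size:
assert args.page_size == 64
```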
@@ -436,13 +476,13 @@ class ServerArgs:
             self.quantization == "modelopt_fp4"
         ), "modelopt_fp4 quantization is required for Flashinfer MOE"
         os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
-
-
-
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         # DeepEP MoE
-        if self.
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -466,7 +506,7 @@ class ServerArgs:
         )
 
         if self.enable_eplb:
-            assert self.
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None
 
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1131,6 +1171,7 @@ class ServerArgs:
             choices=[
                 "round_robin",
                 "shortest_queue",
+                "minimum_tokens",
             ],
         )
 
@@ -1198,6 +1239,12 @@ class ServerArgs:
             default=8,
             help="Maximum number of adapters for a running batch, include base-only request.",
         )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -1219,6 +1266,7 @@ class ServerArgs:
                 "torch_native",
                 "ascend",
                 "triton",
+                "trtllm_mla",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1333,30 +1381,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--
-
-
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1467,10 +1512,18 @@ class ServerArgs:
             default=ServerArgs.hicache_io_backend,
             help="The IO backend for KV cache transfer between CPU and GPU",
         )
+        parser.add_argument(
+            "--hicache-mem-layout",
+            type=str,
+            choices=["layer_first", "page_first"],
+            default=ServerArgs.hicache_mem_layout,
+            help="The layout of host memory pool for hierarchical cache.",
+        )
+
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs"],
+            choices=["file", "mooncake", "hf3fs", "nixl"],
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
@@ -1545,11 +1598,21 @@ class ServerArgs:
             action="store_true",
             help="Enable profiling of cuda graph capture.",
         )
+        parser.add_argument(
+            "--enable-cudagraph-gc",
+            action="store_true",
+            help="Enable garbage collection during CUDA graph capture. If disabled (default), GC is frozen during capture to speed up the process.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1805,6 +1868,18 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )
 
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1861,6 +1936,12 @@ class ServerArgs:
         if "Llama4" in model_arch:
             assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
 
+        if "Gemma2ForCausalLM" in model_arch:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.")
+            self.disable_hybrid_swa_memory = True
+
         # Check LoRA
         self.check_lora_server_args()
 
@@ -1935,6 +2016,19 @@ class ServerArgs:
             self.max_lora_rank and self.lora_target_modules
         ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
 
+        # Validate max_loaded_loras
+        if self.max_loaded_loras is not None:
+            assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+            )
+            assert (
+                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
+            ), (
+                "The number of LoRA paths should not exceed max_loaded_loras. "
+                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+            )
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
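Together these assertions enforce `max_loras_per_batch <= max_loaded_loras` and `len(lora_paths) <= max_loaded_loras`. An illustrative configuration that satisfies both (adapter names and model path are placeholders):

```python
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",       # placeholder base model
    lora_paths=["adapter_a", "adapter_b", "adapter_c"],  # 3 adapters resident
    max_loras_per_batch=2,  # at most 2 adapters in one running batch
    max_loaded_loras=3,     # CPU-memory cap; must be >= max_loras_per_batch
)
# 2 <= 3 and len(lora_paths) == 3 <= 3, so both assertions above hold.
```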
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
CHANGED
@@ -142,6 +142,22 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.global_num_tokens_for_logprob_gpu = None
         self.gathered_buffer = None
 
+        if hasattr(
+            self.model_runner.model_config.hf_config, "draft_vocab_size"
+        ):  # llama_eagle
+            vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+        elif hasattr(
+            self.model_runner.model_config.hf_config, "hot_vocab_size"
+        ):  # llama_eagle3
+            vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+        else:
+            vocab_size = self.model_runner.model_config.vocab_size
+
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_bs, vocab_size),
+            dtype=torch.float,
+        )
+
         # Capture
         try:
             with model_capture_mode():
@@ -189,6 +205,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         out_cache_loc = self.out_cache_loc[:num_tokens]
         positions = self.positions[:num_tokens]
         hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]
 
         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -238,6 +255,7 @@ class EAGLEDraftExtendCudaGraphRunner:
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,
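The buffer allocation walks a vocab-size fallback chain keyed off the draft model's HF config: `draft_vocab_size` (EAGLE), then `hot_vocab_size` (EAGLE3), then the full model vocabulary. A condensed equivalent of that chain, for illustration:

```python
def resolve_draft_vocab_size(model_config) -> int:
    # Same fallback order as the buffer allocation above.
    hf_cfg = model_config.hf_config
    if hasattr(hf_cfg, "draft_vocab_size"):  # llama_eagle
        return hf_cfg.draft_vocab_size
    if hasattr(hf_cfg, "hot_vocab_size"):    # llama_eagle3
        return hf_cfg.hot_vocab_size
    return model_config.vocab_size           # dense model fallback
```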
sglang/srt/two_batch_overlap.py
CHANGED
@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
     CommunicateSummableTensorPairFn,
     ScatterMode,
 )
-from sglang.srt.layers.moe.
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator,
+from sglang.srt.utils import BumpAllocator, get_bool_env_var
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
 
 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
 
@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
                 and not local_batch.forward_mode.is_target_verify()
                 )
                 and enable_deepep_moe
-                and (resolved_deepep_mode == DeepEPMode.
+                and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
             )
         else:
             self.local_tbo_split_seq_index = 0
@@ -563,6 +564,7 @@ class TboForwardBatchPreparer:
                 mm_inputs=None,
                 top_logprobs_nums=None,
                 token_ids_logprobs=None,
+                next_token_logits_buffer=None,
             )
         )
 
sglang/srt/utils.py
CHANGED
@@ -44,7 +44,6 @@ import traceback
 import warnings
 from collections import OrderedDict, defaultdict
 from contextlib import contextmanager
-from enum import Enum
 from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
@@ -93,6 +92,7 @@ logger = logging.getLogger(__name__)
 show_time_cost = False
 time_infos = {}
 
+
 HIP_FP8_E4M3_FNUZ_MAX = 224.0
 
 
@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
     return [nested_list]
 
 
-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
@@ -2344,6 +2323,7 @@ def is_fa3_default_architecture(hf_config):
         "Qwen3ForCausalLM",
         "Qwen3MoeForCausalLM",
         "Glm4MoeForCausalLM",
+        "Step3VLForConditionalGeneration",
     }
     return architectures[0] in default_archs
 
@@ -2413,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
         return True
     elif not server_args.enable_dp_lm_head:
         return True
-    elif
+    elif server_args.moe_a2a_backend is None:
         return True
     else:
         return (
@@ -2429,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
sglang/srt/weight_sync/utils.py
CHANGED