sglang 0.4.5.post3__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
Files changed (70)
  1. sglang/bench_one_batch.py +19 -3
  2. sglang/bench_serving.py +8 -9
  3. sglang/compile_deep_gemm.py +45 -4
  4. sglang/srt/code_completion_parser.py +1 -1
  5. sglang/srt/configs/deepseekvl2.py +1 -1
  6. sglang/srt/configs/model_config.py +9 -3
  7. sglang/srt/constrained/llguidance_backend.py +78 -61
  8. sglang/srt/conversation.py +34 -1
  9. sglang/srt/disaggregation/decode.py +59 -11
  10. sglang/srt/disaggregation/mini_lb.py +45 -8
  11. sglang/srt/disaggregation/mooncake/conn.py +198 -31
  12. sglang/srt/disaggregation/prefill.py +24 -9
  13. sglang/srt/entrypoints/http_server.py +8 -2
  14. sglang/srt/function_call_parser.py +77 -5
  15. sglang/srt/layers/attention/base_attn_backend.py +3 -0
  16. sglang/srt/layers/attention/flashattention_backend.py +28 -10
  17. sglang/srt/layers/attention/flashmla_backend.py +8 -11
  18. sglang/srt/layers/attention/vision.py +2 -0
  19. sglang/srt/layers/layernorm.py +38 -16
  20. sglang/srt/layers/logits_processor.py +2 -2
  21. sglang/srt/layers/moe/fused_moe_native.py +2 -4
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
  25. sglang/srt/layers/pooler.py +6 -0
  26. sglang/srt/layers/quantization/awq.py +5 -1
  27. sglang/srt/layers/quantization/deep_gemm.py +17 -10
  28. sglang/srt/layers/quantization/int8_kernel.py +32 -1
  29. sglang/srt/layers/radix_attention.py +13 -3
  30. sglang/srt/layers/rotary_embedding.py +170 -126
  31. sglang/srt/managers/data_parallel_controller.py +10 -3
  32. sglang/srt/managers/io_struct.py +7 -0
  33. sglang/srt/managers/mm_utils.py +85 -28
  34. sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
  35. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
  36. sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
  37. sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
  38. sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
  39. sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
  40. sglang/srt/managers/schedule_batch.py +29 -12
  41. sglang/srt/managers/scheduler.py +31 -20
  42. sglang/srt/managers/tokenizer_manager.py +5 -1
  43. sglang/srt/mem_cache/memory_pool.py +87 -0
  44. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  45. sglang/srt/model_executor/forward_batch_info.py +51 -95
  46. sglang/srt/model_executor/model_runner.py +11 -24
  47. sglang/srt/models/deepseek.py +12 -2
  48. sglang/srt/models/deepseek_nextn.py +101 -6
  49. sglang/srt/models/deepseek_v2.py +144 -70
  50. sglang/srt/models/deepseek_vl2.py +9 -4
  51. sglang/srt/models/gemma3_causal.py +1 -1
  52. sglang/srt/models/llama4.py +0 -1
  53. sglang/srt/models/minicpmo.py +5 -1
  54. sglang/srt/models/mllama4.py +2 -2
  55. sglang/srt/models/qwen2_5_vl.py +3 -6
  56. sglang/srt/models/qwen2_vl.py +3 -7
  57. sglang/srt/models/roberta.py +178 -0
  58. sglang/srt/openai_api/adapter.py +18 -8
  59. sglang/srt/server_args.py +15 -22
  60. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  61. sglang/srt/torch_memory_saver_adapter.py +10 -1
  62. sglang/srt/utils.py +2 -1
  63. sglang/test/runners.py +6 -13
  64. sglang/test/test_utils.py +36 -18
  65. sglang/version.py +1 -1
  66. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA +4 -5
  67. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/RECORD +70 -68
  68. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
  69. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
  70. {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py
@@ -38,7 +38,7 @@ import triton
 import triton.language as tl

 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
-from sglang.srt.utils import get_compiler_backend
+from sglang.srt.utils import flatten_nested_list, get_compiler_backend

 if TYPE_CHECKING:
     from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
@@ -364,23 +364,23 @@ class ForwardBatch:

     def merge_mm_inputs(self) -> Optional[MultimodalInputs]:
         """
-        Merge all image inputs in the batch into a single MultiModalInputs object.
+        Merge all multimodal inputs in the batch into a single MultiModalInputs object.

         Returns:
-            if none, current batch contains no image input
+            if none, current batch contains no multimodal input

         """
         if not self.mm_inputs or all(x is None for x in self.mm_inputs):
             return None
-
         # Filter out None values
         valid_inputs = [x for x in self.mm_inputs if x is not None]

-        # Start with the first valid image input
-        merged = valid_inputs[0]
+        # TODO: is it expensive?
+        # a workaround to avoid importing `MultimodalInputs`
+        merged = valid_inputs[0].__class__(mm_items=[])

         # Merge remaining inputs
-        for mm_input in valid_inputs[1:]:
+        for mm_input in valid_inputs:
             merged.merge(mm_input)

         return merged
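
The rewritten merge above starts from a fresh, empty object and folds every valid input into it, instead of merging into (and thereby mutating) the first request's input. A minimal illustrative sketch of that pattern, using a hypothetical stand-in class rather than sglang's actual MultimodalInputs:

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class FakeMultimodalInputs:
    # Hypothetical stand-in for sglang's MultimodalInputs, for illustration only.
    mm_items: List[str] = field(default_factory=list)

    def merge(self, other: "FakeMultimodalInputs") -> None:
        # The real merge() accumulates per-request items; this mimics that behavior.
        self.mm_items.extend(other.mm_items)


def merge_mm_inputs(mm_inputs: List[Optional[FakeMultimodalInputs]]):
    if not mm_inputs or all(x is None for x in mm_inputs):
        return None
    valid_inputs = [x for x in mm_inputs if x is not None]
    # Build an empty object of the same class so no per-request input is mutated.
    merged = valid_inputs[0].__class__(mm_items=[])
    for mm_input in valid_inputs:
        merged.merge(mm_input)
    return merged


a = FakeMultimodalInputs(mm_items=["image_0"])
b = FakeMultimodalInputs(mm_items=["image_1", "image_2"])
print(merge_mm_inputs([a, None, b]).mm_items)  # ['image_0', 'image_1', 'image_2']
print(a.mm_items)                              # ['image_0'] -- left untouched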
@@ -407,104 +407,60 @@ class ForwardBatch:
     def _compute_mrope_positions(
         self, model_runner: ModelRunner, batch: ModelWorkerBatch
     ):
-        device = model_runner.device
-        hf_config = model_runner.model_config.hf_config
-        mrope_positions_list = [None] * self.seq_lens.shape[0]
-        if self.forward_mode.is_decode():
-            for i, _ in enumerate(mrope_positions_list):
-                mrope_position_delta = (
-                    0
-                    if batch.multimodal_inputs[i] is None
-                    else batch.multimodal_inputs[i].mrope_position_delta
-                )
-                mrope_positions_list[i] = MRotaryEmbedding.get_next_input_positions(
-                    mrope_position_delta,
-                    int(self.seq_lens[i]) - 1,
-                    int(self.seq_lens[i]),
+        # batch_size * [3 * seq_len]
+        batch_size = self.seq_lens.shape[0]
+        mrope_positions_list = [[]] * batch_size
+        for batch_idx in range(batch_size):
+            mm_input = batch.multimodal_inputs[batch_idx]
+            if self.forward_mode.is_decode():
+                mrope_position_deltas = (
+                    [0]
+                    if mm_input is None
+                    else flatten_nested_list(mm_input.mrope_position_delta.tolist())
                 )
-        elif self.forward_mode.is_extend():
-            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
-            for i, mm_input in enumerate(batch.multimodal_inputs):
-                extend_start_loc, extend_seq_len, extend_prefix_len = (
-                    extend_start_loc_cpu[i],
-                    batch.extend_seq_lens[i],
-                    batch.extend_prefix_lens[i],
+                next_input_positions = []
+                for mrope_position_delta in mrope_position_deltas:
+                    # batched deltas needs to be processed separately
+                    # Convert list of lists to tensor with shape [3, seq_len]
+                    next_input_positions += [
+                        MRotaryEmbedding.get_next_input_positions(
+                            mrope_position_delta,
+                            int(self.seq_lens[batch_idx]) - 1,
+                            int(self.seq_lens[batch_idx]),
+                        )
+                    ]
+                # 3 * N
+                mrope_positions_list[batch_idx] = torch.cat(next_input_positions, dim=1)
+            elif self.forward_mode.is_extend():
+                extend_seq_len, extend_prefix_len = (
+                    batch.extend_seq_lens[batch_idx],
+                    batch.extend_prefix_lens[batch_idx],
                 )
                 if mm_input is None:
                     # text only
-                    mrope_positions = [
+                    mrope_positions = torch.tensor(
                         [
-                            pos
-                            for pos in range(
-                                extend_prefix_len, extend_prefix_len + extend_seq_len
-                            )
+                            [
+                                pos
+                                for pos in range(
+                                    extend_prefix_len,
+                                    extend_prefix_len + extend_seq_len,
+                                )
+                            ]
                         ]
-                    ] * 3
-                else:
-                    image_grid_thws_list = [
-                        item.image_grid_thws
-                        for item in mm_input.mm_items
-                        if item.image_grid_thws is not None
-                    ]
-                    image_grid_thw = (
-                        None
-                        if len(image_grid_thws_list) == 0
-                        else torch.cat(image_grid_thws_list, dim=0)
-                    )
-
-                    video_grid_thws_list = [
-                        item.video_grid_thws
-                        for item in mm_input.mm_items
-                        if item.video_grid_thws is not None
-                    ]
-                    video_grid_thw = (
-                        None
-                        if len(video_grid_thws_list) == 0
-                        else torch.cat(video_grid_thws_list, dim=0)
+                        * 3
                     )
-
-                    second_per_grid_ts_list = [
-                        item.second_per_grid_ts
-                        for item in mm_input.mm_items
-                        if item.second_per_grid_ts is not None
+                else:
+                    mrope_positions = mm_input.mrope_positions[
+                        :,
+                        extend_prefix_len : extend_prefix_len + extend_seq_len,
                     ]
-                    second_per_grid_ts = (
-                        None
-                        if len(second_per_grid_ts_list) == 0
-                        else torch.cat(second_per_grid_ts_list, dim=0)
-                    )
-
-                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
-                    mrope_positions, mrope_position_delta = (
-                        MRotaryEmbedding.get_input_positions(
-                            input_tokens=self.input_ids[
-                                extend_start_loc : extend_start_loc + extend_seq_len
-                            ].tolist(),
-                            image_grid_thw=image_grid_thw,
-                            video_grid_thw=video_grid_thw,
-                            image_token_id=hf_config.image_token_id,
-                            video_token_id=hf_config.video_token_id,
-                            vision_start_token_id=hf_config.vision_start_token_id,
-                            vision_end_token_id=hf_config.vision_end_token_id,
-                            spatial_merge_size=hf_config.vision_config.spatial_merge_size,
-                            context_len=0,
-                            seq_len=len(self.input_ids),
-                            second_per_grid_ts=second_per_grid_ts,
-                            tokens_per_second=getattr(
-                                hf_config.vision_config, "tokens_per_second", None
-                            ),
-                        )
-                    )
-                    batch.multimodal_inputs[i].mrope_position_delta = (
-                        mrope_position_delta
-                    )
-                mrope_positions_list[i] = mrope_positions
+                mrope_positions_list[batch_idx] = mrope_positions

         self.mrope_positions = torch.cat(
-            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
-            axis=1,
-        )
-        self.mrope_positions = self.mrope_positions.to(torch.int64)
+            [pos.to(device=model_runner.device) for pos in mrope_positions_list],
+            dim=1,
+        ).to(dtype=torch.int64, device=model_runner.device)

     def get_max_chunk_capacity(self):
         # Maximum number of tokens in each chunk
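
In the new code path above, each request contributes a [3, seq_len] block of M-RoPE positions (one row each for the temporal, height, and width components), and the per-request blocks are concatenated along dim=1 into a single batch tensor. An illustrative sketch of that layout with made-up tensors (the real code builds each block from MRotaryEmbedding and the cached multimodal inputs):

import torch

# Each request contributes a [3, seq_len_i] block; concatenating along dim=1
# yields one [3, sum(seq_len_i)] tensor of int64 positions for the whole batch.
def concat_mrope_positions(per_request_positions):
    return torch.cat(per_request_positions, dim=1).to(torch.int64)

req0 = torch.arange(4).repeat(3, 1)        # [3, 4] block for a 4-token request
req1 = torch.arange(2).repeat(3, 1) + 10   # [3, 2] block for a 2-token request
print(concat_mrope_positions([req0, req1]).shape)  # torch.Size([3, 6])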
sglang/srt/model_executor/model_runner.py
@@ -91,11 +91,14 @@ from sglang.srt.utils import (
     set_cuda_arch,
 )

-logger = logging.getLogger(__name__)
-
+# Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
+
+# Detect stragger ranks in model loading
 UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300

+logger = logging.getLogger(__name__)
+

 class ModelRunner:
     """ModelRunner runs the forward passes of the models."""
@@ -177,7 +180,7 @@ class ModelRunner:
         if _ENABLE_JIT_DEEPGEMM:
             update_deep_gemm_config(gpu_id, server_args)

-        # If it is a draft model tp_group can be different.
+        # If it is a draft model, tp_group can be different
        self.initialize(min_per_gpu_memory)

     def initialize(self, min_per_gpu_memory: float):
@@ -230,7 +233,8 @@ class ModelRunner:

         if server_args.attention_backend is None:
             """
-            We auto select the fastest attention backend according to the current offering
+            Auto select the fastest attention backend.
+
             1. Models with MHA Architecture (e.g: Llama, QWen)
                 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
                 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
@@ -240,6 +244,7 @@
             """

             if not self.use_mla_backend:
+                # MHA architecture
                 if (
                     is_hopper_with_cuda_12_3()
                     and is_no_spec_infer_or_topk_one(server_args)
@@ -251,6 +256,7 @@ class ModelRunner:
                         "flashinfer" if is_flashinfer_available() else "triton"
                     )
             else:
+                # MLA architecture
                 if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
                 else:
@@ -259,7 +265,6 @@ class ModelRunner:
                 f"Attention backend not set. Use {server_args.attention_backend} backend by default."
             )
         elif self.use_mla_backend:
-            # TODO: add MLA optimization on CPU
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
                     "flashinfer",
@@ -275,7 +280,7 @@ class ModelRunner:
                        f"Invalid attention backend for MLA: {server_args.attention_backend}"
                    )
            else:
-                raise ValueError(f"MLA optimization not supported on CPU.")
+                raise ValueError("MLA optimization not supported on CPU.")

        if (
            server_args.attention_backend == "fa3"
@@ -310,18 +315,6 @@ class ModelRunner:
            )
            server_args.chunked_prefill_size = -1

-        if self.model_config.hf_config.architectures == [
-            "Qwen2VLForConditionalGeneration"
-        ] or self.model_config.hf_config.architectures == [
-            "Qwen2_5_VLForConditionalGeneration"
-        ]:
-            # TODO: qwen2-vl series does not support radix cache now, set disable_radix_cache=True automatically
-            logger.info("Automatically disable radix cache for qwen-vl series.")
-            server_args.disable_radix_cache = True
-
-        if server_args.enable_deepep_moe:
-            logger.info(f"DeepEP is turned on. DeepEP mode: {server_args.deepep_mode}")
-
        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True
        elif self.page_size > 1:
@@ -964,12 +957,6 @@ class ModelRunner:
            return

        if self.server_args.disable_cuda_graph:
-            logger.warning(
-                "\n\nCUDA Graph is DISABLED.\n"
-                "This will cause significant performance degradation.\n"
-                "CUDA Graph should almost never be disabled in most usage scenarios.\n"
-                "If you encounter OOM issues, please try setting --mem-fraction-static to a lower value (such as 0.8 or 0.7) instead of disabling CUDA Graph.\n"
-            )
            return

        tic = time.time()
sglang/srt/models/deepseek.py
@@ -382,8 +382,14 @@ class DeepseekModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -416,14 +422,18 @@ class DeepseekForCausalLM(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config)

+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
     @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch)
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.logits_processor(
             input_ids, hidden_states, self.lm_head, forward_batch
         )
sglang/srt/models/deepseek_nextn.py
@@ -13,12 +13,14 @@
 # ==============================================================================

 """Inference-only DeepSeek NextN Speculative Decoding."""
+import logging
 from typing import Iterable, Optional, Tuple

 import torch
 from torch import nn
 from transformers import PretrainedConfig

+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import ReplicatedLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
@@ -51,6 +53,9 @@ else:
     from vllm._custom_ops import awq_dequantize


+logger = logging.getLogger(__name__)
+
+
 class DeepseekModelNextN(nn.Module):
     def __init__(
         self,
@@ -134,7 +139,9 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
     ) -> None:
         nn.Module.__init__(self)
         self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
+        self.determine_n_share_experts_fusion("DeepseekV3ForCausalLMNextN")

         self.model = DeepseekModelNextN(
             config, quant_config, prefix=add_prefix("model", prefix)
@@ -182,6 +189,48 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
+        if self.n_share_experts_fusion > 0:
+            logger.info(
+                f"Cloning {self.n_share_experts_fusion} "
+                "replicas of the shared expert into MoE for DeepseekV3ForCausalLMNextN"
+            )
+            weights_list = list(weights)
+            weights_dict = dict(weights_list)
+            if self.quant_config is None or self.quant_config.get_name() == "w8a8_int8":
+                suffix_list = [
+                    "down_proj.weight",
+                    "down_proj.weight_scale",
+                    "gate_proj.weight",
+                    "gate_proj.weight_scale",
+                    "up_proj.weight",
+                    "up_proj.weight_scale",
+                ]
+            else:
+                suffix_list = [
+                    "down_proj.weight",
+                    "down_proj.weight_scale_inv",
+                    "gate_proj.weight",
+                    "gate_proj.weight_scale_inv",
+                    "up_proj.weight",
+                    "up_proj.weight_scale_inv",
+                ]
+            names_to_remove = []
+            for suffix in suffix_list:
+                shared_expert_weight_name = (
+                    f"model.layers.0.mlp.shared_experts.{suffix}"
+                )
+                for num_repeat in range(self.n_share_experts_fusion):
+                    weights_list.append(
+                        (
+                            f"model.layers.0."
+                            f"mlp.experts."
+                            f"{self.config.n_routed_experts + num_repeat}"
+                            f".{suffix}",
+                            weights_dict[shared_expert_weight_name],
+                        )
+                    )
+                names_to_remove += [shared_expert_weight_name]
+            weights = [w for w in weights_list if w[0] not in names_to_remove]

         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
@@ -190,8 +239,14 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
-            num_experts=self.config.n_routed_experts,
+            num_experts=self.config.n_routed_experts + self.n_share_experts_fusion,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
         )
+        cached_a_proj = {} if fuse_qkv_a_proj else None

         nextn_layer_prefix = "model.layers.0"
         nextn_spec_weight_names = [
@@ -264,11 +319,51 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                    if name.endswith(".bias") and name not in params_dict:
                        continue

-                    param = params_dict[name]
-                    weight_loader = getattr(
-                        param, "weight_loader", default_weight_loader
-                    )
-                    weight_loader(param, loaded_weight)
+                    # Handle fused_qkv_a_proj
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=0
+                            )
+
+                            param_name = name.replace(
+                                "q_a_proj", "fused_qkv_a_proj_with_mqa"
+                            )
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, fused_weight)
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)

        self_attn = self.model.decoder.self_attn
        if hasattr(self_attn.kv_b_proj, "qweight"):
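
The weight-loading change above caches the q_a_proj and kv_a_proj_with_mqa checkpoint tensors until both have been seen, then concatenates them along the output (row) dimension and loads the result into the single fused_qkv_a_proj_with_mqa parameter. A small illustrative sketch of the fusion step, with made-up shapes:

import torch

# Made-up DeepSeek-style sizes, for illustration only.
q_lora_rank = 1536
kv_lora_rank_plus_rope = 576
hidden_size = 7168

q_a_proj_weight = torch.randn(q_lora_rank, hidden_size)
kv_a_proj_weight = torch.randn(kv_lora_rank_plus_rope, hidden_size)

# Same operation as the diff: concatenate along the output (row) dimension.
fused_weight = torch.cat([q_a_proj_weight, kv_a_proj_weight], dim=0)
print(fused_weight.shape)  # torch.Size([2112, 7168])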