sglang 0.4.2__py3-none-any.whl → 0.4.2.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -149,6 +149,7 @@ class Scheduler:
             if not self.spec_algorithm.is_none()
             else 1
         )
+        self.enable_hierarchical_cache = server_args.enable_hierarchical_cache

         # Distributed rank info
         self.dp_size = server_args.dp_size
@@ -831,10 +832,16 @@ class Scheduler:
         available_size = (
             self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
         )
-        if available_size != self.max_total_num_tokens:
+        protected_size = self.tree_cache.protected_size()
+        memory_leak = available_size != (
+            self.max_total_num_tokens
+            if not self.enable_hierarchical_cache
+            else self.max_total_num_tokens - protected_size
+        )
+        if memory_leak:
             msg = (
                 "KV cache pool leak detected!"
-                f"{available_size=}, {self.max_total_num_tokens=}\n"
+                f"{available_size=}, {protected_size=}, {self.max_total_num_tokens=}\n"
             )
             warnings.warn(msg)
             if crash_on_warnings():
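The reworked check above treats KV tokens that the hierarchical cache has pinned ("protected") as expected to be absent from the free pool rather than leaked. A minimal standalone sketch of that invariant; the function name and the numbers are illustrative, not part of sglang:

```python
def looks_like_kv_leak(
    available_size: int,
    max_total_num_tokens: int,
    protected_size: int,
    enable_hierarchical_cache: bool,
) -> bool:
    """Same rule as the patched Scheduler check: locked (protected) tokens
    may legitimately be missing from the free pool when hierarchical caching is on."""
    expected = (
        max_total_num_tokens
        if not enable_hierarchical_cache
        else max_total_num_tokens - protected_size
    )
    return available_size != expected

# 1000-token pool, 40 tokens currently locked by in-flight requests:
assert not looks_like_kv_leak(960, 1000, 40, enable_hierarchical_cache=True)
assert looks_like_kv_leak(960, 1000, 0, enable_hierarchical_cache=False)
```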
@@ -949,7 +956,14 @@ class Scheduler:
             res = adder.add_one_req(req)
             if res != AddReqResult.CONTINUE:
                 if res == AddReqResult.NO_TOKEN:
-                    self.batch_is_full = True
+                    if self.enable_hierarchical_cache:
+                        # Set batch_is_full after making sure there are requests that can be served
+                        self.batch_is_full = len(adder.can_run_list) > 0 or (
+                            self.running_batch is not None
+                            and not self.running_batch.is_empty()
+                        )
+                    else:
+                        self.batch_is_full = True
                 break
             if self.server_args.prefill_only_one_req:
                 break
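With the hierarchical cache enabled, `NO_TOKEN` can be a transient state while KV data is still being loaded back from host memory, so the scheduler now only marks the batch as full once something can actually make progress. A hedged sketch of the same decision as a pure function (the name and parameters are illustrative):

```python
def should_mark_batch_full(
    num_can_run: int,
    running_batch_nonempty: bool,
    enable_hierarchical_cache: bool,
) -> bool:
    # Without the hierarchical cache, NO_TOKEN always means the batch is full.
    if not enable_hierarchical_cache:
        return True
    # With it, only stop admitting requests if some request can still be served.
    return num_can_run > 0 or running_batch_nonempty
```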
@@ -41,6 +41,10 @@ class BasePrefixCache(ABC):
     def evictable_size(self):
         pass

+    @abstractmethod
+    def protected_size(self):
+        raise NotImplementedError()
+
     def total_size(self):
         raise NotImplementedError()

@@ -85,3 +85,6 @@ class ChunkCache(BasePrefixCache):

     def evictable_size(self):
         return 0
+
+    def protected_size(self):
+        return 0
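`protected_size()` joins `evictable_size()` in the `BasePrefixCache` interface, so every cache backend reports how many cached tokens are currently pinned: `ChunkCache` shares nothing and returns 0, while `RadixCache` maintains a counter (see the following hunks). A simplified sketch of the contract; the class names here are illustrative stand-ins, not the real sglang classes:

```python
from abc import ABC, abstractmethod


class PrefixCacheLike(ABC):
    """Stand-in for the BasePrefixCache interface after this change."""

    @abstractmethod
    def evictable_size(self) -> int: ...

    @abstractmethod
    def protected_size(self) -> int: ...


class NoSharingCache(PrefixCacheLike):
    """Like ChunkCache: nothing is shared, so nothing is evictable or protected."""

    def evictable_size(self) -> int:
        return 0

    def protected_size(self) -> int:
        return 0
```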
@@ -34,7 +34,10 @@ if TYPE_CHECKING:


 class TreeNode:
-    def __init__(self):
+
+    counter = 0
+
+    def __init__(self, id: Optional[int] = None):
         self.children = defaultdict(TreeNode)
         self.parent = None
         self.key = None
@@ -42,6 +45,23 @@ class TreeNode:
         self.lock_ref = 0
         self.last_access_time = time.time()

+        self.hit_count = 0
+        # indicating the node is loading KV cache from host
+        self.loading = False
+        # store the host indices of KV cache
+        self.host_value = None
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
     def __lt__(self, other: "TreeNode"):
         return self.last_access_time < other.last_access_time

@@ -75,6 +95,7 @@ class RadixCache(BasePrefixCache):
         self.root_node.value = []
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0
+        self.protected_size_ = 0

     def match_prefix(self, key: List[int], **kwargs) -> Tuple[torch.Tensor, int]:
         """Find the matching prefix from the radix tree.
@@ -203,6 +224,7 @@ class RadixCache(BasePrefixCache):
         while node != self.root_node:
             if node.lock_ref == 0:
                 self.evictable_size_ -= len(node.value)
+                self.protected_size_ += len(node.value)
                 delta -= len(node.value)
             node.lock_ref += 1
             node = node.parent
@@ -216,6 +238,7 @@ class RadixCache(BasePrefixCache):
         while node != self.root_node:
             if node.lock_ref == 1:
                 self.evictable_size_ += len(node.value)
+                self.protected_size_ -= len(node.value)
                 delta += len(node.value)
             node.lock_ref -= 1
             node = node.parent
@@ -224,6 +247,10 @@ class RadixCache(BasePrefixCache):
     def evictable_size(self):
         return self.evictable_size_

+    def protected_size(self):
+        # protected size refers to the size of the cache that is locked
+        return self.protected_size_
+
     ##### Internal Helper Functions #####

     def _match_prefix_helper(
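Locking a node for an in-flight request moves its tokens from the evictable pool to the protected pool, and the final unlock moves them back, so `evictable_size_ + protected_size_` stays equal to the number of cached tokens and the scheduler's leak check can subtract `protected_size()`. A toy sketch of that bookkeeping; the class and method names are illustrative:

```python
class LockBookkeeping:
    """Simplified version of RadixCache's evictable/protected accounting."""

    def __init__(self):
        self.evictable_size_ = 0
        self.protected_size_ = 0

    def insert(self, num_tokens: int):
        self.evictable_size_ += num_tokens   # newly cached tokens start unlocked

    def lock(self, num_tokens: int):
        self.evictable_size_ -= num_tokens   # first lock: no longer evictable
        self.protected_size_ += num_tokens

    def unlock(self, num_tokens: int):
        self.evictable_size_ += num_tokens   # last unlock: evictable again
        self.protected_size_ -= num_tokens


bk = LockBookkeeping()
bk.insert(64)
bk.lock(64)
assert (bk.evictable_size_, bk.protected_size_) == (0, 64)
bk.unlock(64)
assert (bk.evictable_size_, bk.protected_size_) == (64, 0)
```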
@@ -303,6 +330,8 @@ class RadixCache(BasePrefixCache):
         self.evictable_size_ -= len(node.key)

     def _total_size_helper(self, node: TreeNode):
+        if node.evicted:
+            return 0
         x = len(node.value)
         for child in node.children.values():
             x += self._total_size_helper(child)
@@ -1,6 +1,6 @@
 # Adapted from
 # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
+# Copyright 2023 The SGLang team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -20,7 +20,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
-from functools import cached_property, partial
+from functools import partial
 from typing import (
     Any,
     Callable,
@@ -33,16 +33,13 @@ from typing import (
     Union,
 )

+import numpy as np
 import torch
 import torch.types
 from PIL import Image
 from torch import nn
 from torch.nn.init import trunc_normal_
 from transformers import PretrainedConfig
-from vllm.model_executor.layers.resampler import get_2d_sincos_pos_embed
-from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
-from vllm.model_executor.models.module_mapping import MultiModelKeys
-from vllm.model_executor.sampling_metadata import SamplingMetadata

 from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
@@ -63,6 +60,88 @@ from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
 RawImageType = Union[Image.Image, torch.Tensor]


+# sin/cos positional embedding helpers are adapted from:
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_1d_sincos_pos_embed_from_grid(
+    embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,) / (H, W)
+    out: (M, D) / (H, W, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    if version == (2, 0):
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    else:
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed_from_grid(
+    embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
+
+    if version == (2, 0):
+        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    else:
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim: int,
+    grid_size: Union[int, Tuple[int, int]],
+    cls_token: bool = False,
+    version: Tuple[int, int] = (2, 0),
+) -> torch.Tensor:
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or
+        [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+    if version == (2, 0):
+        grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+        if cls_token:
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    else:
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+    return pos_embed
+
+
 class Idefics2VisionMLP(nn.Module):

     def __init__(
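These helpers replace the `get_2d_sincos_pos_embed` that was previously imported from `vllm.model_executor.layers.resampler`; they build the standard MAE-style 2D sin/cos position table. An illustrative shape check of the default `version=(2, 0)` path, assuming the functions above are in scope:

```python
import numpy as np

# For a 4x4 patch grid and 64-dim embeddings, the (2, 0) path flattens the grid:
pos = get_2d_sincos_pos_embed(embed_dim=64, grid_size=4)
assert isinstance(pos, np.ndarray)
assert pos.shape == (16, 64)  # H*W rows, one embedding per patch

# With a class token, a zero row is prepended:
pos_cls = get_2d_sincos_pos_embed(embed_dim=64, grid_size=4, cls_token=True)
assert pos_cls.shape == (17, 64)
```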
@@ -116,6 +195,10 @@ class Idefics2EncoderLayer(nn.Module):
             projection_size=config.intermediate_size,
             use_qkv_parallel=True,
             quant_config=quant_config,
+            dropout=config.attention_dropout,
+            use_context_forward=False,
+            use_full_precision_softmax=True,
+            flatten_batch=False,
             prefix=f"{prefix}.self_attn",
         )
         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -126,7 +209,6 @@ class Idefics2EncoderLayer(nn.Module):
         self,
         hidden_states: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         """
         Args:
@@ -136,11 +218,8 @@ class Idefics2EncoderLayer(nn.Module):
         """
         residual = hidden_states
         hidden_states = self.layer_norm1(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states,
-            cu_seqlens=cu_seqlens,
-            # , forward_batch=forward_batch
-        )
+        hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
         hidden_states = residual + hidden_states
         residual = hidden_states
         hidden_states = self.layer_norm2(hidden_states)
@@ -181,7 +260,6 @@ class Idefics2Encoder(nn.Module):
         self,
         inputs_embeds: torch.Tensor,
         cu_seqlens: torch.Tensor,
-        forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         r"""
         Args:
@@ -195,7 +273,8 @@ class Idefics2Encoder(nn.Module):
         hidden_states = inputs_embeds
         for encoder_layer in self.layers:
             layer_outputs = encoder_layer(
-                hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+                hidden_states,
+                cu_seqlens=cu_seqlens,
             )
             hidden_states = layer_outputs
         return hidden_states
@@ -232,19 +311,14 @@ class Idefics2VisionEmbeddings(nn.Module):
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

-    def forward(
+    def get_position_ids(
         self,
         pixel_values: torch.FloatTensor,
         patch_attention_mask: torch.BoolTensor,
         tgt_sizes: Optional[torch.IntTensor] = None,
-    ) -> torch.Tensor:
+    ):
         batch_size, _, max_im_h, max_im_w = pixel_values.shape
-        target_dtype = self.patch_embedding.weight.dtype
-        pixel_values = pixel_values.to(
-            device=self.patch_embedding.weight.device, dtype=target_dtype
-        )
-        patch_embeds = self.patch_embedding(pixel_values)
-        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
         max_nb_patches_h, max_nb_patches_w = (
             max_im_h // self.patch_size,
             max_im_w // self.patch_size,
@@ -277,6 +351,24 @@ class Idefics2VisionEmbeddings(nn.Module):
             ).flatten()
             position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
         position_ids = position_ids.to(self.position_embedding.weight.device)
+        return position_ids
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(
+            device=self.patch_embedding.weight.device, dtype=target_dtype
+        )
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        position_ids = self.get_position_ids(
+            pixel_values, patch_attention_mask, tgt_sizes
+        )
+
         embeddings = embeddings + self.position_embedding(position_ids)
         return embeddings

@@ -287,7 +379,6 @@ class Idefics2VisionTransformer(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
     ) -> None:
         super().__init__()

@@ -302,8 +393,6 @@ class Idefics2VisionTransformer(nn.Module):

     def compute_cu_seqlens(self, tgt_sizes: torch.Tensor) -> torch.Tensor:
         patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # shape: (batch_size,)
-
-        # Take a prefix sum to get cu_seqlens; note that a 0 is inserted at the front as the offset.
         cu_seqlens = torch.cat(
             [
                 torch.tensor([0], device=patch_len.device, dtype=torch.int32),
@@ -316,19 +405,18 @@ class Idefics2VisionTransformer(nn.Module):
     def forward(
         self,
         pixel_values,
-        forward_batch: ForwardBatch,
         patch_attention_mask: Optional[torch.BoolTensor] = None,
         tgt_sizes: Optional[torch.IntTensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.embeddings(
             pixel_values=pixel_values,
             patch_attention_mask=patch_attention_mask,
-            # forward_batch=forward_batch,
             tgt_sizes=tgt_sizes,
         )
         cu_seqlens = self.compute_cu_seqlens(tgt_sizes)
         encoder_outputs = self.encoder(
-            hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+            hidden_states,
+            cu_seqlens=cu_seqlens,
         )
         last_hidden_state = self.post_layernorm(encoder_outputs)
         return last_hidden_state
@@ -573,14 +661,12 @@ class MiniCPMVBaseModel(nn.Module):
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
     ):
-        # multimodal_config = config.model_config.multimodal_config
         super().__init__()
         # All MiniCPM-V models disable `tie_word_embeddings` but
         # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
-        # check `tie_word_embeddings` until vLLM integrate MiniCPM-V model
+        # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
         # and config class
         self.config = config
-        # self.multimodal_config = multimodal_config

         self.version = get_version_by_config(self.config)
         self.llm = self.init_llm(config=config, quant_config=quant_config)
@@ -598,13 +684,6 @@ class MiniCPMVBaseModel(nn.Module):

         self.logits_processor = LogitsProcessor(config)

-    @cached_property
-    def sampler(self):
-        if hasattr(self.llm, "sampler"):
-            return self.llm.sampler
-
-        return get_sampler()
-
     def _get_image_bounds(
         self,
         input_ids: torch.Tensor,
@@ -666,7 +745,6 @@ class MiniCPMVBaseModel(nn.Module):
         self,
         input_ids: torch.Tensor,
         image_inputs: Optional[MiniCPMVImageInputs],
-        forward_batch: ForwardBatch,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)

@@ -680,10 +758,7 @@ class MiniCPMVBaseModel(nn.Module):
                     .to(vlm_embedding.device)
                 )
             else:
-                vision_hidden_states = self.get_vision_hidden_states(
-                    forward_batch, image_inputs
-                )
-
+                vision_hidden_states = self.get_vision_hidden_states(image_inputs)
             # See NOTE in _parse_and_validate_inputs
             image_bounds = image_inputs["image_bounds"]
             if len(image_bounds) > 0:
@@ -693,6 +768,7 @@ class MiniCPMVBaseModel(nn.Module):
                         for start, end in image_bounds.tolist()
                     ]
                 ).to(vlm_embedding.device)
+
                 vlm_embedding.scatter_(
                     0,
                     image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
@@ -839,7 +915,7 @@ class MiniCPMVBaseModel(nn.Module):
         # There values are useless because their embeddings will be replaced by vision embeddings anyway.
         input_ids.clamp_(min=0, max=self.config.vocab_size - 1)

-        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs, forward_batch)
+        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)

         # always pass the input via `inputs_embeds`
         # to make sure the computation graph is consistent
@@ -857,29 +933,6 @@ class MiniCPMVBaseModel(nn.Module):
             input_ids, hidden_states, self.llm.lm_head, forward_batch
         )

-    def compute_logits(
-        self,
-        hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[torch.Tensor]:
-        return self.llm.compute_logits(hidden_states, sampling_metadata)
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
-
-    def get_mm_mapping(self) -> MultiModelKeys:
-        """
-        Get the module prefix in multimodal models
-        """
-        return MultiModelKeys.from_string_field(
-            language_model="llm", connector="resampler", tower_model="vpm"
-        )
-
     def init_llm(
         self,
         config: Qwen2Config,
@@ -910,9 +963,7 @@ class MiniCPMVBaseModel(nn.Module):
     ) -> torch.Tensor:
         raise NotImplementedError

-    def get_vision_hidden_states(
-        self, forward_batch: ForwardBatch, data: MiniCPMVImageInputs
-    ) -> torch.Tensor:
+    def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor:
         raise NotImplementedError


@@ -1019,7 +1070,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel):

     def get_vision_hidden_states(
         self,
-        forward_batch: ForwardBatch,
         data: MiniCPMVImageInputs,
     ) -> torch.Tensor:
         pixel_values = data["data"]
@@ -1042,15 +1092,18 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
         patch_attn_mask = torch.zeros(
             (B, 1, max_patches), dtype=torch.bool, device=device
         )
-        for i in range(B):
-            patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
         vision_embedding = self.vpm(
             all_pixel_values.type(dtype),
-            forward_batch=forward_batch,
             patch_attention_mask=patch_attn_mask,
             tgt_sizes=tgt_sizes,
         )
-
         return self.resampler(vision_embedding, tgt_sizes)

     def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
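The per-image Python loop that built `patch_attn_mask` is replaced by a single broadcasted comparison against the number of valid patches per image. A small self-contained check that the two formulations agree; the tensor values are made up for the example:

```python
import torch

B, max_patches = 3, 12
tgt_sizes = torch.tensor([[2, 3], [1, 4], [3, 4]])  # (h, w) patches per image

# Old behaviour: per-image loop.
loop_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
for i in range(B):
    loop_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True

# New behaviour: one broadcasted comparison.
mask_shapes = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # valid patches per image
vec_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
vec_mask[:, 0, :] = torch.arange(max_patches).unsqueeze(0) < mask_shapes.unsqueeze(1)

assert torch.equal(loop_mask, vec_mask)
```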
@@ -1138,7 +1191,7 @@ class MiniCPMV:
     """
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
-    bitsandbytes in vLLM. Therefore, it is necessary to separate them.
+    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
     """

     # Ensure that the LoRA support check passes when the class is not
@@ -17,6 +17,7 @@ from transformers.models.mllama.modeling_mllama import (
 import sglang.srt.distributed.parallel_state as ps
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -145,61 +146,6 @@ class MllamaPrecomputedPositionEmbedding(nn.Module):
         return hidden_state


-class MllamaVisionSdpaAttention(nn.Module):
-    def __init__(self, config: config_mllama.MllamaVisionConfig):
-        super().__init__()
-
-        model_parallel_size = get_tensor_model_parallel_world_size()
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.attention_heads
-        self.head_dim = config.hidden_size // config.attention_heads
-        self.num_local_heads = self.num_heads // model_parallel_size
-        self.q_size = self.num_local_heads * self.head_dim
-        self.kv_size = self.num_local_heads * self.head_dim
-
-        self.qkv_proj = QKVParallelLinear(
-            self.embed_dim,
-            self.head_dim,
-            self.num_heads,
-            bias=False,
-        )
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.head_dim,
-            self.embed_dim,
-            bias=False,
-            input_is_parallel=True,
-        )
-
-    def forward(
-        self,
-        hidden_state: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        qkv, _ = self.qkv_proj(hidden_state)
-        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        q = q.view(
-            q.shape[0], q.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        k = k.view(
-            k.shape[0], k.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-        v = v.view(
-            v.shape[0], v.shape[1], self.num_local_heads, self.head_dim
-        ).transpose(1, 2)
-
-        # TODO: remove padding in image encoder
-        attn_output = F.scaled_dot_product_attention(
-            q, k, v, attn_mask=attention_mask, dropout_p=0.0
-        )
-
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(
-            attn_output.shape[0], attn_output.shape[1], -1
-        )
-        output, _ = self.o_proj(attn_output)
-        return output
-
-
 class MllamaVisionMLP(nn.Module):
     def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
         super().__init__()
@@ -237,7 +183,17 @@ class MllamaVisionEncoderLayer(nn.Module):
         self.is_gated = is_gated
         self.intermediate_size = config.intermediate_size

-        self.self_attn = MllamaVisionSdpaAttention(config)
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=None,
+            dropout=0.0,
+            use_context_forward=False,
+            use_full_precision_softmax=False,
+            flatten_batch=False,
+        )
         self.mlp = MllamaVisionMLP(config)

         self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps)
@@ -992,6 +948,10 @@ class MllamaForConditionalGeneration(nn.Module):
                     weight_loader(param, loaded_weight, shard_id)
                     break
             else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace("self_attn.o_proj", "self_attn.proj")
+
                 param = params_dict.pop(name)
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
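Because the Mllama vision tower now uses sglang's `VisionAttention`, whose output projection is named `proj` instead of the checkpoint's `o_proj`, weight names are rewritten while loading. A tiny illustration of the renaming; the example key is hypothetical:

```python
name = "vision_model.transformer.layers.0.self_attn.o_proj.weight"
if "vision_model" in name:
    # adapt to VisionAttention
    name = name.replace("self_attn.o_proj", "self_attn.proj")
assert name == "vision_model.transformer.layers.0.self_attn.proj.weight"
```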
@@ -249,7 +249,10 @@ class Qwen2Model(nn.Module):
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.embed_tokens(input_ids)
+        if hasattr(self.config, "scale_emb"):
+            return self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            return self.embed_tokens(input_ids)

     def forward(
         self,
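MiniCPM-style language models carry a `scale_emb` factor in their config that must multiply the token embeddings, which is why `Qwen2Model.get_input_embeddings` now applies it when the attribute is present. A minimal sketch of the same conditional scaling; the module and config classes are illustrative:

```python
import torch
from torch import nn


class ToyEmbedder(nn.Module):
    def __init__(self, config, vocab_size: int = 100, hidden_size: int = 16):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Scale only when the config defines `scale_emb` (as MiniCPM-V's LLM does).
        if hasattr(self.config, "scale_emb"):
            return self.embed_tokens(input_ids) * self.config.scale_emb
        return self.embed_tokens(input_ids)


class Cfg:  # stand-in for a HF PretrainedConfig with MiniCPM's extra field
    scale_emb = 12.0


emb = ToyEmbedder(Cfg())
out = emb.get_input_embeddings(torch.tensor([1, 2, 3]))
assert out.shape == (3, 16)
```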