sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -277,6 +277,13 @@ class Gemma3Attention(nn.Module):
277
277
  k = k.permute(0, 2, 1, 3)
278
278
 
279
279
  attn_output = self.attn(q, k, v, forward_batch=forward_batch)
280
+
281
+ # Compatible with triton backend which returns [1, s, h, head_dim]
282
+ if attn_output.dim() == 4 and attn_output.shape[0] == 1:
283
+ attn_output = attn_output.squeeze(0)
284
+ attn_output = attn_output.flatten(-2, -1)
285
+ # [s, h * head_dim]
286
+
280
287
  output, _ = self.o_proj(attn_output)
281
288
  return output
282
289
 
@@ -282,25 +282,30 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
282
282
  Returns:
283
283
  image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
284
284
  """
285
- if any(item.precomputed_features is not None for item in items):
286
- if not all(item.precomputed_features is not None for item in items):
287
- raise NotImplementedError(
288
- "MM inputs where only some items are precomputed."
289
- )
290
- return torch.concat([item.precomputed_features for item in items])
291
-
292
285
  # Process images one by one to handle flatten_batch=True constraint in vision_tower
293
286
  all_pixel_values = flatten_nested_list([item.pixel_values for item in items])
294
287
  vision_outputs_list = []
295
288
 
296
- for pixel_value in all_pixel_values:
297
- # Add batch dimension for single image processing
298
- pixel_value_batch = pixel_value.unsqueeze(0)
299
- pixel_value_batch = pixel_value_batch.to(device=self.vision_tower.device)
300
- pixel_value_batch = pixel_value_batch.to(dtype=self.language_model.dtype())
289
+ for pixel_values_batch in all_pixel_values:
290
+ # Normalize input shape to [batch_size, channels, height, width]
291
+ if pixel_values_batch.dim() == 5:
292
+ pixel_values_batch = pixel_values_batch.squeeze(0)
293
+ elif pixel_values_batch.dim() == 3:
294
+ pixel_values_batch = pixel_values_batch.unsqueeze(0)
295
+ elif pixel_values_batch.dim() != 4:
296
+ raise ValueError(
297
+ f"Unexpected pixel_values shape: {pixel_values_batch.shape}"
298
+ )
301
299
 
302
- vision_output = self.vision_tower(pixel_values=pixel_value_batch)
303
- vision_outputs_list.append(vision_output)
300
+ # Process each image in the batch
301
+ batch_size = pixel_values_batch.shape[0]
302
+ for i in range(batch_size):
303
+ pixel_value = pixel_values_batch[i : i + 1] # Keep batch dimension as 1
304
+ pixel_value = pixel_value.to(
305
+ device=self.vision_tower.device, dtype=self.language_model.dtype()
306
+ )
307
+ vision_output = self.vision_tower(pixel_values=pixel_value)
308
+ vision_outputs_list.append(vision_output)
304
309
 
305
310
  # Concatenate all vision outputs
306
311
  vision_outputs = torch.cat(vision_outputs_list, dim=0)
@@ -0,0 +1,342 @@
1
+ # Copyright 2023 The SGLang team.
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ from typing import Optional
22
+
23
+ import torch
24
+ from torch import nn
25
+ from transformers import PretrainedConfig
26
+
27
+ from sglang.srt.layers.activation import get_act_fn
28
+ from sglang.srt.layers.attention.vision import VisionAttention
29
+ from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
30
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
31
+ from sglang.srt.utils import add_prefix
32
+
33
+
34
+ class Idefics2VisionMLP(nn.Module):
35
+
36
+ def __init__(
37
+ self,
38
+ config: PretrainedConfig,
39
+ quant_config: Optional[QuantizationConfig] = None,
40
+ prefix: str = "",
41
+ ) -> None:
42
+ super().__init__()
43
+ self.config = config
44
+ self.activation_fn = get_act_fn(config.hidden_act)
45
+ self.fc1 = ColumnParallelLinear(
46
+ config.hidden_size,
47
+ config.intermediate_size,
48
+ bias=True,
49
+ quant_config=quant_config,
50
+ prefix=add_prefix("fc1", prefix),
51
+ )
52
+ self.fc2 = RowParallelLinear(
53
+ config.intermediate_size,
54
+ config.hidden_size,
55
+ bias=True,
56
+ quant_config=quant_config,
57
+ prefix=add_prefix("fc2", prefix),
58
+ )
59
+
60
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
61
+ hidden_states, _ = self.fc1(hidden_states)
62
+ hidden_states = self.activation_fn(hidden_states)
63
+ hidden_states, _ = self.fc2(hidden_states)
64
+ return hidden_states
65
+
66
+
67
+ class Idefics2EncoderLayer(nn.Module):
68
+
69
+ def __init__(
70
+ self,
71
+ config: PretrainedConfig,
72
+ quant_config: Optional[QuantizationConfig] = None,
73
+ prefix: str = "",
74
+ ) -> None:
75
+ super().__init__()
76
+ self.embed_dim = config.hidden_size
77
+ self.num_heads = config.num_attention_heads
78
+ self.self_attn = VisionAttention(
79
+ embed_dim=config.hidden_size,
80
+ num_heads=self.num_heads,
81
+ projection_size=config.intermediate_size,
82
+ use_qkv_parallel=True,
83
+ quant_config=quant_config,
84
+ dropout=config.attention_dropout,
85
+ qkv_backend="sdpa",
86
+ softmax_in_single_precision=True,
87
+ flatten_batch=False,
88
+ prefix=add_prefix("self_attn", prefix),
89
+ )
90
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
91
+ self.mlp = Idefics2VisionMLP(
92
+ config,
93
+ quant_config=quant_config,
94
+ prefix=add_prefix("mlp", prefix),
95
+ )
96
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
97
+
98
+ def forward(
99
+ self,
100
+ hidden_states: torch.Tensor,
101
+ cu_seqlens: torch.Tensor,
102
+ ) -> torch.Tensor:
103
+ """
104
+ Args:
105
+ hidden_states (`torch.FloatTensor`):
106
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
107
+
108
+ """
109
+ residual = hidden_states
110
+ hidden_states = self.layer_norm1(hidden_states)
111
+ hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
112
+
113
+ hidden_states = residual + hidden_states
114
+ residual = hidden_states
115
+ hidden_states = self.layer_norm2(hidden_states)
116
+ hidden_states = self.mlp(hidden_states)
117
+ hidden_states = residual + hidden_states
118
+ return hidden_states
119
+
120
+
121
+ class Idefics2Encoder(nn.Module):
122
+ """
123
+ Transformer encoder consisting of `config.num_hidden_layers` self attention
124
+ layers. Each layer is a
125
+ [`Idefics2EncoderLayer`].
126
+
127
+ Args:
128
+ config: Idefics2Config
129
+ """
130
+
131
+ def __init__(
132
+ self,
133
+ config: PretrainedConfig,
134
+ quant_config: Optional[QuantizationConfig] = None,
135
+ prefix: str = "",
136
+ ) -> None:
137
+ super().__init__()
138
+
139
+ self.config = config
140
+ self.layers = nn.ModuleList(
141
+ [
142
+ Idefics2EncoderLayer(
143
+ config,
144
+ quant_config=quant_config,
145
+ prefix=add_prefix(f"layers.{i}", prefix),
146
+ )
147
+ for i in range(config.num_hidden_layers)
148
+ ]
149
+ )
150
+
151
+ def forward(
152
+ self,
153
+ inputs_embeds: torch.Tensor,
154
+ cu_seqlens: torch.Tensor,
155
+ ) -> torch.Tensor:
156
+ r"""
157
+ Args:
158
+ inputs_embeds (torch.Tensor):
159
+ Optionally, instead of passing `input_ids` you can choose to
160
+ directly pass an embedded representation.
161
+ This is useful if you want more control over how to convert
162
+ `input_ids` indices into associated vectorsthan the model's
163
+ internal embedding lookup matrix.
164
+ """
165
+ hidden_states = inputs_embeds
166
+ for encoder_layer in self.layers:
167
+ layer_outputs = encoder_layer(
168
+ hidden_states,
169
+ cu_seqlens=cu_seqlens,
170
+ )
171
+ hidden_states = layer_outputs
172
+ return hidden_states
173
+
174
+
175
+ class Idefics2VisionEmbeddings(nn.Module):
176
+ """
177
+ This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
178
+ ` to enable images of variable
179
+ resolution.
180
+
181
+ The modifications are adapted from [Patch n' Pack: NaViT, a Vision
182
+ Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
183
+ which allows treating images in their native aspect ratio and without the
184
+ need to resize them to the same fixed size. In particular, we start from the
185
+ original pre-trained SigLIP model(which uses images of fixed-size square
186
+ images) and adapt it by training on images of variable resolutions.
187
+ """
188
+
189
+ def __init__(self, config: PretrainedConfig):
190
+ super().__init__()
191
+ self.embed_dim = config.hidden_size
192
+ self.image_size = config.image_size
193
+ self.patch_size = config.patch_size
194
+ self.patch_embedding = nn.Conv2d(
195
+ in_channels=config.num_channels,
196
+ out_channels=self.embed_dim,
197
+ kernel_size=self.patch_size,
198
+ stride=self.patch_size,
199
+ padding="valid",
200
+ )
201
+ self.num_patches_per_side = self.image_size // self.patch_size
202
+ self.num_patches = self.num_patches_per_side**2
203
+ self.num_positions = self.num_patches
204
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
205
+
206
+ def get_position_ids(
207
+ self,
208
+ pixel_values: torch.FloatTensor,
209
+ patch_attention_mask: torch.BoolTensor,
210
+ tgt_sizes: Optional[torch.IntTensor] = None,
211
+ ):
212
+ batch_size, _, max_im_h, max_im_w = pixel_values.shape
213
+
214
+ max_nb_patches_h, max_nb_patches_w = (
215
+ max_im_h // self.patch_size,
216
+ max_im_w // self.patch_size,
217
+ )
218
+ boundaries = torch.arange(
219
+ 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
220
+ )
221
+ position_ids = torch.full(
222
+ size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
223
+ )
224
+
225
+ for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
226
+
227
+ if tgt_sizes is not None:
228
+ nb_patches_h = tgt_sizes[batch_idx][0]
229
+ nb_patches_w = tgt_sizes[batch_idx][1]
230
+ else:
231
+ nb_patches_h = p_attn_mask[:, 0].sum()
232
+ nb_patches_w = p_attn_mask[0].sum()
233
+ fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
234
+ fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
235
+ bucket_coords_h = torch.bucketize(
236
+ fractional_coords_h, boundaries, right=True
237
+ )
238
+ bucket_coords_w = torch.bucketize(
239
+ fractional_coords_w, boundaries, right=True
240
+ )
241
+ pos_ids = (
242
+ bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
243
+ ).flatten()
244
+ position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
245
+ position_ids = position_ids.to(self.position_embedding.weight.device)
246
+ return position_ids
247
+
248
+ def forward(
249
+ self,
250
+ pixel_values: torch.FloatTensor,
251
+ patch_attention_mask: torch.BoolTensor,
252
+ tgt_sizes: Optional[torch.IntTensor] = None,
253
+ ) -> torch.Tensor:
254
+ target_dtype = self.patch_embedding.weight.dtype
255
+ pixel_values = pixel_values.to(
256
+ device=self.patch_embedding.weight.device, dtype=target_dtype
257
+ )
258
+ patch_embeds = self.patch_embedding(pixel_values)
259
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
260
+ position_ids = self.get_position_ids(
261
+ pixel_values, patch_attention_mask, tgt_sizes
262
+ )
263
+
264
+ embeddings = embeddings + self.position_embedding(position_ids)
265
+ return embeddings
266
+
267
+
268
+ class Idefics2VisionTransformer(nn.Module):
269
+
270
+ def __init__(
271
+ self,
272
+ config: PretrainedConfig,
273
+ quant_config: Optional[QuantizationConfig] = None,
274
+ require_post_norm: bool = True,
275
+ prefix: str = "",
276
+ ) -> None:
277
+ super().__init__()
278
+
279
+ embed_dim = config.hidden_size
280
+ self.config = config
281
+ self.embeddings = Idefics2VisionEmbeddings(config)
282
+ self.encoder = Idefics2Encoder(
283
+ config=config,
284
+ quant_config=quant_config,
285
+ prefix=add_prefix("encoder", prefix),
286
+ )
287
+ self.post_layernorm = (
288
+ nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
289
+ if require_post_norm
290
+ else nn.Identity()
291
+ )
292
+
293
+ def get_input_embeddings(self) -> nn.Embedding:
294
+ return self.embeddings
295
+
296
+ def compute_cu_seqlens(
297
+ self,
298
+ tgt_sizes: Optional[torch.Tensor] = None,
299
+ input_embeds: Optional[torch.Tensor] = None,
300
+ ) -> torch.Tensor:
301
+ # shape: (batch_size,)
302
+ if tgt_sizes is not None:
303
+ seqlen = tgt_sizes[:, 0] * tgt_sizes[:, 1]
304
+ elif input_embeds is not None:
305
+ seqlen = torch.full(
306
+ size=(input_embeds.shape[0],),
307
+ fill_value=input_embeds.shape[1],
308
+ dtype=torch.int32,
309
+ device=input_embeds.device,
310
+ )
311
+ else:
312
+ raise ValueError(
313
+ "Either `tgt_sizes` or `input_embeds` must be provided to compute cu_seqlens."
314
+ )
315
+
316
+ cu_seqlens = torch.cat(
317
+ [
318
+ torch.tensor([0], device=seqlen.device, dtype=torch.int32),
319
+ torch.cumsum(seqlen, dim=0, dtype=torch.int32),
320
+ ],
321
+ dim=0,
322
+ ).to(seqlen.device)
323
+ return cu_seqlens
324
+
325
+ def forward(
326
+ self,
327
+ pixel_values,
328
+ patch_attention_mask: Optional[torch.BoolTensor] = None,
329
+ tgt_sizes: Optional[torch.IntTensor] = None,
330
+ ) -> torch.Tensor:
331
+ hidden_states = self.embeddings(
332
+ pixel_values=pixel_values,
333
+ patch_attention_mask=patch_attention_mask,
334
+ tgt_sizes=tgt_sizes,
335
+ )
336
+ cu_seqlens = self.compute_cu_seqlens(tgt_sizes, hidden_states)
337
+ encoder_outputs = self.encoder(
338
+ hidden_states,
339
+ cu_seqlens=cu_seqlens,
340
+ )
341
+ last_hidden_state = self.post_layernorm(encoder_outputs)
342
+ return last_hidden_state
@@ -11,21 +11,19 @@
11
11
  # See the License for the specific language governing permissions and
12
12
  # limitations under the License.
13
13
  # ==========================582====================================================
14
-
15
- from typing import Iterable, List, Optional, Tuple, Union
14
+ from typing import Iterable, List, Optional, Set, Tuple, Union
16
15
 
17
16
  import torch
18
17
 
19
18
  # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/7f62077af5159c625fe3ad1c812e6c1a2b93ba3b/vllm/model_executor/models/internlm2.py
20
19
  # Adapted from https://raw.githubusercontent.com/hehesangsj/sglang/refs/heads/internvl/python/sglang/srt/models/internvl.py
21
20
  import torch.nn.functional as F
22
- from einops import rearrange, repeat
23
- from sgl_kernel.flash_attn import flash_attn_varlen_func
24
21
  from torch import nn
25
22
  from transformers import PretrainedConfig, PreTrainedModel
26
23
  from transformers.activations import ACT2FN
27
24
  from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
28
25
 
26
+ from sglang.srt.layers.attention.vision import SingletonCache, VisionAttention
29
27
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
30
28
  from sglang.srt.managers.mm_utils import (
31
29
  MultiModalityDataPaddingPatternTokenPairs,
@@ -40,75 +38,12 @@ from sglang.srt.models.qwen2 import Qwen2ForCausalLM
40
38
  from sglang.utils import logger
41
39
 
42
40
 
43
- class FlashAttention(nn.Module):
44
- """Implement the scaled dot product attention with softmax.
45
- Arguments
46
- ---------
47
- softmax_scale: The temperature to use for the softmax attention.
48
- (default: 1/sqrt(d_keys) where d_keys is computed at
49
- runtime)
50
- attention_dropout: The dropout rate to apply to the attention
51
- (default: 0.0)
52
- """
53
-
41
+ class InternAttention(nn.Module):
54
42
  def __init__(
55
- self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None
56
- ):
57
- super().__init__()
58
- self.softmax_scale = softmax_scale
59
- self.dropout_p = attention_dropout
60
-
61
- def forward(
62
43
  self,
63
- qkv,
64
- causal=False,
65
- max_s=None,
44
+ config,
45
+ quant_config: QuantizationConfig = None,
66
46
  ):
67
- """Implements the multihead softmax attention.
68
- Arguments
69
- ---------
70
- qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None
71
- if unpadded: (nnz, 3, h, d)
72
- """
73
- assert qkv.dtype in [torch.float16, torch.bfloat16]
74
- assert qkv.is_cuda
75
-
76
- batch_size, seqlen, _, nheads, d = qkv.shape
77
- if batch_size == 0 or seqlen == 0:
78
- output_shape = (batch_size, seqlen, nheads, d)
79
- return (
80
- torch.zeros(output_shape, dtype=qkv.dtype, device=qkv.device),
81
- None,
82
- )
83
-
84
- qkv_reshaped = rearrange(qkv, "b s three h d -> (b s) three h d", three=3)
85
- q, k, v = qkv_reshaped.unbind(1)
86
-
87
- max_s = seqlen
88
- cu_seqlens = torch.arange(
89
- 0,
90
- (batch_size + 1) * seqlen,
91
- step=seqlen,
92
- dtype=torch.int32,
93
- device=qkv.device,
94
- )
95
- output_reshaped = flash_attn_varlen_func(
96
- q,
97
- k,
98
- v,
99
- cu_seqlens,
100
- cu_seqlens,
101
- max_s,
102
- max_s,
103
- softmax_scale=self.softmax_scale,
104
- causal=causal,
105
- )
106
- output = rearrange(output_reshaped, "(b s) h d -> b s h d", b=batch_size)
107
- return output, None
108
-
109
-
110
- class InternAttention(nn.Module):
111
- def __init__(self, config):
112
47
  super().__init__()
113
48
  self.config = config
114
49
  self.embed_dim = config.hidden_size
@@ -116,7 +51,19 @@ class InternAttention(nn.Module):
116
51
  self.head_dim = self.embed_dim // self.num_heads
117
52
 
118
53
  self.scale = self.head_dim**-0.5
119
- self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
54
+
55
+ self.attn = VisionAttention(
56
+ qkv_backend="fa3",
57
+ embed_dim=self.embed_dim,
58
+ num_heads=self.num_heads,
59
+ projection_size=self.embed_dim,
60
+ use_qkv_parallel=True,
61
+ quant_config=quant_config,
62
+ dropout=getattr(config, "dropout", 0.0),
63
+ proj_bias=getattr(config, "qkv_bias", True),
64
+ flatten_batch=False,
65
+ )
66
+
120
67
  self.proj_drop = nn.Dropout(config.dropout)
121
68
 
122
69
  self.qk_normalization = config.qk_normalization
@@ -125,36 +72,15 @@ class InternAttention(nn.Module):
125
72
  self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
126
73
  self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
127
74
 
128
- self.inner_attn = FlashAttention(softmax_scale=self.scale)
129
-
130
- self.proj = nn.Linear(self.embed_dim, self.embed_dim)
131
-
132
- def _flash_attn(
75
+ def forward(
133
76
  self,
134
- x,
135
- ):
136
- qkv = self.qkv(x)
137
- qkv = rearrange(
138
- qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads
139
- )
140
-
141
- if self.qk_normalization:
142
- q, k, v = qkv.unbind(2)
143
- q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
144
- k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
145
- qkv = torch.stack([q, k, v], dim=2)
146
-
147
- context, _ = self.inner_attn(
148
- qkv,
149
- )
150
- outs = self.proj(rearrange(context, "b s h d -> b s (h d)"))
151
- outs = self.proj_drop(outs)
77
+ hidden_states: torch.Tensor,
78
+ cu_seqlens: torch.Tensor,
79
+ ) -> torch.Tensor:
80
+ out = self.attn(hidden_states, cu_seqlens=cu_seqlens)
81
+ outs = self.proj_drop(out)
152
82
  return outs
153
83
 
154
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
155
- x = self._flash_attn(hidden_states)
156
- return x
157
-
158
84
 
159
85
  class InternVisionEmbeddings(nn.Module):
160
86
  def __init__(self, config: PretrainedConfig):
@@ -286,6 +212,7 @@ class InternVisionEncoderLayer(nn.Module):
286
212
  def forward(
287
213
  self,
288
214
  hidden_states: torch.Tensor,
215
+ cu_seqlens: torch.Tensor,
289
216
  ) -> Tuple[
290
217
  torch.FloatTensor,
291
218
  Optional[torch.FloatTensor],
@@ -295,8 +222,12 @@ class InternVisionEncoderLayer(nn.Module):
295
222
  Args:
296
223
  hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
297
224
  """
225
+
298
226
  hidden_states = hidden_states + self.drop_path1(
299
- self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1
227
+ self.attn(
228
+ self.norm1(hidden_states).to(hidden_states.dtype), cu_seqlens=cu_seqlens
229
+ )
230
+ * self.ls1
300
231
  )
301
232
 
302
233
  hidden_states = hidden_states + self.drop_path2(
@@ -363,12 +294,12 @@ class InternVisionEncoder(nn.Module):
363
294
  encoder_states = () if output_hidden_states else None
364
295
  hidden_states = inputs_embeds
365
296
 
297
+ cu_seqlens = SingletonCache()
298
+
366
299
  for idx, encoder_layer in enumerate(self.layers):
367
300
  if output_hidden_states:
368
301
  encoder_states = encoder_states + (hidden_states,)
369
- layer_outputs = encoder_layer(
370
- hidden_states,
371
- )
302
+ layer_outputs = encoder_layer(hidden_states, cu_seqlens=cu_seqlens)
372
303
  hidden_states = layer_outputs
373
304
 
374
305
  if output_hidden_states:
@@ -625,6 +556,7 @@ class InternVLChatModel(nn.Module):
625
556
  ("gate_up_proj", "up_proj", 1),
626
557
  ]
627
558
  params_dict = dict(self.named_parameters())
559
+ loaded_params: Set[str] = set()
628
560
 
629
561
  for name, loaded_weight in weights:
630
562
  if "rotary_emb.inv_freq" in name:
@@ -641,6 +573,11 @@ class InternVLChatModel(nn.Module):
641
573
  weight_loader(param, loaded_weight, shard_id)
642
574
  break
643
575
  else:
576
+ if "vision_model" in name:
577
+ # adapt to VisionAttention
578
+ name = name.replace(r"attn.", r"attn.attn.")
579
+ name = name.replace(r"qkv.", r"qkv_proj.")
580
+
644
581
  # Skip loading extra bias for GPTQ models.
645
582
  if name.endswith(".bias") and name not in params_dict:
646
583
  continue
@@ -665,6 +602,13 @@ class InternVLChatModel(nn.Module):
665
602
  param, "weight_loader", default_weight_loader
666
603
  )
667
604
  weight_loader(param, loaded_weight)
605
+ loaded_params.add(name)
606
+ unloaded_params = params_dict.keys() - loaded_params
607
+ if unloaded_params:
608
+ raise RuntimeError(
609
+ f"Some weights are not initialized from checkpoints: {unloaded_params}"
610
+ )
611
+ return loaded_params
668
612
 
669
613
 
670
614
  EntryClass = InternVLChatModel
@@ -144,10 +144,10 @@ class KimiVLForConditionalGeneration(nn.Module):
144
144
  .type(self.vision_tower.dtype)
145
145
  .to(self.vision_tower.device)
146
146
  )
147
- image_grid_thws = torch.concat(
148
- [item.image_grid_thws for item in items], dim=0
149
- ).to(self.vision_tower.device)
150
- image_features = self.vision_tower(pixel_values, image_grid_thws)
147
+ image_grid_hws = torch.cat([item.image_grid_hws for item in items], dim=0).to(
148
+ self.vision_tower.device
149
+ )
150
+ image_features = self.vision_tower(pixel_values, image_grid_hws)
151
151
  assert isinstance(image_features, list)
152
152
  # lengths = [x.shape[0] for x in image_features]
153
153
  res = self.multi_modal_projector(torch.cat(image_features)) # .split(lengths)
@@ -17,7 +17,7 @@
17
17
  """Inference-only LLaMA model compatible with HuggingFace weights."""
18
18
 
19
19
  import logging
20
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
20
+ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
21
21
 
22
22
  import torch
23
23
  from torch import nn
@@ -51,11 +51,8 @@ from sglang.srt.managers.schedule_batch import (
51
51
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
52
52
  from sglang.srt.model_loader.utils import set_default_torch_dtype
53
53
  from sglang.srt.model_loader.weight_utils import default_weight_loader
54
- from sglang.srt.models.minicpmv import (
55
- Idefics2VisionTransformer,
56
- MiniCPMBaseModel,
57
- Resampler2_5,
58
- )
54
+ from sglang.srt.models.idefics2 import Idefics2VisionTransformer
55
+ from sglang.srt.models.minicpmv import MiniCPMBaseModel, Resampler2_5
59
56
  from sglang.srt.models.qwen2 import Qwen2ForCausalLM
60
57
  from sglang.srt.utils import logger
61
58