sglang 0.4.1.post7__py3-none-any.whl → 0.4.2.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. sglang/bench_offline_throughput.py +17 -11
  2. sglang/bench_one_batch.py +14 -6
  3. sglang/bench_serving.py +47 -44
  4. sglang/lang/chat_template.py +31 -0
  5. sglang/srt/configs/load_config.py +1 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  7. sglang/srt/entrypoints/engine.py +5 -2
  8. sglang/srt/entrypoints/http_server.py +24 -0
  9. sglang/srt/function_call_parser.py +494 -0
  10. sglang/srt/layers/activation.py +5 -5
  11. sglang/srt/layers/attention/triton_ops/prefill_attention.py +6 -0
  12. sglang/srt/layers/attention/vision.py +243 -40
  13. sglang/srt/layers/dp_attention.py +3 -1
  14. sglang/srt/layers/layernorm.py +5 -5
  15. sglang/srt/layers/linear.py +24 -9
  16. sglang/srt/layers/logits_processor.py +1 -1
  17. sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  18. sglang/srt/layers/moe/fused_moe_native.py +17 -3
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  20. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  22. sglang/srt/layers/parameter.py +16 -7
  23. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  31. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  32. sglang/srt/layers/quantization/fp8.py +11 -1
  33. sglang/srt/layers/rotary_embedding.py +34 -13
  34. sglang/srt/layers/sampler.py +33 -10
  35. sglang/srt/layers/torchao_utils.py +12 -6
  36. sglang/srt/managers/detokenizer_manager.py +1 -0
  37. sglang/srt/managers/image_processor.py +77 -38
  38. sglang/srt/managers/io_struct.py +36 -5
  39. sglang/srt/managers/schedule_batch.py +31 -25
  40. sglang/srt/managers/scheduler.py +78 -38
  41. sglang/srt/managers/tokenizer_manager.py +4 -0
  42. sglang/srt/mem_cache/base_prefix_cache.py +4 -0
  43. sglang/srt/mem_cache/chunk_cache.py +3 -0
  44. sglang/srt/mem_cache/radix_cache.py +30 -1
  45. sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  46. sglang/srt/model_executor/forward_batch_info.py +5 -7
  47. sglang/srt/model_executor/model_runner.py +7 -4
  48. sglang/srt/model_loader/loader.py +75 -0
  49. sglang/srt/model_loader/weight_utils.py +91 -5
  50. sglang/srt/models/commandr.py +14 -2
  51. sglang/srt/models/dbrx.py +9 -1
  52. sglang/srt/models/deepseek_v2.py +3 -3
  53. sglang/srt/models/gemma2.py +9 -1
  54. sglang/srt/models/grok.py +1 -0
  55. sglang/srt/models/minicpm3.py +3 -3
  56. sglang/srt/models/minicpmv.py +129 -76
  57. sglang/srt/models/mllama.py +16 -56
  58. sglang/srt/models/qwen2.py +4 -1
  59. sglang/srt/models/qwen2_vl.py +18 -8
  60. sglang/srt/models/torch_native_llama.py +17 -4
  61. sglang/srt/openai_api/adapter.py +139 -37
  62. sglang/srt/openai_api/protocol.py +5 -4
  63. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  64. sglang/srt/sampling/sampling_batch_info.py +4 -14
  65. sglang/srt/server.py +2 -2
  66. sglang/srt/server_args.py +26 -1
  67. sglang/srt/speculative/eagle_utils.py +37 -15
  68. sglang/srt/speculative/eagle_worker.py +11 -13
  69. sglang/srt/utils.py +62 -67
  70. sglang/test/test_programs.py +1 -0
  71. sglang/test/test_utils.py +81 -22
  72. sglang/utils.py +42 -0
  73. sglang/version.py +1 -1
  74. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/METADATA +8 -8
  75. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/RECORD +78 -67
  76. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/LICENSE +0 -0
  77. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/WHEEL +0 -0
  78. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.post1.dist-info}/top_level.txt +0 -0
sglang/srt/model_loader/weight_utils.py CHANGED
@@ -27,6 +27,7 @@ import huggingface_hub.constants
  import numpy as np
  import torch
  from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
+ from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
  from safetensors.torch import load_file, safe_open, save_file
  from tqdm.auto import tqdm
 
@@ -403,8 +404,13 @@ def np_cache_weights_iterator(
 
  def safetensors_weights_iterator(
      hf_weights_files: List[str],
+     is_all_weights_sharded: bool = False,
  ) -> Generator[Tuple[str, torch.Tensor], None, None]:
-     """Iterate over the weights in the model safetensor files."""
+     """Iterate over the weights in the model safetensor files.
+
+     If is_all_weights_sharded is True, it uses a more optimized read by reading an
+     entire file instead of reading each tensor one by one.
+     """
      enable_tqdm = (
          not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
      )
@@ -414,9 +420,14 @@ def safetensors_weights_iterator(
          disable=not enable_tqdm,
          bar_format=_BAR_FORMAT,
      ):
-         with safe_open(st_file, framework="pt") as f:
-             for name in f.keys():  # noqa: SIM118
-                 param = f.get_tensor(name)
+         if not is_all_weights_sharded:
+             with safe_open(st_file, framework="pt") as f:
+                 for name in f.keys():  # noqa: SIM118
+                     param = f.get_tensor(name)
+                     yield name, param
+         else:
+             result = load_file(st_file, device="cpu")
+             for name, param in result.items():
                  yield name, param
 
 
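For orientation, a minimal standalone sketch of the two read paths this change switches between (the shard filename below is an assumption, not taken from the diff):

from safetensors.torch import load_file, safe_open

st_file = "model-00001-of-00002.safetensors"  # hypothetical shard name

# Default path: open the file once and lazily fetch each tensor by name.
with safe_open(st_file, framework="pt") as f:
    for name in f.keys():
        param = f.get_tensor(name)
        print(name, tuple(param.shape))

# is_all_weights_sharded=True path: read the entire shard into CPU memory in one call.
for name, param in load_file(st_file, device="cpu").items():
    print(name, tuple(param.shape))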
@@ -650,6 +661,81 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
      return name
 
 
+ # Adapted from https://github.com/vllm-project/vllm/blob/68ad4e3a8d8a66fb2a43be57471ee13a8bec4ec0/vllm/model_executor/layers/quantization/schema.py
+ class KVCacheQuantSchema(BaseModel):
+     dtype: str
+     # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
+     # layer indices to their per-tensor KV cache scaling factor.
+     # TODO: Consider pulling this and its validation methods out into its
+     # own schema class (tricky as its members are variable)
+     scaling_factor: Dict[int, Dict[int, float]]
+
+     @model_validator(mode="after")
+     def check_is_fp8(self) -> "KVCacheQuantSchema":
+         assert self.dtype == "float8_e4m3fn", (
+             "Loaded scaling factors intended for KV cache dtype = "
+             f"{self.dtype} rather than float8_e4m3fn!"
+         )
+         return self
+
+     @model_validator(mode="after")
+     def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+         context = info.context
+         if context:
+             tp_size = context["tp_size"]
+             num_hidden_layers = context["num_hidden_layers"]
+             assert len(self.scaling_factor) == tp_size, (
+                 f"Loaded dictionary has TP size {len(self.scaling_factor)} "
+                 f"but LLM engine is currently running with TP size {tp_size}."
+             )
+             for tp_rank, layer_maps in self.scaling_factor.items():
+                 assert len(layer_maps) == num_hidden_layers, (
+                     f"KV cache scales map for TP rank {tp_rank} is malformed. "
+                     f"Expected {num_hidden_layers} layers, got "
+                     f"{len(layer_maps)}."
+                 )
+             for i in range(tp_size):
+                 assert (
+                     i in self.scaling_factor
+                 ), f"KV cache scales map for TP rank {i} not found."
+         return self
+
+     @model_validator(mode="after")
+     def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+         context = info.context
+         if context:
+             tp_rank = context["tp_rank"]
+             num_hidden_layers = context["num_hidden_layers"]
+             layer_scales_map = self.scaling_factor[tp_rank]
+             for i in range(num_hidden_layers):
+                 assert i in layer_scales_map, (
+                     f"Could not find KV cache scales for layer {i} in "
+                     f"TP rank {tp_rank}."
+                 )
+         return self
+
+
+ class QuantParamSchema(BaseModel):
+     # TODO: Generalize and extend with more fields
+     # (e.g. weights/activations params) once functionality is enabled
+     model_config = ConfigDict(protected_namespaces=())
+     model_type: Optional[str]
+     kv_cache: KVCacheQuantSchema
+
+     @model_validator(mode="after")
+     def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema":
+         context = info.context
+         if context:
+             model_type = context.get("model_type", None)
+             if model_type is not None:
+                 assert model_type == self.model_type, (
+                     f"Model type is {model_type} but loaded "
+                     f"scaling factors belonging to different "
+                     f"model type {self.model_type}!"
+                 )
+         return self
+
+
  def kv_cache_scales_loader(
      filename: str,
      tp_rank: int,
@@ -681,7 +767,7 @@ def kv_cache_scales_loader(
      except json.JSONDecodeError:
          logger.error("Error decoding JSON in file '%s'.", filename)
      except Exception:
-         logger.exception("An error occurred while reading '%s'.", filename)
+         logger.error("An error occurred while reading '%s'.", filename)
      # This section is reached if and only if any of the excepts are hit
      # Return an empty iterable (list) => no KV cache scales are loaded
      # which ultimately defaults to 1.0 scales
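A rough, hypothetical sketch of how the schemas added above could be exercised directly (the payload and context values are invented; in practice kv_cache_scales_loader reads them from the JSON file it is given):

from sglang.srt.model_loader.weight_utils import QuantParamSchema

# Invented payload for illustration; real scaling factors are produced by a quantization tool.
payload = {
    "model_type": "llama",
    "kv_cache": {
        "dtype": "float8_e4m3fn",
        # TP rank -> {layer index -> per-tensor KV cache scale}
        "scaling_factor": {0: {0: 0.5, 1: 0.75}},
    },
}

schema = QuantParamSchema.model_validate(
    payload,
    context={
        "model_type": "llama",
        "tp_size": 1,
        "tp_rank": 0,
        "num_hidden_layers": 2,
    },
)
print(schema.kv_cache.scaling_factor[0])  # {0: 0.5, 1: 0.75}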
sglang/srt/models/commandr.py CHANGED
@@ -61,7 +61,10 @@ from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.rotary_embedding import get_rope
  from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.model_loader.weight_utils import (
+     default_weight_loader,
+     maybe_remap_kv_scale_name,
+ )
  from sglang.srt.utils import get_compiler_backend, set_weight_attrs
 
 
@@ -372,10 +375,19 @@ class CohereForCausalLM(nn.Module):
                  # Skip loading extra bias for GPTQ models.
                  if name.endswith(".bias") and name not in params_dict:
                      continue
+                 # Remapping the name of FP8 kv-scale.
+                 name = maybe_remap_kv_scale_name(name, params_dict)
+                 if name is None:
+                     continue
+
                  param = params_dict[name]
                  weight_loader = getattr(param, "weight_loader", default_weight_loader)
                  weight_loader(param, loaded_weight)
             loaded_params.add(name)
 
 
- EntryClass = CohereForCausalLM
+ class Cohere2ForCausalLM(CohereForCausalLM):
+     pass
+
+
+ EntryClass = [CohereForCausalLM, Cohere2ForCausalLM]
sglang/srt/models/dbrx.py CHANGED
@@ -42,7 +42,10 @@ from sglang.srt.layers.vocab_parallel_embedding import (
      VocabParallelEmbedding,
  )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.model_loader.weight_utils import (
+     default_weight_loader,
+     maybe_remap_kv_scale_name,
+ )
  from sglang.srt.utils import set_weight_attrs
 
 
@@ -411,6 +414,11 @@ class DbrxForCausalLM(nn.Module):
                  weight_loader(param, loaded_weight, weight_name)
                  break
              else:
+                 # Remapping the name of FP8 kv-scale.
+                 name = maybe_remap_kv_scale_name(name, params_dict)
+                 if name is None:
+                     continue
+
                  param = params_dict[name]
                  weight_loader = getattr(param, "weight_loader", default_weight_loader)
                  weight_loader(param, loaded_weight)
sglang/srt/models/deepseek_v2.py CHANGED
@@ -56,12 +56,12 @@ from sglang.srt.layers.vocab_parallel_embedding import (
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_loader.weight_utils import default_weight_loader
- from sglang.srt.utils import is_flashinfer_available, is_hip
+ from sglang.srt.utils import is_cuda_available, is_hip
 
  is_hip_ = is_hip()
 
- if is_flashinfer_available():
-     from flashinfer import bmm_fp8
+ if is_cuda_available():
+     from sgl_kernel import bmm_fp8
 
 
  class DeepseekV2MLP(nn.Module):
sglang/srt/models/gemma2.py CHANGED
@@ -35,7 +35,10 @@ from sglang.srt.layers.radix_attention import RadixAttention
  from sglang.srt.layers.rotary_embedding import get_rope
  from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.model_loader.weight_utils import (
+     default_weight_loader,
+     maybe_remap_kv_scale_name,
+ )
  from sglang.srt.utils import make_layers
 
 
@@ -424,6 +427,11 @@ class Gemma2ForCausalLM(nn.Module):
                  # Skip loading extra bias for GPTQ models.
                  if name.endswith(".bias") and name not in params_dict:
                      continue
+                 # Remapping the name of FP8 kv-scale.
+                 name = maybe_remap_kv_scale_name(name, params_dict)
+                 if name is None:
+                     continue
+
                  param = params_dict[name]
                  weight_loader = getattr(param, "weight_loader", default_weight_loader)
                  weight_loader(param, loaded_weight)
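The same guard now appears in several load_weights implementations (commandr, dbrx, and gemma2 above). Schematically the pattern is as follows; load_one_weight is a hypothetical helper name used only for illustration:

from sglang.srt.model_loader.weight_utils import (
    default_weight_loader,
    maybe_remap_kv_scale_name,
)


def load_one_weight(name, loaded_weight, params_dict):
    # Skip loading extra bias for GPTQ models.
    if name.endswith(".bias") and name not in params_dict:
        return
    # Remap checkpoint kv-scale entries onto the model's FP8 kv-scale parameters;
    # None means there is no matching parameter and the entry is skipped.
    name = maybe_remap_kv_scale_name(name, params_dict)
    if name is None:
        return
    param = params_dict[name]
    weight_loader = getattr(param, "weight_loader", default_weight_loader)
    weight_loader(param, loaded_weight)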
sglang/srt/models/grok.py CHANGED
@@ -133,6 +133,7 @@ class Grok1MoE(nn.Module):
              renormalize=False,
              quant_config=quant_config,
              tp_size=tp_size,
+             activation="gelu",
              use_presharded_weights=use_presharded_weights,
          )
 
sglang/srt/models/minicpm3.py CHANGED
@@ -40,10 +40,10 @@ from sglang.srt.layers.vocab_parallel_embedding import (
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_loader.weight_utils import default_weight_loader
- from sglang.srt.utils import is_flashinfer_available
+ from sglang.srt.utils import is_cuda_available
 
- if is_flashinfer_available():
-     from flashinfer import bmm_fp8
+ if is_cuda_available():
+     from sgl_kernel import bmm_fp8
 
 
  class MiniCPM3MLP(nn.Module):
sglang/srt/models/minicpmv.py CHANGED
@@ -1,6 +1,6 @@
  # Adapted from
  # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
- # Copyright 2023 The vLLM team.
+ # Copyright 2023 The SGLang team.
  # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
  #
  # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
@@ -20,7 +20,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
- from functools import cached_property, partial
+ from functools import partial
  from typing import (
      Any,
      Callable,
@@ -33,16 +33,13 @@ from typing import (
      Union,
  )
 
+ import numpy as np
  import torch
  import torch.types
  from PIL import Image
  from torch import nn
  from torch.nn.init import trunc_normal_
  from transformers import PretrainedConfig
- from vllm.model_executor.layers.resampler import get_2d_sincos_pos_embed
- from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
- from vllm.model_executor.models.module_mapping import MultiModelKeys
- from vllm.model_executor.sampling_metadata import SamplingMetadata
 
  from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
  from sglang.srt.layers.activation import get_act_fn
@@ -63,6 +60,88 @@ from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
  RawImageType = Union[Image.Image, torch.Tensor]
 
 
+ # sin/cos positional embedding helpers are adapted from:
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+ def get_1d_sincos_pos_embed_from_grid(
+     embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+ ) -> torch.Tensor:
+     """
+     embed_dim: output dimension for each position
+     pos: a list of positions to be encoded: size (M,) / (H, W)
+     out: (M, D) / (H, W, D)
+     """
+     assert embed_dim % 2 == 0
+     omega = np.arange(embed_dim // 2, dtype=np.float32)
+     omega /= embed_dim / 2.0
+     omega = 1.0 / 10000**omega  # (D/2,)
+
+     if version == (2, 0):
+         pos = pos.reshape(-1)  # (M,)
+         out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+         emb_sin = np.sin(out)  # (M, D/2)
+         emb_cos = np.cos(out)  # (M, D/2)
+         emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+     else:
+         out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+         emb_sin = np.sin(out)  # (H, W, D/2)
+         emb_cos = np.cos(out)  # (H, W, D/2)
+         emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+     return emb
+
+
+ def get_2d_sincos_pos_embed_from_grid(
+     embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+ ) -> torch.Tensor:
+     assert embed_dim % 2 == 0
+
+     # use half of dimensions to encode grid_h
+     emb_h = get_1d_sincos_pos_embed_from_grid(
+         embed_dim // 2, grid[0], version
+     )  # (H*W, D/2) or (H, W, D/2)
+     emb_w = get_1d_sincos_pos_embed_from_grid(
+         embed_dim // 2, grid[1], version
+     )  # (H*W, D/2) or (H, W, D/2)
+
+     if version == (2, 0):
+         emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+     else:
+         emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+     return emb
+
+
+ def get_2d_sincos_pos_embed(
+     embed_dim: int,
+     grid_size: Union[int, Tuple[int, int]],
+     cls_token: bool = False,
+     version: Tuple[int, int] = (2, 0),
+ ) -> torch.Tensor:
+     """
+     grid_size: int of the grid height and width
+     return:
+     pos_embed: [grid_size*grid_size, embed_dim] or
+                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+     """
+     if isinstance(grid_size, int):
+         grid_h_size, grid_w_size = grid_size, grid_size
+     else:
+         grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+     grid_h = np.arange(grid_h_size, dtype=np.float32)
+     grid_w = np.arange(grid_w_size, dtype=np.float32)
+     grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+     grid = np.stack(grid, axis=0)
+     assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+     if version == (2, 0):
+         grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+         pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+         if cls_token:
+             pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+     else:
+         pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+     return pos_embed
+
+
  class Idefics2VisionMLP(nn.Module):
 
      def __init__(
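A quick shape check of the helpers defined above (the 14x14 grid and 256-dim embedding are arbitrary choices for illustration):

# The helpers return NumPy arrays despite the torch.Tensor annotation.
# (2, 0)-style layout flattens positions to (H*W, D).
pos_embed = get_2d_sincos_pos_embed(256, (14, 14), version=(2, 0))
assert pos_embed.shape == (14 * 14, 256)

# Any other version keeps the (H, W, D) layout.
pos_embed_hw = get_2d_sincos_pos_embed(256, (14, 14), version=(2, 5))
assert pos_embed_hw.shape == (14, 14, 256)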
@@ -116,6 +195,10 @@ class Idefics2EncoderLayer(nn.Module):
              projection_size=config.intermediate_size,
              use_qkv_parallel=True,
              quant_config=quant_config,
+             dropout=config.attention_dropout,
+             use_context_forward=False,
+             use_full_precision_softmax=True,
+             flatten_batch=False,
              prefix=f"{prefix}.self_attn",
          )
          self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -126,7 +209,6 @@ class Idefics2EncoderLayer(nn.Module):
          self,
          hidden_states: torch.Tensor,
          cu_seqlens: torch.Tensor,
-         forward_batch: ForwardBatch,
      ) -> torch.Tensor:
          """
          Args:
@@ -136,11 +218,8 @@ class Idefics2EncoderLayer(nn.Module):
          """
          residual = hidden_states
          hidden_states = self.layer_norm1(hidden_states)
-         hidden_states = self.self_attn(
-             hidden_states,
-             cu_seqlens=cu_seqlens,
-             # , forward_batch=forward_batch
-         )
+         hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
          hidden_states = residual + hidden_states
          residual = hidden_states
          hidden_states = self.layer_norm2(hidden_states)
@@ -181,7 +260,6 @@ class Idefics2Encoder(nn.Module):
          self,
          inputs_embeds: torch.Tensor,
          cu_seqlens: torch.Tensor,
-         forward_batch: ForwardBatch,
      ) -> torch.Tensor:
          r"""
          Args:
@@ -195,7 +273,8 @@ class Idefics2Encoder(nn.Module):
          hidden_states = inputs_embeds
          for encoder_layer in self.layers:
              layer_outputs = encoder_layer(
-                 hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+                 hidden_states,
+                 cu_seqlens=cu_seqlens,
              )
              hidden_states = layer_outputs
          return hidden_states
@@ -232,19 +311,14 @@ class Idefics2VisionEmbeddings(nn.Module):
          self.num_positions = self.num_patches
          self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
 
-     def forward(
+     def get_position_ids(
          self,
          pixel_values: torch.FloatTensor,
          patch_attention_mask: torch.BoolTensor,
          tgt_sizes: Optional[torch.IntTensor] = None,
-     ) -> torch.Tensor:
+     ):
          batch_size, _, max_im_h, max_im_w = pixel_values.shape
-         target_dtype = self.patch_embedding.weight.dtype
-         pixel_values = pixel_values.to(
-             device=self.patch_embedding.weight.device, dtype=target_dtype
-         )
-         patch_embeds = self.patch_embedding(pixel_values)
-         embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
          max_nb_patches_h, max_nb_patches_w = (
              max_im_h // self.patch_size,
              max_im_w // self.patch_size,
@@ -277,6 +351,24 @@ class Idefics2VisionEmbeddings(nn.Module):
              ).flatten()
              position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
          position_ids = position_ids.to(self.position_embedding.weight.device)
+         return position_ids
+
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         patch_attention_mask: torch.BoolTensor,
+         tgt_sizes: Optional[torch.IntTensor] = None,
+     ) -> torch.Tensor:
+         target_dtype = self.patch_embedding.weight.dtype
+         pixel_values = pixel_values.to(
+             device=self.patch_embedding.weight.device, dtype=target_dtype
+         )
+         patch_embeds = self.patch_embedding(pixel_values)
+         embeddings = patch_embeds.flatten(2).transpose(1, 2)
+         position_ids = self.get_position_ids(
+             pixel_values, patch_attention_mask, tgt_sizes
+         )
+
          embeddings = embeddings + self.position_embedding(position_ids)
          return embeddings
 
@@ -287,7 +379,6 @@ class Idefics2VisionTransformer(nn.Module):
          self,
          config: PretrainedConfig,
          quant_config: Optional[QuantizationConfig] = None,
-         prefix: str = "",
      ) -> None:
          super().__init__()
 
@@ -302,8 +393,6 @@ class Idefics2VisionTransformer(nn.Module):
 
      def compute_cu_seqlens(self, tgt_sizes: torch.Tensor) -> torch.Tensor:
          patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # shape: (batch_size,)
-
-         # do a prefix sum to get cu_seqlens; note that a 0 is inserted at the front as the offset
          cu_seqlens = torch.cat(
              [
                  torch.tensor([0], device=patch_len.device, dtype=torch.int32),
@@ -316,19 +405,18 @@ class Idefics2VisionTransformer(nn.Module):
      def forward(
          self,
          pixel_values,
-         forward_batch: ForwardBatch,
          patch_attention_mask: Optional[torch.BoolTensor] = None,
          tgt_sizes: Optional[torch.IntTensor] = None,
      ) -> torch.Tensor:
          hidden_states = self.embeddings(
              pixel_values=pixel_values,
              patch_attention_mask=patch_attention_mask,
-             # forward_batch=forward_batch,
              tgt_sizes=tgt_sizes,
          )
          cu_seqlens = self.compute_cu_seqlens(tgt_sizes)
          encoder_outputs = self.encoder(
-             hidden_states, cu_seqlens=cu_seqlens, forward_batch=forward_batch
+             hidden_states,
+             cu_seqlens=cu_seqlens,
          )
          last_hidden_state = self.post_layernorm(encoder_outputs)
          return last_hidden_state
@@ -573,14 +661,12 @@ class MiniCPMVBaseModel(nn.Module):
          config: PretrainedConfig,
          quant_config: Optional[QuantizationConfig] = None,
      ):
-         # multimodal_config = config.model_config.multimodal_config
          super().__init__()
          # All MiniCPM-V models disable `tie_word_embeddings` but
          # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
-         # check `tie_word_embeddings` until vLLM integrate MiniCPM-V model
+         # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
          # and config class
          self.config = config
-         # self.multimodal_config = multimodal_config
 
          self.version = get_version_by_config(self.config)
          self.llm = self.init_llm(config=config, quant_config=quant_config)
@@ -598,13 +684,6 @@ class MiniCPMVBaseModel(nn.Module):
 
          self.logits_processor = LogitsProcessor(config)
 
-     @cached_property
-     def sampler(self):
-         if hasattr(self.llm, "sampler"):
-             return self.llm.sampler
-
-         return get_sampler()
-
      def _get_image_bounds(
          self,
          input_ids: torch.Tensor,
@@ -666,7 +745,6 @@ class MiniCPMVBaseModel(nn.Module):
          self,
          input_ids: torch.Tensor,
          image_inputs: Optional[MiniCPMVImageInputs],
-         forward_batch: ForwardBatch,
      ) -> Tuple[torch.Tensor, torch.Tensor]:
          vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)
 
@@ -680,10 +758,7 @@ class MiniCPMVBaseModel(nn.Module):
                  .to(vlm_embedding.device)
              )
          else:
-             vision_hidden_states = self.get_vision_hidden_states(
-                 forward_batch, image_inputs
-             )
-
+             vision_hidden_states = self.get_vision_hidden_states(image_inputs)
              # See NOTE in _parse_and_validate_inputs
              image_bounds = image_inputs["image_bounds"]
              if len(image_bounds) > 0:
@@ -693,6 +768,7 @@ class MiniCPMVBaseModel(nn.Module):
                          for start, end in image_bounds.tolist()
                      ]
                  ).to(vlm_embedding.device)
+
                  vlm_embedding.scatter_(
                      0,
                      image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
@@ -839,7 +915,7 @@ class MiniCPMVBaseModel(nn.Module):
          # There values are useless because their embeddings will be replaced by vision embeddings anyway.
          input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
 
-         vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs, forward_batch)
+         vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)
 
          # always pass the input via `inputs_embeds`
          # to make sure the computation graph is consistent
@@ -857,29 +933,6 @@ class MiniCPMVBaseModel(nn.Module):
              input_ids, hidden_states, self.llm.lm_head, forward_batch
          )
 
-     def compute_logits(
-         self,
-         hidden_states: torch.Tensor,
-         sampling_metadata: SamplingMetadata,
-     ) -> Optional[torch.Tensor]:
-         return self.llm.compute_logits(hidden_states, sampling_metadata)
-
-     def sample(
-         self,
-         logits: torch.Tensor,
-         sampling_metadata: SamplingMetadata,
-     ) -> Optional[SamplerOutput]:
-         next_tokens = self.sampler(logits, sampling_metadata)
-         return next_tokens
-
-     def get_mm_mapping(self) -> MultiModelKeys:
-         """
-         Get the module prefix in multimodal models
-         """
-         return MultiModelKeys.from_string_field(
-             language_model="llm", connector="resampler", tower_model="vpm"
-         )
-
      def init_llm(
          self,
          config: Qwen2Config,
@@ -910,9 +963,7 @@ class MiniCPMVBaseModel(nn.Module):
      ) -> torch.Tensor:
          raise NotImplementedError
 
-     def get_vision_hidden_states(
-         self, forward_batch: ForwardBatch, data: MiniCPMVImageInputs
-     ) -> torch.Tensor:
+     def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor:
          raise NotImplementedError
 
 
@@ -1019,7 +1070,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
 
      def get_vision_hidden_states(
          self,
-         forward_batch: ForwardBatch,
          data: MiniCPMVImageInputs,
      ) -> torch.Tensor:
          pixel_values = data["data"]
@@ -1042,15 +1092,18 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
          patch_attn_mask = torch.zeros(
              (B, 1, max_patches), dtype=torch.bool, device=device
          )
-         for i in range(B):
-             patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
+
+         tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+         mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+         patch_attn_mask[:, 0, :] = torch.arange(
+             patch_attn_mask.size(2), device=patch_attn_mask.device
+         ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
          vision_embedding = self.vpm(
              all_pixel_values.type(dtype),
-             forward_batch=forward_batch,
              patch_attention_mask=patch_attn_mask,
              tgt_sizes=tgt_sizes,
          )
-
          return self.resampler(vision_embedding, tgt_sizes)
 
      def pad_input_ids(self, input_ids: List[int], image_inputs: ImageInputs):
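The batched mask construction above replaces the per-image Python loop with one broadcast comparison. A small standalone check of the equivalence (the patch-grid sizes are invented):

import torch

# Invented example: three images whose (h, w) patch grids flatten to 6, 12 and 4 patches.
tgt_sizes = torch.tensor([[2, 3], [3, 4], [2, 2]])
B, max_patches = tgt_sizes.size(0), 12

# Old formulation: loop over the batch.
mask_loop = torch.zeros((B, 1, max_patches), dtype=torch.bool)
for i in range(B):
    mask_loop[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True

# New formulation: compare arange(max_patches) against the per-image patch counts.
mask_vec = torch.zeros((B, 1, max_patches), dtype=torch.bool)
mask_shapes = tgt_sizes[:, 0] * tgt_sizes[:, 1]
mask_vec[:, 0, :] = torch.arange(max_patches).unsqueeze(0) < mask_shapes.unsqueeze(1)

assert torch.equal(mask_loop, mask_vec)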
@@ -1138,7 +1191,7 @@ class MiniCPMV:
      """
      Different versions of MiniCPMV use different visual encoders and LLMs,
      which is not conducive to the current integration logic of LoRA and
-     bitsandbytes in vLLM. Therefore, it is necessary to separate them.
+     bitsandbytes in SGLang. Therefore, it is necessary to separate them.
      """
 
      # Ensure that the LoRA support check passes when the class is not