sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/expert_location_dispatch.py CHANGED
@@ -25,7 +25,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 class ExpertLocationDispatchInfo:
     ep_dispatch_algorithm: Literal["static", "random"]
     # (num_logical_experts,)
-    partial_logical_to_rank_dispatch_physical_map: torch.Tensor
+    partial_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
     # (num_logical_experts, X)
     partial_logical_to_all_physical_map: torch.Tensor
     # (num_logical_experts,)
@@ -42,9 +42,14 @@ class ExpertLocationDispatchInfo:
 
         return cls(
             ep_dispatch_algorithm=ep_dispatch_algorithm,
-            partial_logical_to_rank_dispatch_physical_map=
-
-
+            partial_logical_to_rank_dispatch_physical_map=(
+                expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                    layer_id, :
+                ]
+                if expert_location_metadata.logical_to_rank_dispatch_physical_map
+                is not None
+                else None
+            ),
             partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
                 layer_id, :
             ],
@@ -55,6 +60,18 @@ class ExpertLocationDispatchInfo:
         )
 
 
+def transform_select_experts_inputs(
+    router_logits: torch.Tensor,
+    correction_bias: Optional[torch.Tensor],
+    info: Optional[ExpertLocationDispatchInfo],
+):
+    if (info is not None) and (info.ep_dispatch_algorithm == "fake"):
+        router_logits = torch.randn_like(router_logits)
+        if correction_bias is not None:
+            correction_bias = torch.zeros_like(correction_bias)
+    return router_logits, correction_bias
+
+
 def topk_ids_logical_to_physical(
     topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
 ) -> torch.Tensor:
@@ -63,9 +80,9 @@ def topk_ids_logical_to_physical(
 
     if info.ep_dispatch_algorithm == "static":
         return _topk_ids_logical_to_physical_static(topk_ids, info)
-    if info.ep_dispatch_algorithm
+    if info.ep_dispatch_algorithm in ["dynamic", "fake"]:
         return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
-    raise NotImplementedError
+    raise NotImplementedError(f"Unknown algorithm {info.ep_dispatch_algorithm}")
 
 
 def _topk_ids_logical_to_physical_static(
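Note: a minimal usage sketch of the new transform_select_experts_inputs helper shown above. The tensor shapes are illustrative assumptions made for this note, not part of the diff.

    import torch
    from sglang.srt.managers.expert_location_dispatch import (
        transform_select_experts_inputs,
    )

    # Illustrative shapes: 4 tokens routed over 8 logical experts.
    router_logits = torch.randn(4, 8)
    correction_bias = torch.zeros(8)

    # With info=None (or any algorithm other than "fake") the inputs pass
    # through unchanged; with a "fake" dispatch info the logits are replaced
    # by random values and the correction bias is zeroed.
    router_logits, correction_bias = transform_select_experts_inputs(
        router_logits, correction_bias, info=None
    )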
sglang/srt/managers/io_struct.py CHANGED
@@ -20,7 +20,7 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List,
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
 
 from sglang.srt.mm_utils import has_valid_data
 
@@ -30,7 +30,7 @@ if TYPE_CHECKING:
 else:
     Image = Any
 
-from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams
 
 
@@ -103,9 +103,12 @@ class GenerateReqInput:
 
     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
-    bootstrap_port: Optional[Union[List[int], int]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
     bootstrap_room: Optional[Union[List[int], int]] = None
 
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
 
@@ -417,6 +420,9 @@ class GenerateReqInput:
             bootstrap_room=(
                 self.bootstrap_room[i] if self.bootstrap_room is not None else None
             ),
+            data_parallel_rank=(
+                self.data_parallel_rank if self.data_parallel_rank is not None else None
+            ),
         )
 
 
@@ -464,6 +470,9 @@ class TokenizedGenerateReqInput:
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
 
 @dataclass
 class EmbeddingReqInput:
@@ -848,7 +857,8 @@ class ProfileReqInput:
     # If it is set, profiling is automatically stopped after this step, and
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
-    activities: Optional[List[
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
 
@@ -875,6 +885,7 @@ class ProfileReq:
     output_dir: Optional[str] = None
     num_steps: Optional[int] = None
     activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
     profile_id: Optional[str] = None
sglang/srt/managers/mm_utils.py CHANGED
@@ -252,40 +252,36 @@ def get_embedding_chunk(
     return embedding_chunk, start_index, end_index
 
 
-def
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_features, return their concatenation.
+    If some but not all have precomputed_features, raise NotImplementedError.
+    If none have precomputed_features, return None.
+    """
+    precomputed_features = [item.precomputed_features for item in items]
+    if any(feature is not None for feature in precomputed_features):
+        if not all(feature is not None for feature in precomputed_features):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_features)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
+
+
+def _get_chunked_prefill_embedding(
     data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
     embedding_items: List[MultimodalDataItem],
-    placeholder_tensor: torch.Tensor,
-    input_ids: torch.Tensor,
     items_size: List[int],
     prefix_length: List[int],
     extend_length: List[int],
     items_offset_list: List[List[Tuple[int, int]]],
-) ->
-
-    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
-
-    Args:
-        data_embedding_func: Function that generates embeddings for multimodal items
-        embedding_items: List of multimodal items to embed
-        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
-        input_ids: The input token IDs tensor
-        items_size: Cumulative sizes of multimodal items per request
-        prefix_length: Prefix lengths for each request
-        extend_length: Sequence lengths for each request
-        items_offset_list: List of offset ranges for multimodal items in each request
-
-    Returns:
-        A tuple containing:
-        - The generated embeddings tensor
-        - A boolean mask tensor indicating where these embeddings should be placed
-
-    Raises:
-        AssertionError: If the number of multimodal tokens in input_ids doesn't match
-        the number of tokens in the generated embeddings
-    """
-    # 1. Get the embedding
-    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
     embedding_list = []
     for i in range(len(items_size) - 1):
         if items_size[i] == items_size[i + 1]:
@@ -321,21 +317,28 @@ def get_embedding_and_mask(
         embedding_cache.free(embedding_items_hash)
         embedding_list.append(embedding_per_req_chunk)
     if len(embedding_list) == 0:
-        return None
-
-
-
-
-
-
-    ).unsqueeze(-1)
+        return None
+    return torch.concat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
 
-
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text. "
             f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
-            "tokens from multimodal embeddings."
+            f"tokens from multimodal embeddings."
         )
         if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
             chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
@@ -353,7 +356,54 @@ def get_embedding_and_mask(
             raise RuntimeError(
                 f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
             )
+    return embedding
+
+
+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
 
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(embedding_items)
+    if embedding is None:
+        embedding = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+        )
+        if embedding is None:
+            return None, None
+    # 2. Get mask
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
     return embedding, special_multimodal_mask
 
 
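Note: a small sketch of the precomputed-feature path factored out above into _get_precomputed_embedding: when every item carries precomputed_features, they are concatenated and flattened to a 2-D (num_mm_tokens, hidden) tensor. The shapes below are illustrative assumptions.

    import torch

    # Two items whose precomputed features are (batch, tokens, hidden) = (1, 4, 16).
    features = [torch.randn(1, 4, 16), torch.randn(1, 4, 16)]

    # Mirrors the helper: concatenate, then flatten leading dims to 2-D, here (8, 16).
    result = torch.concat(features)
    result = result.reshape(-1, result.shape[-1])
    assert result.shape == (8, 16)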
@@ -5,7 +5,8 @@ import multiprocessing as mp
|
|
5
5
|
import os
|
6
6
|
import re
|
7
7
|
from abc import ABC, abstractmethod
|
8
|
-
from
|
8
|
+
from enum import Enum
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
9
10
|
|
10
11
|
import numpy as np
|
11
12
|
import torch
|
@@ -16,16 +17,24 @@ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
|
|
16
17
|
from sglang.srt.utils import encode_video, load_audio, load_image
|
17
18
|
|
18
19
|
|
20
|
+
class MultimodalInputFormat(Enum):
|
21
|
+
"""Enum for different multimodal input formats."""
|
22
|
+
|
23
|
+
RAW_IMAGES = "raw_images"
|
24
|
+
PRECOMPUTED_FEATURES = "precomputed_features"
|
25
|
+
PIXEL_VALUES = "pixel_values"
|
26
|
+
|
27
|
+
|
19
28
|
@dataclasses.dataclass
|
20
29
|
class BaseMultiModalProcessorOutput:
|
21
30
|
# input_text, with each frame of video/image represented with a image_token
|
22
31
|
input_text: str
|
23
32
|
|
24
33
|
# frames loaded from image and video, in given order
|
25
|
-
images: Optional[list[Union[Image.Image,
|
34
|
+
images: Optional[list[Union[Image.Image, dict]]] = None
|
26
35
|
|
27
36
|
# audios
|
28
|
-
audios: Optional[list[Union[np.ndarray,
|
37
|
+
audios: Optional[list[Union[np.ndarray, dict]]] = None
|
29
38
|
|
30
39
|
def normalize(self):
|
31
40
|
for field_name in ["images", "audios"]:
|
@@ -170,8 +179,6 @@ class BaseMultimodalProcessor(ABC):
|
|
170
179
|
):
|
171
180
|
"""Static method that can be pickled for multiprocessing"""
|
172
181
|
if isinstance(data, dict):
|
173
|
-
return MultimodalDataItem.from_dict(data)
|
174
|
-
if isinstance(data, MultimodalDataItem):
|
175
182
|
return data
|
176
183
|
try:
|
177
184
|
if is_audio:
|
@@ -370,15 +377,180 @@ class BaseMultimodalProcessor(ABC):
|
|
370
377
|
|
371
378
|
return list(zip(indices_start.tolist(), indices_end.tolist()))
|
372
379
|
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
)
|
381
|
-
|
382
|
-
|
380
|
+
@staticmethod
|
381
|
+
def _extract_processor_features(
|
382
|
+
items: List[dict], attr_name: str
|
383
|
+
) -> Optional[torch.Tensor]:
|
384
|
+
"""
|
385
|
+
Helper function to concat extracted attributes from processor output.
|
386
|
+
"""
|
387
|
+
values = [value for item in items if (value := item.get(attr_name)) is not None]
|
388
|
+
return torch.cat(values) if values else None
|
389
|
+
|
390
|
+
# When we assume that all the items have the same attributes
|
391
|
+
def _extract_processor_features_from_all_attributes(
|
392
|
+
self, items: List[dict]
|
393
|
+
) -> dict:
|
394
|
+
values = {}
|
395
|
+
# Verify all items have the same keys
|
396
|
+
first_keys = set(items[0].keys())
|
397
|
+
for item in items[1:]:
|
398
|
+
+            if set(item.keys()) != first_keys:
+                raise ValueError(
+                    f"All items must have the same attributes. "
+                    f"First item has {first_keys}, but found {set(item.keys())}"
+                )
+
+        # Process each attribute
+        for k, v in items[0].items():
+            if isinstance(v, list):
+                values[k] = self._extract_processor_features(items, k)
+            else:
+                # Verify all items have the same value for non-list attributes
+                for item in items[1:]:
+                    if item[k] != v:
+                        raise ValueError(
+                            f"All items must have the same value for attribute {k}. "
+                            f"First item has {v}, but found {item[k]}"
+                        )
+                values[k] = v
+        return values
+
+    def process_and_combine_mm_data(
+        self, base_output: BaseMultiModalProcessorOutput
+    ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
+        """
+        Process multimodal data and return the combined multimodal item and input_ids.
+        Handles all three input formats at the same abstraction level.
+
+        Returns:
+            Tuple of (combined_mm_item, input_ids)
+        """
+
+        def tokenize_text(input_text: str) -> torch.Tensor:
+            """Tokenize input text."""
+            return self._processor.tokenizer(
+                input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
+            """Categorize multimodal inputs and validate consistency."""
+            try:
+                has_image = False
+                has_pixel_values = False
+                has_precomputed_features = False
+
+                for mm_input in mm_inputs:
+                    if isinstance(mm_input, Image.Image):
+                        has_image = True
+                    elif isinstance(mm_input, dict):
+                        if mm_input.get("precomputed_features", None) is not None:
+                            has_precomputed_features = True
+                        elif mm_input.get("pixel_values", None) is not None:
+                            has_pixel_values = True
+                        else:
+                            raise ValueError(
+                                f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
+                            )
+                    else:
+                        raise ValueError(
+                            f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
+                        )
+
+                # Validate format consistency
+                format_count = sum(
+                    [has_image, has_pixel_values, has_precomputed_features]
+                )
+                if format_count > 1:
+                    raise ValueError(
+                        "Unsupported: mixture of multimodal input formats. "
+                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
+                        f"precomputed_features={has_precomputed_features}"
+                    )
+
+                if has_image:
+                    return MultimodalInputFormat.RAW_IMAGES
+                elif has_precomputed_features:
+                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
+                elif has_pixel_values:
+                    return MultimodalInputFormat.PIXEL_VALUES
+                else:
+                    raise ValueError("No valid multimodal input format found")
+            except Exception as e:
+                raise ValueError(f"Failed to categorize inputs: {e}")
+
+        def process_raw_images(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process raw Image.Image objects using transformers processor."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                images=base_output.images,
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+
+            # Copy all fields from processor output except input_ids
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
+        def process_precomputed_features(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with precomputed features."""
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+            combined_mm_item.precomputed_features = self._extract_processor_features(
+                base_output.images, "precomputed_features"
+            )
-
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def process_pixel_values(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with pixel values."""
+            values = self._extract_processor_features_from_all_attributes(
+                base_output.images
+            )
+            combined_mm_item = MultimodalDataItem.from_dict(values)
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def finalize_mm_item(
+            combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
+        ) -> MultimodalDataItem:
+            """Apply common post-processing to the multimodal item."""
+            combined_mm_item.image_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=self.IM_TOKEN_ID,
+            )
+            return combined_mm_item
+
+        # Main logic
+        mm_inputs = base_output.images
+        if not mm_inputs:
+            # Return text-only case
+            input_ids = tokenize_text(base_output.input_text)
+            return None, input_ids
+
+        # Categorize input formats
+        input_format = categorize_mm_inputs(mm_inputs)
+
+        # Process based on format
+        if input_format == MultimodalInputFormat.RAW_IMAGES:
+            combined_mm_item, input_ids = process_raw_images(base_output)
+        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
+            combined_mm_item, input_ids = process_precomputed_features(base_output)
+        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
+            combined_mm_item, input_ids = process_pixel_values(base_output)
+        else:
+            raise ValueError(f"Unknown input format: {input_format}")
+
+        # Finalize with common processing
+        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
+        return combined_mm_item, input_ids
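For readers skimming the diff, here is a minimal, self-contained sketch of the three input shapes that the new categorize_mm_inputs dispatch distinguishes: raw PIL images, dicts carrying precomputed_features, and dicts carrying pixel_values. The enum and function below are local stand-ins written only for illustration, not sglang's own definitions; the example assumes Pillow is installed.

# Illustrative stand-in for the format dispatch shown above.
# MultimodalInputFormat here is a local enum, not sglang's class.
from enum import Enum, auto

from PIL import Image


class MultimodalInputFormat(Enum):
    RAW_IMAGES = auto()
    PRECOMPUTED_FEATURES = auto()
    PIXEL_VALUES = auto()


def categorize(mm_inputs):
    # Detect which of the three shapes is present.
    has_image = any(isinstance(x, Image.Image) for x in mm_inputs)
    has_precomputed = any(
        isinstance(x, dict) and x.get("precomputed_features") is not None
        for x in mm_inputs
    )
    has_pixel_values = any(
        isinstance(x, dict) and x.get("pixel_values") is not None for x in mm_inputs
    )
    # Mixing formats in one request is rejected, mirroring the diff above.
    if sum([has_image, has_precomputed, has_pixel_values]) > 1:
        raise ValueError("mixture of multimodal input formats")
    if has_image:
        return MultimodalInputFormat.RAW_IMAGES
    if has_precomputed:
        return MultimodalInputFormat.PRECOMPUTED_FEATURES
    if has_pixel_values:
        return MultimodalInputFormat.PIXEL_VALUES
    raise ValueError("No valid multimodal input format found")


if __name__ == "__main__":
    raw = [Image.new("RGB", (8, 8))]                # -> RAW_IMAGES
    pre = [{"precomputed_features": [[0.1, 0.2]]}]  # -> PRECOMPUTED_FEATURES
    pix = [{"pixel_values": [[[0.0]]]}]             # -> PIXEL_VALUES
    for inputs in (raw, pre, pix):
        print(categorize(inputs))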
@@ -27,6 +27,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index

     async def process_mm_data_async(
         self,
@@ -42,49 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]

-        image_token = self.IMAGE_TOKEN
-        image_token_regex = self.IMAGE_TOKEN_REGEX
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
-                image_token=
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
             ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
         )

-
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=None if images_are_preprocessed else base_output.images,
-        )
-
-        items = []
-        input_ids = ret["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=self.hf_config.image_token_index,
-        )
-        for i, image in enumerate(base_output.images):
-            if images_are_preprocessed:
-                pixel_values = image.pixel_values
-                precomputed_features = image.precomputed_features
-            else:
-                pixel_values = ret["pixel_values"][i]
-                precomputed_features = None
-
-            item = MultimodalDataItem(
-                pixel_values=pixel_values,
-                precomputed_features=precomputed_features,
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets[i],
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)

         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
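The Gemma3 processor now defers to the shared finalize_mm_item path, which derives image offsets from self.IM_TOKEN_ID. As a rough illustration of that idea only (not sglang's actual get_mm_items_offset implementation), the sketch below assumes the goal is one (start, end) index pair per contiguous run of the image token id in input_ids.

# Hypothetical sketch: collapse contiguous runs of mm_token_id in input_ids
# into (start, end) offset pairs. Written for illustration under the stated
# assumption about what the offsets represent.
import torch


def mm_items_offset(input_ids: torch.Tensor, mm_token_id: int):
    mask = (input_ids == mm_token_id).tolist()
    offsets, start = [], None
    for i, hit in enumerate(mask):
        if hit and start is None:
            start = i                      # run begins
        elif not hit and start is not None:
            offsets.append((start, i - 1))  # run ended at previous index
            start = None
    if start is not None:
        offsets.append((start, len(mask) - 1))
    return offsets


print(mm_items_offset(torch.tensor([5, 9, 9, 9, 7, 9, 9]), mm_token_id=9))
# [(1, 3), (5, 6)]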
@@ -175,6 +175,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         if not image_data:
             return None

+        # Ensure image_data is a list
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
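A trivial standalone illustration of the normalization added for InternVL: a bare string is wrapped into a single-element list so the loader can always iterate over image_data. The helper name below is made up for the example.

# Illustrative helper (hypothetical name), mirroring the added guard above.
def normalize_image_data(image_data):
    if isinstance(image_data, str):
        image_data = [image_data]
    return image_data


print(normalize_image_data("demo.png"))          # ['demo.png']
print(normalize_image_data(["a.png", "b.png"]))  # ['a.png', 'b.png']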
@@ -1,4 +1,7 @@
-
+import re
+from typing import Any, Dict, List, Optional, Union
+
+import torch

 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -17,20 +20,12 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<|media_pad|>"
-        self.
-
-        self.im_start = "<|media_start|>"
-        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
-
-        self.im_end = "<|media_end|>"
-        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
-
-        self.im_content = "<|media_content|>"
-        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
+        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)

     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
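A quick, illustrative check of the regex introduced above: it matches an entire run of repeated <|media_pad|> placeholders as a single span, which presumably lets one expanded image region in the prompt map to one multimodal item rather than one per padding token.

# Illustrative check of the pattern added in the KimiVL processor.
import re

IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")

prompt = "describe <|media_pad|><|media_pad|><|media_pad|> please"
print(IMAGE_TOKEN_REGEX.findall(prompt))
# ['<|media_pad|><|media_pad|><|media_pad|>']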
@@ -45,30 +40,16 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
         )
-
-
-
-        )
-        input_ids = ret["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=self.im_token_id,
-        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
         return {
             "input_ids": input_ids.tolist(),
-            "mm_items": [
-
-                pixel_values=ret["pixel_values"],
-                image_grid_thws=ret["image_grid_hws"],
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets,
-            )
-            ],
-            "im_token_id": self.im_token_id,
-            "im_start_id": self.im_start_id,
-            "im_end_id": self.im_end_id,
-            "im_content_id": self.im_content_id,
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_token_id": self.IM_TOKEN_ID,
         }
@@ -42,7 +42,8 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             audio_data=audio_data,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
-                image_token=self.image_token,
+                image_token=self.image_token,
+                audio_token=self.audio_token,
             ),
         )
         if base_output is None: