sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -3,32 +3,41 @@ import concurrent.futures
 import dataclasses
 import multiprocessing as mp
 import os
+import re
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
-from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image
 
 
+class MultimodalInputFormat(Enum):
+    """Enum for different multimodal input formats."""
+
+    RAW_IMAGES = "raw_images"
+    PRECOMPUTED_FEATURES = "precomputed_features"
+    PIXEL_VALUES = "pixel_values"
+
+
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
     # frames loaded from image and video, in given order
-    images: Optional[list[PIL.Image]] = None
+    images: Optional[list[Union[Image.Image, dict]]] = None
 
     # audios
-    audios: Optional[list[np.ndarray]] = None
+    audios: Optional[list[Union[np.ndarray, dict]]] = None
 
     def normalize(self):
-        for field_name in ["image_sizes", "images", "audios"]:
+        for field_name in ["images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -36,16 +45,48 @@ class BaseMultiModalProcessorOutput:
 
 @dataclasses.dataclass
 class MultimodalSpecialTokens:
-    image_token: Optional[str] = None
-    video_token: Optional[str] = None
-    audio_token: Optional[str] = None
-
-    def collect(self) -> list[str]:
-        return [
-            token
-            for token in [self.image_token, self.video_token, self.audio_token]
-            if token
+    image_token: Optional[Union[int, str, List[str]]] = None
+    video_token: Optional[Union[int, str, List[str]]] = None
+    audio_token: Optional[Union[int, str, List[str]]] = None
+
+    def convert_to_str(self, token: Union[str, int], processor) -> str:
+        if token is None:
+            return token
+        if isinstance(token, str):
+            return token
+        return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+    def convert_to_strs(self, processor):
+        self.image_token = self.convert_to_str(self.image_token, processor)
+        self.video_token = self.convert_to_str(self.video_token, processor)
+        self.audio_token = self.convert_to_str(self.audio_token, processor)
+
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
+    def __post_init__(self):
+        if self.image_token_regex is None and self.image_token is not None:
+            self.image_token_regex = re.compile(re.escape(self.image_token))
+        if self.video_token_regex is None and self.video_token is not None:
+            self.video_token_regex = re.compile(re.escape(self.video_token))
+        if self.audio_token_regex is None and self.audio_token is not None:
+            self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+    def collect(self) -> re.Pattern:
+        tokens = [
+            self.image_token_regex,
+            self.video_token_regex,
+            self.audio_token_regex,
         ]
+        patterns = []
+        flags = 0
+        for t in tokens:
+            if t is not None:
+                patterns.append(t.pattern)
+                flags |= t.flags
+        combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+        return re.compile(combined, flags)
 
 
 class BaseMultimodalProcessor(ABC):
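As an aside, a minimal sketch of how the combined pattern built by the new MultimodalSpecialTokens.collect() is meant to behave: because the alternation is wrapped in a single capturing group, re.split keeps the matched special tokens in the result, so a prompt decomposes into alternating plain-text and token parts. The token strings below are placeholders, not tied to any particular model.

    import re

    # Hypothetical special tokens; real values come from the model's processor.
    image_token = "<image>"
    audio_token = "<audio>"

    # Same construction as MultimodalSpecialTokens.collect(): one capturing
    # group wrapping the alternation of per-modality patterns.
    patterns = [re.escape(image_token), re.escape(audio_token)]
    combined = re.compile("(" + "|".join(f"(?:{p})" for p in patterns) + ")")

    prompt = "Describe <image> and transcribe <audio> please."
    print(re.split(combined, prompt))
    # ['Describe ', '<image>', ' and transcribe ', '<audio>', ' please.']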
@@ -54,6 +95,7 @@ class BaseMultimodalProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
@@ -136,6 +178,8 @@ class BaseMultimodalProcessor(ABC):
         data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
     ):
         """Static method that can be pickled for multiprocessing"""
+        if isinstance(data, dict):
+            return data
         try:
             if is_audio:
                 return load_audio(data)
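The early dict return above is what lets callers hand the processor already-processed inputs instead of URLs or raw bytes. A small sketch of the idea, assuming illustrative tensor shapes and a stand-in helper name (the dict keys are the ones later checked by process_and_combine_mm_data):

    import torch

    # Hypothetical pre-processed entries passed in place of a URL or raw bytes.
    precomputed_item = {"precomputed_features": torch.randn(1, 256, 1152)}  # made-up shape
    pixel_values_item = {"pixel_values": torch.randn(1, 3, 384, 384)}       # made-up shape

    def load_single_item(data):
        # Mirrors the new early return in _load_single_item: dicts are treated
        # as already-loaded inputs and bypass load_image/load_audio entirely.
        if isinstance(data, dict):
            return data
        raise NotImplementedError("raw paths/bytes still go through the loaders")

    assert load_single_item(precomputed_item) is precomputed_item
    assert load_single_item(pixel_values_item) is pixel_values_item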
@@ -175,7 +219,10 @@ class BaseMultimodalProcessor(ABC):
         image_index, audio_index = 0, 0
 
         for text_part in text_parts:
-            if text_part == multimodal_tokens.image_token:
+            if (
+                multimodal_tokens.image_token_regex
+                and multimodal_tokens.image_token_regex.match(text_part)
+            ):
                 data = image_data[image_index]
                 is_video = isinstance(data, str) and data.startswith("video:")
                 estimated_frames = estimated_frames_list[image_index]
@@ -192,7 +239,10 @@ class BaseMultimodalProcessor(ABC):
                 )
                 task_info.append((Modality.IMAGE, data, frame_count_limit))
                 image_index += 1
-            elif text_part == multimodal_tokens.audio_token:
+            elif (
+                multimodal_tokens.audio_token_regex
+                and multimodal_tokens.audio_token_regex.match(text_part)
+            ):
                 data = audio_data[audio_index]
                 futures.append(
                     self.io_executor.submit(
@@ -228,17 +278,13 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-
+        if not return_text:
+            raise NotImplementedError()
         if image_data is None:
             image_data = []
-        if isinstance(multimodal_tokens.image_token, int):
-            multimodal_tokens.image_token = (
-                self._processor.tokenizer.convert_ids_to_tokens(
-                    multimodal_tokens.image_token
-                )
-            )
-        else:
-            multimodal_tokens.image_token = multimodal_tokens.image_token
+
+        multimodal_tokens.convert_to_strs(self._processor)
+        multimodal_tokens_pattern = multimodal_tokens.collect()
 
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
@@ -247,16 +293,8 @@ class BaseMultimodalProcessor(ABC):
             prompt = prompt
 
         assert isinstance(prompt, str)
-        if return_text:
-            import re
-
-            pattern = (
-                "("
-                + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
-                + ")"
-            )
-            # split text into list of normal text and special tokens
-            text_parts = re.split(pattern, prompt)
+        # split text into list of normal text and special tokens
+        text_parts = re.split(multimodal_tokens_pattern, prompt)
 
         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
@@ -266,34 +304,253 @@ class BaseMultimodalProcessor(ABC):
266
304
  discard_alpha_channel=discard_alpha_channel,
267
305
  )
268
306
  # Process results
269
- image_sizes, images, audios = [], [], []
307
+ images, audios = [], []
270
308
  new_text = ""
271
309
  task_ptr = 0
272
310
 
273
311
  for text_part in text_parts:
274
- if text_part in multimodal_tokens.collect():
312
+ if multimodal_tokens_pattern.match(text_part):
275
313
  task_type, data, frame_limit = task_info[task_ptr]
276
314
  result = futures[task_ptr].result()
277
315
  task_ptr += 1
278
316
 
279
317
  if task_type == Modality.IMAGE:
318
+ # If data is already processed it will be a
319
+ # dictionary. In this case we want to keep the
320
+ # expanded tokens in text_part. Otherwise, we will
321
+ # call the processor code, so keep only a single image
322
+ # token.
323
+ mm_tokens = (
324
+ text_part
325
+ if isinstance(data, dict)
326
+ else multimodal_tokens.image_token
327
+ )
280
328
  frames = [result] if not isinstance(result, list) else result
281
329
  if frames:
282
- image_sizes += frames[0].size * len(frames)
283
330
  images += frames
284
- new_text += multimodal_tokens.image_token * len(frames)
331
+ new_text += mm_tokens * len(frames)
285
332
  elif task_type == Modality.AUDIO:
286
333
  # audio
334
+ mm_tokens = (
335
+ text_part
336
+ if isinstance(data, dict)
337
+ else multimodal_tokens.audio_token
338
+ )
287
339
  audios.append(result)
288
- new_text += multimodal_tokens.audio_token
340
+ new_text += mm_tokens
289
341
  # TODO: handle video
290
342
  else:
291
343
  new_text += text_part
292
344
 
293
345
  out = BaseMultiModalProcessorOutput(
346
+ input_text=new_text,
294
347
  images=images,
295
348
  audios=audios,
296
- input_text=new_text,
297
349
  )
298
350
  out.normalize()
299
351
  return out
352
+
353
+ @staticmethod
354
+ def get_mm_items_offset(
355
+ input_ids: torch.Tensor, mm_token_id: int
356
+ ) -> List[Tuple[int, int]]:
357
+ """
358
+ Get a set of range for mm_items from input_ids
359
+ Example:
360
+ input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
361
+ mm_token_id = 3
362
+ return result = [(2,4),(6,7)]
363
+ """
364
+ mask = input_ids == mm_token_id
365
+
366
+ start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
367
+ end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
368
+
369
+ return list(zip(start_positions.tolist(), end_positions.tolist()))
370
+
371
+ @staticmethod
372
+ def get_mm_items_offset_by_pair(
373
+ input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
374
+ ) -> List[Tuple[int, int]]:
375
+ indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
376
+ indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
377
+
378
+ return list(zip(indices_start.tolist(), indices_end.tolist()))
379
+
380
+ @staticmethod
381
+ def _extract_processor_features(
382
+ items: List[dict], attr_name: str
383
+ ) -> Optional[torch.Tensor]:
384
+ """
385
+ Helper function to concat extracted attributes from processor output.
386
+ """
387
+ values = [value for item in items if (value := item.get(attr_name)) is not None]
388
+ return torch.cat(values) if values else None
389
+
390
+ # When we assume that all the items have the same attributes
391
+ def _extract_processor_features_from_all_attributes(
392
+ self, items: List[dict]
393
+ ) -> dict:
394
+ values = {}
395
+ # Verify all items have the same keys
396
+ first_keys = set(items[0].keys())
397
+ for item in items[1:]:
398
+ if set(item.keys()) != first_keys:
399
+ raise ValueError(
400
+ f"All items must have the same attributes. "
401
+ f"First item has {first_keys}, but found {set(item.keys())}"
402
+ )
403
+
404
+ # Process each attribute
405
+ for k, v in items[0].items():
406
+ if isinstance(v, list):
407
+ values[k] = self._extract_processor_features(items, k)
408
+ else:
409
+ # Verify all items have the same value for non-list attributes
410
+ for item in items[1:]:
411
+ if item[k] != v:
412
+ raise ValueError(
413
+ f"All items must have the same value for attribute {k}. "
414
+ f"First item has {v}, but found {item[k]}"
415
+ )
416
+ values[k] = v
417
+ return values
418
+
419
+ def process_and_combine_mm_data(
420
+ self, base_output: BaseMultiModalProcessorOutput
421
+ ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
422
+ """
423
+ Process multimodal data and return the combined multimodal item and input_ids.
424
+ Handles all three input formats at the same abstraction level.
425
+
426
+ Returns:
427
+ Tuple of (combined_mm_item, input_ids)
428
+ """
429
+
430
+ def tokenize_text(input_text: str) -> torch.Tensor:
431
+ """Tokenize input text."""
432
+ return self._processor.tokenizer(
433
+ input_text,
434
+ return_tensors="pt",
435
+ add_special_tokens=True,
436
+ ).input_ids.flatten()
437
+
438
+ def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
439
+ """Categorize multimodal inputs and validate consistency."""
440
+ try:
441
+ has_image = False
442
+ has_pixel_values = False
443
+ has_precomputed_features = False
444
+
445
+ for mm_input in mm_inputs:
446
+ if isinstance(mm_input, Image.Image):
447
+ has_image = True
448
+ elif isinstance(mm_input, dict):
449
+ if mm_input.get("precomputed_features", None) is not None:
450
+ has_precomputed_features = True
451
+ elif mm_input.get("pixel_values", None) is not None:
452
+ has_pixel_values = True
453
+ else:
454
+ raise ValueError(
455
+ f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
456
+ )
457
+ else:
458
+ raise ValueError(
459
+ f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
460
+ )
461
+
462
+ # Validate format consistency
463
+ format_count = sum(
464
+ [has_image, has_pixel_values, has_precomputed_features]
465
+ )
466
+ if format_count > 1:
467
+ raise ValueError(
468
+ "Unsupported: mixture of multimodal input formats. "
469
+ f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
470
+ f"precomputed_features={has_precomputed_features}"
471
+ )
472
+
473
+ if has_image:
474
+ return MultimodalInputFormat.RAW_IMAGES
475
+ elif has_precomputed_features:
476
+ return MultimodalInputFormat.PRECOMPUTED_FEATURES
477
+ elif has_pixel_values:
478
+ return MultimodalInputFormat.PIXEL_VALUES
479
+ else:
480
+ raise ValueError("No valid multimodal input format found")
481
+ except Exception as e:
482
+ raise ValueError(f"Failed to categorize inputs: {e}")
483
+
484
+ def process_raw_images(
485
+ base_output: BaseMultiModalProcessorOutput,
486
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
487
+ """Process raw Image.Image objects using transformers processor."""
488
+ ret = self.process_mm_data(
489
+ input_text=base_output.input_text,
490
+ images=base_output.images,
491
+ )
492
+ combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
493
+
494
+ # Copy all fields from processor output except input_ids
495
+ for key, value in ret.items():
496
+ if key != "input_ids" and hasattr(combined_mm_item, key):
497
+ setattr(combined_mm_item, key, value)
498
+
499
+ input_ids = ret["input_ids"].flatten()
500
+ return combined_mm_item, input_ids
501
+
502
+ def process_precomputed_features(
503
+ base_output: BaseMultiModalProcessorOutput,
504
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
505
+ """Process inputs with precomputed features."""
506
+ combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
507
+ combined_mm_item.precomputed_features = self._extract_processor_features(
508
+ base_output.images, "precomputed_features"
509
+ )
510
+ input_ids = tokenize_text(base_output.input_text)
511
+ return combined_mm_item, input_ids
512
+
513
+ def process_pixel_values(
514
+ base_output: BaseMultiModalProcessorOutput,
515
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
516
+ """Process inputs with pixel values."""
517
+ values = self._extract_processor_features_from_all_attributes(
518
+ base_output.images
519
+ )
520
+ combined_mm_item = MultimodalDataItem.from_dict(values)
521
+ input_ids = tokenize_text(base_output.input_text)
522
+ return combined_mm_item, input_ids
523
+
524
+ def finalize_mm_item(
525
+ combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
526
+ ) -> MultimodalDataItem:
527
+ """Apply common post-processing to the multimodal item."""
528
+ combined_mm_item.image_offsets = self.get_mm_items_offset(
529
+ input_ids=input_ids,
530
+ mm_token_id=self.IM_TOKEN_ID,
531
+ )
532
+ return combined_mm_item
533
+
534
+ # Main logic
535
+ mm_inputs = base_output.images
536
+ if not mm_inputs:
537
+ # Return text-only case
538
+ input_ids = tokenize_text(base_output.input_text)
539
+ return None, input_ids
540
+
541
+ # Categorize input formats
542
+ input_format = categorize_mm_inputs(mm_inputs)
543
+
544
+ # Process based on format
545
+ if input_format == MultimodalInputFormat.RAW_IMAGES:
546
+ combined_mm_item, input_ids = process_raw_images(base_output)
547
+ elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
548
+ combined_mm_item, input_ids = process_precomputed_features(base_output)
549
+ elif input_format == MultimodalInputFormat.PIXEL_VALUES:
550
+ combined_mm_item, input_ids = process_pixel_values(base_output)
551
+ else:
552
+ raise ValueError(f"Unknown input format: {input_format}")
553
+
554
+ # Finalize with common processing
555
+ combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
556
+ return combined_mm_item, input_ids
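Illustrative note (not part of the package diff): the helper added above is what the model-specific processors further down now delegate to. A minimal sketch of the calling pattern, modeled on the Gemma3 and KimiVL changes in this diff; the DummySGLangImageProcessor class, its "<image>" token, the image_token_index config field, and the **kwargs passthrough are assumptions for the sketch, not sglang API guarantees.

# Sketch only: a hypothetical processor delegating to process_and_combine_mm_data().
from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor as SGLangBaseProcessor,
    MultimodalSpecialTokens,  # assumed to live in the same module
)


class DummySGLangImageProcessor(SGLangBaseProcessor):
    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # IM_TOKEN_ID is what finalize_mm_item() uses to locate image offsets.
        self.IM_TOKEN_ID = hf_config.image_token_index  # assumed config field

    async def process_mm_data_async(
        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
    ):
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token="<image>"),
            max_req_input_len=max_req_input_len,
        )
        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
        return {
            "input_ids": input_ids.tolist(),
            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
            "im_token_id": self.IM_TOKEN_ID,
        }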
@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
         items = []
+        input_ids = res["input_ids"]
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=self._processor.image_token_id
+        )
         item = MultimodalDataItem(
             pixel_values=res["images"],
+            image_offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
 
         return {
             "mm_items": items,
-            "input_ids": res["input_ids"].tolist(),
+            "input_ids": input_ids.tolist(),
             "im_token_id": self._processor.image_token_id,
         }
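Illustrative note (not part of the package diff): several hunks in this section now attach image_offsets computed from the prompt's input_ids. The sketch below is a rough standalone approximation of what such an offset helper produces (contiguous runs of the image token id); it is not sglang's get_mm_items_offset implementation.

import torch


def mm_token_offsets(input_ids: torch.Tensor, mm_token_id: int):
    """Return (start, end) index pairs for each contiguous run of mm_token_id."""
    positions = (input_ids == mm_token_id).nonzero(as_tuple=True)[0].tolist()
    offsets = []
    for pos in positions:
        if offsets and pos == offsets[-1][1] + 1:
            offsets[-1] = (offsets[-1][0], pos)  # extend the current run
        else:
            offsets.append((pos, pos))  # start a new run
    return offsets


# Token id 7 stands in for an image token id here.
print(mm_token_offsets(torch.tensor([1, 7, 7, 7, 2, 7, 7, 3]), 7))  # [(1, 3), (5, 6)]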
@@ -1,4 +1,5 @@
-from typing import List, Union
+import re
+from typing import Dict, List, Union
 
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,19 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<start_of_image>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -36,30 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
        )
 
-        ret = self.process_mm_data(
-            input_text=base_output.input_text, images=base_output.images
-        )
-
-        items = []
-        for i, image in enumerate(base_output.images):
-            item = MultimodalDataItem(
-                pixel_values=ret["pixel_values"][i],
-                modality=Modality.IMAGE,
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
-            "mm_items": items,
-            "input_ids": ret["input_ids"].flatten().tolist(),
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
            "im_start_id": self.IM_START_TOKEN_ID,
            "im_end_id": self.IM_END_TOKEN_ID,
        }
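Illustrative note (not part of the package diff): the new IMAGE_TOKEN_REGEX lets Gemma3 treat either a bare <start_of_image> marker or an already expanded image block as a single placeholder when splitting the prompt. A quick check of that behavior; the prompt strings below are made up for illustration.

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

bare = "caption this: <start_of_image>"
expanded = (
    "caption this: <start_of_image>"
    "<image_soft_token><image_soft_token><end_of_image>"
)

print(IMAGE_TOKEN_REGEX.findall(bare))      # ['<start_of_image>']
print(IMAGE_TOKEN_REGEX.findall(expanded))  # one match covering the whole expanded block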
@@ -3,7 +3,6 @@
 import numpy as np
 import torch
 from decord import VideoReader, cpu
-from numpy.distutils.cpuinfo import cpu
 from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
@@ -176,6 +175,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         if not image_data:
             return None
 
+        # Ensure image_data is a list
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -210,7 +213,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             return None
 
         pixel_values = torch.cat(pixel_values, dim=0)
-        items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]
 
         for idx, num_patches in enumerate(num_patches_list):
             image_tokens = (
@@ -221,10 +223,21 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             input_text = input_text.replace("<image>", image_tokens, 1)
 
         tokenizer = self._processor
+        input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.img_context_token_id,
+        )
+        items = [
+            MultimodalDataItem(
+                pixel_values=pixel_values,
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+            )
+        ]
+
         return {
-            "input_ids": tokenizer(input_text, return_tensors="pt")["input_ids"]
-            .flatten()
-            .tolist(),
+            "input_ids": input_ids.tolist(),
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             prompt=base_out.input_text,
             images=images,
         )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=processor.image_id
+        )
         return {
             "mm_items": [
                 MultimodalDataItem(
                     pixel_values=res["pixel_values"],
                     image_emb_mask=res["images_emb_mask"],
+                    image_offsets=image_offsets,
                     modality=Modality.IMAGE,
                 )
             ],
-            "input_ids": res["input_ids"].flatten().tolist(),
+            "input_ids": input_ids.tolist(),
             "im_start_id": processor.image_start_id,
             "im_end_id": processor.image_end_id,
             "im_token_id": processor.image_id,
@@ -1,9 +1,7 @@
-import asyncio
-import math
-from typing import List, Union
+import re
+from typing import Any, Dict, List, Optional, Union
 
 import torch
-from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -22,20 +20,12 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<|media_pad|>"
-        self.im_token_id = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
-
-        self.im_start = "<|media_start|>"
-        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
-
-        self.im_end = "<|media_end|>"
-        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
-
-        self.im_content = "<|media_content|>"
-        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
+        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -50,24 +40,16 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
         )
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=base_output.images,
-        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
         return {
-            "input_ids": ret["input_ids"].flatten().tolist(),
-            "mm_items": [
-                MultimodalDataItem(
-                    pixel_values=ret["pixel_values"],
-                    image_grid_thws=ret["image_grid_hws"],
-                    modality=Modality.IMAGE,
-                )
-            ],
-            "im_token_id": self.im_token_id,
-            "im_start_id": self.im_start_id,
-            "im_end_id": self.im_end_id,
-            "im_content_id": self.im_content_id,
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_token_id": self.IM_TOKEN_ID,
         }
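Illustrative note (not part of the package diff): KimiVL's regex collapses a run of <|media_pad|> tokens into one placeholder match, which is what lets an already expanded image region map to a single multimodal item. A small check of that behavior; the prompt string below is made up for illustration.

import re

IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")

prompt = "<|media_start|>" + "<|media_pad|>" * 3 + "<|media_end|> describe this"
print(IMAGE_TOKEN_REGEX.findall(prompt))
# ['<|media_pad|><|media_pad|><|media_pad|>']  -> one run, not three separate matches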