sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. sglang/srt/configs/model_config.py +24 -1
  2. sglang/srt/conversation.py +21 -2
  3. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  4. sglang/srt/disaggregation/ascend/conn.py +44 -0
  5. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  6. sglang/srt/disaggregation/mooncake/conn.py +15 -14
  7. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  8. sglang/srt/disaggregation/utils.py +25 -3
  9. sglang/srt/entrypoints/engine.py +1 -1
  10. sglang/srt/entrypoints/http_server.py +1 -0
  11. sglang/srt/entrypoints/openai/protocol.py +11 -0
  12. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/kimik2_detector.py +220 -0
  15. sglang/srt/hf_transformers_utils.py +18 -0
  16. sglang/srt/jinja_template_utils.py +8 -0
  17. sglang/srt/layers/communicator.py +17 -4
  18. sglang/srt/layers/linear.py +12 -2
  19. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  20. sglang/srt/layers/moe/ep_moe/layer.py +2 -1
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
  22. sglang/srt/layers/moe/topk.py +8 -2
  23. sglang/srt/layers/parameter.py +19 -3
  24. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  25. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  26. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  27. sglang/srt/managers/io_struct.py +27 -2
  28. sglang/srt/managers/mm_utils.py +55 -94
  29. sglang/srt/managers/schedule_batch.py +16 -5
  30. sglang/srt/managers/scheduler.py +21 -1
  31. sglang/srt/managers/tokenizer_manager.py +16 -0
  32. sglang/srt/mem_cache/memory_pool.py +65 -40
  33. sglang/srt/model_executor/forward_batch_info.py +13 -1
  34. sglang/srt/model_loader/loader.py +23 -12
  35. sglang/srt/models/deepseek_janus_pro.py +1 -1
  36. sglang/srt/models/deepseek_v2.py +62 -17
  37. sglang/srt/models/deepseek_vl2.py +1 -1
  38. sglang/srt/models/gemma3_mm.py +1 -1
  39. sglang/srt/models/gemma3n_mm.py +6 -3
  40. sglang/srt/models/internvl.py +8 -2
  41. sglang/srt/models/kimi_vl.py +8 -2
  42. sglang/srt/models/llama.py +2 -0
  43. sglang/srt/models/llava.py +3 -1
  44. sglang/srt/models/llavavid.py +1 -1
  45. sglang/srt/models/minicpmo.py +1 -2
  46. sglang/srt/models/minicpmv.py +1 -1
  47. sglang/srt/models/mixtral_quant.py +4 -0
  48. sglang/srt/models/mllama4.py +13 -4
  49. sglang/srt/models/phi4mm.py +8 -2
  50. sglang/srt/models/phimoe.py +553 -0
  51. sglang/srt/models/qwen2.py +2 -0
  52. sglang/srt/models/qwen2_5_vl.py +10 -7
  53. sglang/srt/models/qwen2_vl.py +12 -1
  54. sglang/srt/models/vila.py +8 -2
  55. sglang/srt/multimodal/processors/base_processor.py +197 -137
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  57. sglang/srt/multimodal/processors/gemma3.py +4 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  59. sglang/srt/multimodal/processors/internvl.py +1 -1
  60. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  61. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  62. sglang/srt/multimodal/processors/minicpm.py +4 -3
  63. sglang/srt/multimodal/processors/mllama4.py +1 -1
  64. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  65. sglang/srt/multimodal/processors/pixtral.py +1 -1
  66. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  67. sglang/srt/multimodal/processors/vila.py +1 -1
  68. sglang/srt/server_args.py +11 -4
  69. sglang/srt/utils.py +154 -31
  70. sglang/version.py +1 -1
  71. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
  72. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
  73. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  75. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0

sglang/srt/multimodal/processors/base_processor.py

@@ -5,7 +5,7 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from enum import Enum
+from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -14,7 +14,7 @@ from PIL import Image
 from transformers import BaseImageProcessorFast
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
-from sglang.srt.utils import encode_video, load_audio, load_image
+from sglang.srt.utils import load_audio, load_image, load_video, logger
 
 
 @dataclasses.dataclass
@@ -25,14 +25,22 @@ class BaseMultiModalProcessorOutput:
     # frames loaded from image and video, in given order
     images: Optional[list[Union[Image.Image, dict]]] = None
 
+    # videos
+    videos: Optional[list[Union[torch.Tensor, dict]]] = None
+
     # audios
     audios: Optional[list[Union[np.ndarray, dict]]] = None
 
-    def normalize(self):
-        for field_name in ["images", "audios"]:
-            field = getattr(self, field_name, None)
-            if field is not None and isinstance(field, list) and len(field) == 0:
-                setattr(self, field_name, None)
+    def organize_results(self) -> List[Tuple[Modality, Any]]:
+        """
+
+        :return: a list of results, with their corresponding modalities
+        """
+        return (
+            [(Modality.IMAGE, data) for data in self.images]
+            + [(Modality.VIDEO, data) for data in self.videos]
+            + [(Modality.AUDIO, data) for data in self.audios]
+        )
 
 
 @dataclasses.dataclass
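
Illustrative sketch (editorial, not part of the diff): organize_results() replaces the old normalize() pass with a flat, modality-tagged list, so downstream code no longer infers modality from each item's Python type. Assuming the dataclass above is in scope:

    from PIL import Image
    import numpy as np

    out = BaseMultiModalProcessorOutput(
        input_text="<image><audio>",
        images=[Image.new("RGB", (8, 8))],
        videos=[],
        audios=[np.zeros(16000)],
    )
    # -> [(Modality.IMAGE, <PIL image>), (Modality.AUDIO, <ndarray>)]
    print(out.organize_results())
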
@@ -41,6 +49,10 @@ class MultimodalSpecialTokens:
     video_token: Optional[Union[int, str, List[str]]] = None
     audio_token: Optional[Union[int, str, List[str]]] = None
 
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
     def convert_to_str(self, token: Union[str, int], processor) -> str:
         if token is None:
             return token
@@ -53,11 +65,29 @@ class MultimodalSpecialTokens:
         self.video_token = self.convert_to_str(self.video_token, processor)
         self.audio_token = self.convert_to_str(self.audio_token, processor)
 
-    image_token_regex: Optional[re.Pattern] = None
-    video_token_regex: Optional[re.Pattern] = None
-    audio_token_regex: Optional[re.Pattern] = None
-
-    def __post_init__(self):
+    def get_modality_of_token(self, token) -> Optional[Modality]:
+        """
+        :return: the modality associated with the given token, if the token is a special_token or matches with the multimodal token regex
+        """
+        modality = {
+            self.image_token: Modality.IMAGE,
+            self.video_token: Modality.VIDEO,
+            self.audio_token: Modality.AUDIO,
+        }.get(token)
+        if modality:
+            return modality
+
+        for regex, modality in [
+            (self.image_token_regex, Modality.IMAGE),
+            (self.video_token_regex, Modality.VIDEO),
+            (self.audio_token_regex, Modality.AUDIO),
+        ]:
+            if regex and regex.match(token):
+                return modality
+
+        return None
+
+    def parse_regex(self):
         if self.image_token_regex is None and self.image_token is not None:
             self.image_token_regex = re.compile(re.escape(self.image_token))
         if self.video_token_regex is None and self.video_token is not None:
@@ -65,7 +95,7 @@ class MultimodalSpecialTokens:
         if self.audio_token_regex is None and self.audio_token is not None:
             self.audio_token_regex = re.compile(re.escape(self.audio_token))
 
-    def collect(self) -> re.Pattern:
+    def combine_regex(self) -> re.Pattern:
         tokens = [
             self.image_token_regex,
             self.video_token_regex,
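
Illustrative sketch (editorial, not part of the diff) of how the renamed helpers compose; the token strings are invented, and combine_regex() is assumed to wrap the union in a capturing group, which the re.split() call in load_mm_data relies on to keep the matched tokens:

    import re

    tokens = MultimodalSpecialTokens(image_token="<image>", audio_token="<audio>")
    tokens.parse_regex()               # compile per-modality patterns
    pattern = tokens.combine_regex()   # one alternation over all modalities

    parts = re.split(pattern, "describe <image> then transcribe <audio>")
    # ['describe ', '<image>', ' then transcribe ', '<audio>', '']
    for part in parts:
        print(repr(part), tokens.get_modality_of_token(part))
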
@@ -105,6 +135,7 @@ class BaseMultimodalProcessor(ABC):
         self.ATTR_NAME_TO_MODALITY = {
             # Image-related attributes
             "pixel_values": Modality.IMAGE,
+            "pixel_values_videos": Modality.VIDEO,
             "image_sizes": Modality.IMAGE,
             "image_grid_thw": Modality.IMAGE,
             "image_emb_mask": Modality.IMAGE,
@@ -120,7 +151,7 @@ class BaseMultimodalProcessor(ABC):
             "input_features": Modality.AUDIO,
             "input_features_mask": Modality.AUDIO,
             # Video-related attributes
-            "video_grid_thws": Modality.VIDEO,
+            "video_grid_thw": Modality.VIDEO,
             # Generic attributes that could apply to multiple modalities
             # "precomputed_features" - handled specially as it can be any modality
         }
@@ -196,20 +227,25 @@ class BaseMultimodalProcessor(ABC):
 
     @staticmethod
     def _load_single_item(
-        data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
+        data, modality: Modality, frame_count_limit=None, discard_alpha_channel=True
     ):
-        """Static method that can be pickled for multiprocessing"""
+        """
+        Load a single multimodal data.
+
+        If data is precomputed, returns directly.
+
+        Static method that can be pickled for multiprocessing"""
         if isinstance(data, dict):
             return data
         try:
-            if is_audio:
-                return load_audio(data)
-            elif is_video:
-                path = data[len("video:") :]
-                return encode_video(path, frame_count_limit)
-            else:
+            if modality == Modality.IMAGE:
                 img, _ = load_image(data)
                 return img.convert("RGB") if discard_alpha_channel else img
+            elif modality == Modality.VIDEO:
+                return load_video(data, frame_count_limit)
+            elif modality == Modality.AUDIO:
+                return load_audio(data)
+
         except Exception as e:
             raise RuntimeError(f"Error while loading data {data}: {e}")
 
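
The two boolean flags collapse into a single Modality argument that selects the loader branch explicitly. Hypothetical calls (editorial; file names invented, and precomputed dict inputs still short-circuit and return unchanged):

    audio = BaseMultimodalProcessor._load_single_item("speech.wav", Modality.AUDIO)
    frames = BaseMultimodalProcessor._load_single_item(
        "clip.mp4", Modality.VIDEO, frame_count_limit=30
    )
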
@@ -217,75 +253,78 @@ class BaseMultimodalProcessor(ABC):
         self,
         text_parts: List[str],
         multimodal_tokens: MultimodalSpecialTokens,
-        image_data: Optional[list] = None,
-        audio_data: Optional[list] = None,
+        data_iterators: dict,
         discard_alpha_channel: bool = True,
-    ):
+        image_estimated_frames_iter: Optional[iter] = None,
+        image_scaling_factor: float = 1.0,
+        max_image_frames: int = 30,
+    ) -> Tuple[List, List]:
         """
-        load multimodal data parallelly
+        load multimodal data parallelly using iterators.
         """
-
-        # TODO(mick): load from server_args, env, or sampling_params
-        MAX_NUM_FRAMES = 30
-        estimated_frames_list = self.get_estimated_frames_list(image_data=image_data)
-        total_frame_count = sum(estimated_frames_list)
-        # a heuristic value, suggesting the maximum fraction of frames to embed from all visual inputs.
-        # e.g., 0.1 suggests that 1 frame out of 10 input frames should be used
-        scaling_factor = min(1.0, MAX_NUM_FRAMES / max(1, total_frame_count))
-
-        assert len(image_data) == len(estimated_frames_list)
-        # Submit all tasks
         futures = []
         task_info = []
-        image_index, audio_index = 0, 0
 
         for text_part in text_parts:
-            if (
-                multimodal_tokens.image_token_regex
-                and multimodal_tokens.image_token_regex.match(text_part)
-            ):
-                data = image_data[image_index]
-                is_video = isinstance(data, str) and data.startswith("video:")
-                estimated_frames = estimated_frames_list[image_index]
-                frame_count_limit = max(1, int(estimated_frames * scaling_factor))
+            modality = multimodal_tokens.get_modality_of_token(text_part)
+            if modality is not None:
+                data_iterator = data_iterators.get(modality)
+                if data_iterator is None:
+                    raise ValueError(f"No data iterator found for token: {text_part}")
+
+                try:
+                    data = next(data_iterator)
+                except StopIteration:
+                    raise ValueError(
+                        f"Mismatch: More '{text_part}' tokens found than corresponding data items provided."
+                    )
+
+                frame_count_limit = None
+                if modality == Modality.IMAGE and image_estimated_frames_iter:
+                    try:
+                        estimated_frames = next(image_estimated_frames_iter)
+                        # Use the pre-calculated scaling factor and max frames
+                        frame_count_limit = max(
+                            1, int(estimated_frames * image_scaling_factor)
+                        )
+                        # Ensure we don't exceed the absolute max (redundant if scaling_factor handles it)
+                        # frame_count_limit = min(frame_count_limit, max_image_frames)
+                    except StopIteration:
+                        raise ValueError(
+                            "Mismatch between image tokens and estimated frame counts."
+                        )
+
                 futures.append(
                     self.io_executor.submit(
                         BaseMultimodalProcessor._load_single_item,
                         data,
-                        is_video,
-                        False,
+                        modality,
                         frame_count_limit,
                         discard_alpha_channel,
                     )
                 )
-                task_info.append((Modality.IMAGE, data, frame_count_limit))
-                image_index += 1
-            elif (
-                multimodal_tokens.audio_token_regex
-                and multimodal_tokens.audio_token_regex.match(text_part)
-            ):
-                data = audio_data[audio_index]
-                futures.append(
-                    self.io_executor.submit(
-                        BaseMultimodalProcessor._load_single_item,
-                        data,
-                        False,
-                        True,
-                        None,
-                        discard_alpha_channel,
-                    )
-                )
-                task_info.append((Modality.AUDIO, data, None))
-                audio_index += 1
+                task_info.append((modality, data, frame_count_limit))
+
+        for modality, iterator in data_iterators.items():
+            try:
+                next(iterator)
+                logger.warning(
+                    f"Warning: More {modality.name.lower()} data items provided than corresponding tokens found in the prompt."
+                )
+            except StopIteration:
+                pass
+            except Exception:
+                pass
 
         return futures, task_info
 
     def load_mm_data(
         self,
-        prompt: str | List[int],
+        prompt: str,
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
+        video_data: Optional[list] = None,
         audio_data: Optional[list] = None,
         return_text: Optional[bool] = True,
         discard_alpha_channel: bool = True,
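
The per-modality index counters become iterators, which makes count mismatches explicit: a surplus token raises, surplus data only logs a warning. A standalone sketch of that handshake (editorial; token strings stand in for the Modality keys):

    from typing import Dict, Iterator, List

    def match_tokens_to_data(
        text_parts: List[str], data_iterators: Dict[str, Iterator]
    ) -> List[tuple]:
        matched = []
        for part in text_parts:
            it = data_iterators.get(part)
            if it is None:
                continue  # plain text between multimodal tokens
            try:
                matched.append((part, next(it)))
            except StopIteration:
                raise ValueError(f"more '{part}' tokens than data items")
        # leftover data items mean fewer tokens than data: warn only
        for token, it in data_iterators.items():
            if next(it, None) is not None:
                print(f"warning: extra data for {token} ignored")
        return matched

    print(match_tokens_to_data(["hi ", "<img>", ""], {"<img>": iter(["a.jpg"])}))
    # -> [('<img>', 'a.jpg')]
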
@@ -299,14 +338,9 @@ class BaseMultimodalProcessor(ABC):
         discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-        if not return_text:
-            raise NotImplementedError()
-        if image_data is None:
-            image_data = []
-
         multimodal_tokens.convert_to_strs(self._processor)
-        multimodal_tokens_pattern = multimodal_tokens.collect()
-
+        multimodal_tokens.parse_regex()
+        multimodal_tokens_pattern = multimodal_tokens.combine_regex()
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
             prompt = self._processor.tokenizer.decode(prompt)
@@ -317,59 +351,84 @@ class BaseMultimodalProcessor(ABC):
         # split text into list of normal text and special tokens
         text_parts = re.split(multimodal_tokens_pattern, prompt)
 
+        # collect all data
+        data_iterators = {}
+        if multimodal_tokens.image_token and image_data:
+            data_iterators[Modality.IMAGE] = iter(image_data)
+        if multimodal_tokens.video_token and video_data:
+            data_iterators[Modality.VIDEO] = iter(video_data)
+        if multimodal_tokens.audio_token and audio_data:
+            data_iterators[Modality.AUDIO] = iter(audio_data)
+
+        # futures: the futures of loaded data
+        # task_info: modality, raw_data, and other metadata of each data
         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
             multimodal_tokens=multimodal_tokens,
-            image_data=image_data,
-            audio_data=audio_data,
+            data_iterators=data_iterators,
             discard_alpha_channel=discard_alpha_channel,
         )
-        # Process results
-        images, audios = [], []
-        new_text = ""
-        task_ptr = 0
+        task_info_iter = iter(task_info)
+        futures_iter = iter(futures)
 
+        # Process results
+        images, videos, audios = [], [], []
+        new_text_parts = []
         for text_part in text_parts:
-            if multimodal_tokens_pattern.match(text_part):
-                task_type, data, frame_limit = task_info[task_ptr]
-                result = futures[task_ptr].result()
-                task_ptr += 1
-
-                if task_type == Modality.IMAGE:
-                    # If data is already processed it will be a
-                    # dictionary. In this case we want to keep the
-                    # expanded tokens in text_part. Otherwise, we will
-                    # call the processor code, so keep only a single image
-                    # token.
-                    mm_tokens = (
-                        text_part
-                        if isinstance(data, dict)
-                        else multimodal_tokens.image_token
-                    )
-                    frames = [result] if not isinstance(result, list) else result
-                    if frames:
-                        images += frames
-                        new_text += mm_tokens * len(frames)
-                elif task_type == Modality.AUDIO:
-                    # audio
-                    mm_tokens = (
-                        text_part
-                        if isinstance(data, dict)
-                        else multimodal_tokens.audio_token
-                    )
-                    audios.append(result)
-                    new_text += mm_tokens
-                # TODO: handle video
-            else:
-                new_text += text_part
-
-        out = BaseMultiModalProcessorOutput(
-            input_text=new_text,
+            try:
+                if multimodal_tokens_pattern.match(text_part):
+                    modality, raw_data, frame_limit = next(task_info_iter)
+                    is_precomputed = isinstance(raw_data, dict)
+                    result = next(futures_iter).result()
+
+                    if modality == Modality.IMAGE:
+                        # If data is already processed it will be a
+                        # dictionary(precomputed). In this case we want to keep the
+                        # expanded tokens in text_part. Otherwise, we will
+                        # call the processor code, so keep only a single image
+                        # token.
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.image_token
+                        )
+                        frames = [result] if not isinstance(result, list) else result
+                        if frames:
+                            # only for minicpmv
+                            images += frames
+                            new_text_parts += mm_tokens * len(frames)
+                    elif modality == Modality.VIDEO:
+                        # load as video
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.video_token
+                        )
+                        videos += [result]
+                        new_text_parts += mm_tokens
+                    elif modality == Modality.AUDIO:
+                        # audio
+                        mm_tokens = (
+                            text_part
+                            if is_precomputed
+                            else multimodal_tokens.audio_token
+                        )
+                        audios += [result]
+                        new_text_parts += mm_tokens
+                else:
+                    # normal text
+                    new_text_parts += [text_part]
+            except Exception as e:
+                raise RuntimeError(
+                    f"An exception occurred while loading multimodal data: {e}"
+                )
+        return BaseMultiModalProcessorOutput(
             images=images,
             audios=audios,
+            videos=videos,
+            input_text="".join(new_text_parts),
         )
-        out.normalize()
-        return out
 
     @staticmethod
     def get_mm_items_offset(
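
Caller-side effect of the rework (editorial sketch; the processor instance, tokens, and paths are invented): video inputs now travel through their own video_data argument and come back in a populated videos field instead of falling into the old "# TODO: handle video" branch:

    base_output = processor.load_mm_data(
        prompt="compare <image> with <video>",
        multimodal_tokens=MultimodalSpecialTokens(
            image_token="<image>", video_token="<video>"
        ),
        max_req_input_len=4096,
        image_data=["photo.jpg"],
        video_data=["clip.mp4"],
    )
    # base_output.images -> [PIL image], base_output.videos -> [loaded video]
    # base_output.input_text is rebuilt via "".join(new_text_parts)
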
@@ -460,21 +519,19 @@ class BaseMultimodalProcessor(ABC):
                 )
             except ValueError:
                 modality = Modality.IMAGE
-
             if modality:
                 # Create item if needed
                 if modality not in items:
                     items[modality] = MultimodalDataItem(modality=modality)
 
                 # Set attribute
-                if hasattr(items[modality], attr_name):
-                    setattr(items[modality], attr_name, value)
+                setattr(items[modality], attr_name, value)
 
         return list(items.values())
 
     def _process_and_collect_mm_items(
         self, input_text: str, images=None, audios=None, videos=None, **kwargs
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Helper method to process multimodal data and create mm_items in one step.
 
@@ -488,11 +545,11 @@ class BaseMultimodalProcessor(ABC):
         input_ids = ret["input_ids"].flatten()
         collected_items = self.collect_mm_items_from_processor_output(ret)
 
-        return collected_items, input_ids
+        return collected_items, input_ids, ret
 
     def process_and_combine_mm_data(
         self, base_output: BaseMultiModalProcessorOutput
-    ) -> Tuple[List[MultimodalDataItem], torch.Tensor]:
+    ) -> Tuple[List[MultimodalDataItem], torch.Tensor, dict]:
         """
         Process multimodal data and return the combined multimodal items and input_ids.
         Supports mixed modalities (images and audio in the same request).
@@ -501,8 +558,7 @@ class BaseMultimodalProcessor(ABC):
            Tuple of (list of mm_items, input_ids)
         """
         # Collect all items and categorize them
-        all_items = (base_output.images or []) + (base_output.audios or [])
-
+        all_items = base_output.organize_results()
         # Handle text-only case
         if not all_items:
             input_ids = self._processor.tokenizer(
@@ -510,19 +566,20 @@ class BaseMultimodalProcessor(ABC):
                 return_tensors="pt",
                 add_special_tokens=True,
             ).input_ids.flatten()
-            return [], input_ids
+            return [], input_ids, {}
 
-        dict_items, raw_images, raw_audios = [], [], []
-        for item in all_items:
+        dict_items, raw_images, raw_audios, raw_videos = [], [], [], []
+        for modality, item in all_items:
             if isinstance(item, dict):
                 dict_items.append(item)
-            elif isinstance(item, Image.Image):
+            elif modality == Modality.IMAGE:
                 raw_images.append(item)
-            elif isinstance(item, np.ndarray):
+            elif modality == Modality.AUDIO:
                 raw_audios.append(item)
+            elif modality == Modality.VIDEO:
+                raw_videos.append(item)
             else:
                 raise ValueError(f"Unknown multimodal item type: {type(item)}")
-
         # Process items and get input_ids
         all_collected_items = []
         input_ids = None
@@ -534,13 +591,16 @@ class BaseMultimodalProcessor(ABC):
             )
 
         # Handle raw items (need processing)
-        if raw_images or raw_audios:
-            collected_items, input_ids = self._process_and_collect_mm_items(
+        if raw_images or raw_audios or raw_videos:
+            collected_items, input_ids, ret = self._process_and_collect_mm_items(
                 input_text=base_output.input_text,
                 images=raw_images,
                 audios=raw_audios,
+                videos=raw_videos,
             )
             all_collected_items.extend(collected_items)
+        else:
+            ret = None
 
         # Fallback tokenization if no raw items were processed
         if input_ids is None:
@@ -553,21 +613,21 @@ class BaseMultimodalProcessor(ABC):
         # Add offsets to all items
         for mm_item in all_collected_items:
             if mm_item.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]:
-                mm_item.image_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.IM_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.AUDIO:
-                mm_item.audio_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.AUDIO_TOKEN_ID,
                 )
             elif mm_item.modality == Modality.VIDEO:
-                mm_item.video_offsets = self.get_mm_items_offset(
+                mm_item.offsets = self.get_mm_items_offset(
                     input_ids=input_ids,
                     mm_token_id=self.VIDEO_TOKEN_ID,
                 )
             else:
                 raise ValueError(f"Unknown modality: {mm_item.modality}")
 
-        return all_collected_items, input_ids
+        return all_collected_items, input_ids, ret
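
The offset hunks above, and every per-model hunk below, follow from one schema change: MultimodalDataItem's image_offsets / audio_offsets / video_offsets fields merge into a single offsets field, with the modality recorded separately. A sketch (editorial; the tensor and the (start, end) span format are assumptions):

    import torch

    image_features = torch.zeros(1, 3, 224, 224)  # stand-in processor output
    image_offsets = [(5, 261)]                    # assumed (start, end) spans
    item = MultimodalDataItem(
        pixel_values=image_features,
        modality=Modality.IMAGE,
        offsets=image_offsets,  # the value image_offsets used to carry
    )
    # Consumers branch on item.modality rather than on which offsets
    # attribute happens to be set.
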

sglang/srt/multimodal/processors/deepseek_vl_v2.py

@@ -69,7 +69,7 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         )
         item = MultimodalDataItem(
             pixel_values=res["images"],
-            image_offsets=image_offsets,
+            offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,

sglang/srt/multimodal/processors/gemma3.py

@@ -36,6 +36,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
+        print(f"{image_data=}")
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -46,8 +47,9 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
             discard_alpha_channel=True,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
-
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
+        print(f"{base_output=}")
+        print(f"{mm_items=}")
         return {
             "input_ids": input_ids.tolist(),
             "mm_items": mm_items,

sglang/srt/multimodal/processors/gemma3n.py

@@ -72,7 +72,7 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
             ),
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/internvl.py

@@ -225,7 +225,7 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=pixel_values,
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 

sglang/srt/multimodal/processors/janus_pro.py

@@ -49,7 +49,7 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=res["pixel_values"],
                 image_emb_mask=res["images_emb_mask"],
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ],

sglang/srt/multimodal/processors/kimi_vl.py

@@ -39,7 +39,7 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
             max_req_input_len=max_req_input_len,
         )
 
-        mm_items, input_ids = self.process_and_combine_mm_data(base_output)
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(base_output)
 
         return {
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/minicpm.py

@@ -19,6 +19,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         super().__init__(hf_config, server_args, _processor)
         self.image_token = "(<image>./</image>)"
         self.audio_token = "(<audio>./</audio>)"
+        self.video_token = "(<video>./</video>)"
 
     async def process_mm_data_async(
         self,
@@ -36,6 +37,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
                 image_token=self.image_token,
+                video_token=self.video_token,
                 audio_token=self.audio_token,
             ),
         )
@@ -113,7 +115,7 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
         if len(pixel_values) != 0:
             item = MultimodalDataItem(
                 pixel_values=pixel_values,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 tgt_size=tgt_sizes_flat,
                 modality=Modality.IMAGE,
             )
@@ -135,11 +137,10 @@ class MiniCPMMultimodalProcessor(BaseMultimodalProcessor):
             item = MultimodalDataItem(
                 audio_features=[res["audio_features"]],
                 audio_feature_lens=res["audio_feature_lens"],
-                audio_offsets=audio_offsets,
+                offsets=audio_offsets,
                 modality=Modality.AUDIO,
             )
             items += [item]
-
         return {
             "mm_items": items,
             "input_ids": input_ids.tolist(),

sglang/srt/multimodal/processors/mllama4.py

@@ -144,7 +144,7 @@ class Mllama4ImageProcessor(BaseMultimodalProcessor):
             MultimodalDataItem(
                 pixel_values=processor_output["pixel_values"],
                 modality=Modality.IMAGE,
-                image_offsets=image_offsets,
+                offsets=image_offsets,
             )
         ]
 

sglang/srt/multimodal/processors/phi4mm.py

@@ -65,7 +65,7 @@ class Phi4MMImageProcessor(BaseMultimodalProcessor):
                 pixel_values=res["input_image_embeds"],
                 image_sizes=res["image_sizes"],
                 image_emb_mask=res["image_attention_mask"],
-                image_offsets=image_offsets,
+                offsets=image_offsets,
                 modality=Modality.IMAGE,
             )
         ]