sglang-0.4.3.post4-py3-none-any.whl → sglang-0.4.4.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/function_call_parser.py +33 -2
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/dp_attention.py +30 -2
- sglang/srt/layers/elementwise.py +411 -0
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/logits_processor.py +1 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/moe/router.py +342 -0
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +74 -8
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +213 -118
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +176 -683
- sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +71 -34
- sglang/srt/mem_cache/memory_pool.py +81 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +68 -20
- sglang/srt/model_executor/forward_batch_info.py +23 -10
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +200 -191
- sglang/srt/models/grok.py +374 -119
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +59 -35
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +24 -16
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/image_processors/qwen_vl.py
ADDED
@@ -0,0 +1,161 @@
+import asyncio
+import math
+from typing import List, Union
+
+from PIL import Image
+
+from sglang.srt.managers.image_processor import BaseImageProcessor
+from sglang.srt.managers.image_processors.base_image_processor import (
+    get_global_processor,
+)
+from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
+from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+
+
+# Compatible with Qwen2VL and Qwen2_5VL
+class Qwen2_5VLImageProcessor(BaseImageProcessor):
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<|vision_start|><|image_pad|><|vision_end|>"
+        self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
+        self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
+        self.image_token_id = hf_config.image_token_id
+        self.video_token_id = hf_config.video_token_id
+        self.NUM_TOKEN_PER_FRAME = 770
+        self.IMAGE_FACTOR = 28
+        self.MIN_PIXELS = 4 * 28 * 28
+        self.MAX_PIXELS = 16384 * 28 * 28
+        self.MAX_PIXELS = 16384 * 28 * 28
+        self.MAX_RATIO = 200
+
+    @staticmethod
+    def _process_images_task(images, input_text, _hf_config):
+        if isinstance(images, list) and len(images) == 0:
+            images = None
+        result = get_global_processor().__call__(
+            text=[input_text], images=images, padding=True, return_tensors="pt"
+        )
+
+        return {
+            "input_ids": result.input_ids,
+            "pixel_values": getattr(result, "pixel_values", None),
+            "image_grid_thw": getattr(result, "image_grid_thw", None),
+            "second_per_grid_ts": getattr(result, "second_per_grid_ts", None),
+            "video_grid_thws": getattr(result, "video_grid_thws", None),
+        }
+
+    async def _process_images(self, images, input_text) -> dict:
+        if self.executor is not None:
+            loop = asyncio.get_event_loop()
+            return await loop.run_in_executor(
+                self.executor,
+                Qwen2_5VLImageProcessor._process_images_task,
+                images,
+                input_text,
+                self.hf_config,
+            )
+        else:
+            return self._process_images_task(images, input_text, self.hf_config)
+
+    async def process_images_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_ids,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        image_token = self.IMAGE_TOKEN
+        base_output = self.load_images(
+            input_ids,
+            image_data,
+            image_token,
+            max_req_input_len,
+        )
+
+        def smart_resize(
+            height: int,
+            width: int,
+            factor: int = self.IMAGE_FACTOR,
+            min_pixels: int = self.MIN_PIXELS,
+            max_pixels: int = self.MAX_PIXELS,
+        ) -> tuple[int, int]:
+            """
+            Rescales the image so that the following conditions are met:
+
+            1. Both dimensions (height and width) are divisible by 'factor'.
+
+            2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+            3. The aspect ratio of the image is maintained as closely as possible.
+            """
+            if max(height, width) / min(height, width) > self.MAX_RATIO:
+                raise ValueError(
+                    f"absolute aspect ratio must be smaller than {self.MAX_RATIO}, got {max(height, width) / min(height, width)}"
+                )
+            h_bar = max(factor, round_by_factor(height, factor))
+            w_bar = max(factor, round_by_factor(width, factor))
+            if h_bar * w_bar > max_pixels:
+                beta = math.sqrt((height * width) / max_pixels)
+                h_bar = floor_by_factor(height / beta, factor)
+                w_bar = floor_by_factor(width / beta, factor)
+            elif h_bar * w_bar < min_pixels:
+                beta = math.sqrt(min_pixels / (height * width))
+                h_bar = ceil_by_factor(height * beta, factor)
+                w_bar = ceil_by_factor(width * beta, factor)
+            return h_bar, w_bar
+
+        def resize_image(image, size_factor: int = self.IMAGE_FACTOR) -> Image.Image:
+            width, height = image.size
+            min_pixels = self.MIN_PIXELS
+            max_pixels = self.MAX_PIXELS
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=size_factor,
+                min_pixels=min_pixels,
+                max_pixels=max_pixels,
+            )
+            image = image.resize((resized_width, resized_height))
+            return image
+
+        def round_by_factor(number: int, factor: int) -> int:
+            """Returns the closest integer to 'number' that is divisible by 'factor'."""
+            return round(number / factor) * factor
+
+        def ceil_by_factor(number: int, factor: int) -> int:
+            """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+            return math.ceil(number / factor) * factor
+
+        def floor_by_factor(number: int, factor: int) -> int:
+            """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+            return math.floor(number / factor) * factor
+
+        images = [resize_image(image) for image in base_output.all_frames]
+
+        ret = await self._process_images(images, base_output.input_text)
+        return {
+            "input_ids": ret["input_ids"].flatten().tolist(),
+            "pixel_values": ret["pixel_values"],
+            "image_hashes": base_output.image_hashes,
+            "modalities": request_obj.modalities or ["image"],
+            "image_grid_thws": ret["image_grid_thw"],
+            "video_grid_thws": ret["video_grid_thws"],
+            "im_start_id": self.IM_START_TOKEN_ID,
+            "im_end_id": self.IM_END_TOKEN_ID,
+            "im_token_id": self.image_token_id,
+            "video_token_id": self.video_token_id,
+            "second_per_grid_ts": ret["second_per_grid_ts"],
+        }
+
+
+ImageProcessorMapping = {
+    Qwen2VLForConditionalGeneration: Qwen2_5VLImageProcessor,
+    Qwen2_5_VLForConditionalGeneration: Qwen2_5VLImageProcessor,
+}
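The core of this new file is `smart_resize`: both sides are snapped to a multiple of the patch factor (28), then uniformly rescaled and re-snapped if the pixel count falls outside the [MIN_PIXELS, MAX_PIXELS] budget. A minimal standalone sketch of the same arithmetic, outside the class (the 1024x768 input is a made-up example, not from the diff):

import math

# Standalone sketch of the smart_resize arithmetic from the new file above.
# Defaults mirror IMAGE_FACTOR / MIN_PIXELS / MAX_PIXELS.
def round_by_factor(number: float, factor: int) -> int:
    return round(number / factor) * factor

def smart_resize(height: int, width: int, factor: int = 28,
                 min_pixels: int = 4 * 28 * 28,
                 max_pixels: int = 16384 * 28 * 28) -> tuple[int, int]:
    # Snap both sides to the nearest multiple of `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    # If outside the pixel budget, rescale uniformly and snap again:
    # floor when shrinking, ceil when growing, so the result stays in range.
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

print(smart_resize(1024, 768))  # (1036, 756): both divisible by 28

Flooring on the way down and ceiling on the way up is what guarantees the snapped result still respects the budget, at the cost of a slight aspect-ratio drift.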
sglang/srt/managers/io_struct.py
CHANGED
@@ -293,6 +293,8 @@ class TokenizedGenerateReqInput:
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
     text: Optional[Union[List[str], str]] = None
+    # The image input. It can be a file name, a url, or base64 encoded string.
+    image_data: Optional[Union[List[str], str]] = None
     # The token ids for text; one can either specify text or input_ids.
     input_ids: Optional[Union[List[List[int]], List[int]]] = None
     # The request id.
@@ -303,28 +305,40 @@ class EmbeddingReqInput:
     input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
     # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
    log_metrics: bool = True
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None

     def normalize_batch_and_arguments(self):
-
-
-
-
+        # at least one of text, input_ids, or image should be provided
+        if self.text is None and self.input_ids is None and self.image_data is None:
+            raise ValueError(
+                "At least one of text, input_ids, or image should be provided"
+            )
+
+        # text and input_ids cannot be provided at the same time
+        if self.text is not None and self.input_ids is not None:
+            raise ValueError("text and input_ids cannot be provided at the same time")

         # Derive the batch size
+        self.batch_size = 0
+        self.is_single = True
+
+        # check the batch size of text
         if self.text is not None:
-            if isinstance(self.text,
-            self.
-            self.batch_size = 1
+            if isinstance(self.text, list):
+                self.batch_size += len(self.text)
             else:
-            self.
-
-
-
-
-            self.batch_size
+                self.batch_size += 1
+
+        # check the batch size of input_ids
+        if self.input_ids is not None:
+            if isinstance(self.input_ids[0], list):
+                self.batch_size += len(self.input_ids)
             else:
-            self.
-
+                self.batch_size += 1
+
+        if self.batch_size > 1:
+            self.is_single = False

         # Fill in default arguments
         if self.is_single:
@@ -352,6 +366,7 @@ class EmbeddingReqInput:
         return EmbeddingReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            image_data=self.image_data[i] if self.image_data is not None else None,
             sampling_params=self.sampling_params[i],
             rid=self.rid[i],
         )
@@ -365,6 +380,8 @@ class TokenizedEmbeddingReqInput:
     input_text: str
     # The input token ids
     input_ids: List[int]
+    # The image inputs
+    image_inputs: dict
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams

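The rewritten `normalize_batch_and_arguments` makes the batch size additive over `text` and `input_ids` and, together with the new `image_data` field, admits image-only embedding requests. A hypothetical standalone condensation of just the new derivation rules (the real logic is a method on `EmbeddingReqInput` and goes on to normalize `rid` and `sampling_params`):

from typing import List, Optional, Union

# Hypothetical free-function version of the batch-size rules added above.
def derive_batch(
    text: Optional[Union[List[str], str]] = None,
    input_ids: Optional[Union[List[List[int]], List[int]]] = None,
    image_data: Optional[Union[List[str], str]] = None,
) -> tuple[int, bool]:
    if text is None and input_ids is None and image_data is None:
        raise ValueError("At least one of text, input_ids, or image should be provided")
    if text is not None and input_ids is not None:
        raise ValueError("text and input_ids cannot be provided at the same time")
    batch_size = 0
    if text is not None:
        batch_size += len(text) if isinstance(text, list) else 1
    if input_ids is not None:
        batch_size += len(input_ids) if isinstance(input_ids[0], list) else 1
    return batch_size, batch_size <= 1  # (batch_size, is_single)

print(derive_batch(text=["hello", "world"]))  # (2, False)
print(derive_batch(input_ids=[1, 2, 3]))      # (1, True)
print(derive_batch(image_data="cat.png"))     # (0, True): image-only request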
sglang/srt/managers/multi_modality_padding.py
ADDED
@@ -0,0 +1,134 @@
+from abc import abstractmethod
+from typing import Callable, List, Optional, Tuple
+
+from sglang.srt.managers.schedule_batch import ImageInputs
+from sglang.utils import logger
+
+
+class MultiModalityDataPaddingPattern:
+    """
+    Data tokens (like image tokens) often need special handling during padding
+    to maintain model compatibility. This class provides the interface for
+    implementing different padding strategies for data tokens
+    """
+
+    @abstractmethod
+    def pad_input_tokens(
+        self, input_ids: List[int], image_inputs: ImageInputs
+    ) -> List[int]:
+        """
+        Pad the input ids sequence containing data tokens, and replace them with pad_values
+        """
+        pass
+
+
+class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern):
+    """In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)
+
+    This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
+    """
+
+    def __init__(self, data_token_pairs: Optional[List[Tuple[int, int]]]) -> None:
+        self.data_token_id_pairs = data_token_pairs
+
+    def pad_input_tokens(
+        self, input_ids: List[int], image_inputs: ImageInputs
+    ) -> List[int]:
+        """
+        This function will replace the data-tokens in between with pad_values accordingly
+        """
+        pad_values = image_inputs.pad_values
+        data_token_pairs = self.data_token_id_pairs
+        image_inputs.image_offsets = []
+        if data_token_pairs is None:
+            data_token_pairs = [image_inputs.im_start_id, image_inputs.im_end_id]
+        if data_token_pairs is None:
+            logger.warning(
+                "No data_token_pairs provided, RadixAttention might be influenced."
+            )
+            return input_ids
+        start_token_ids = [s for s, _e in data_token_pairs]
+        end_tokens_ids = [e for _s, e in data_token_pairs]
+        # First start token marks new data
+        data_start_token = start_token_ids[0]
+
+        padded_ids = []
+        last_idx = 0
+        data_idx = -1
+
+        start_indices = [i for i, x in enumerate(input_ids) if x in start_token_ids]
+        end_indices = [i for i, x in enumerate(input_ids) if x in end_tokens_ids]
+
+        if len(start_indices) != len(end_indices):
+            return input_ids
+
+        for start_idx, end_idx in zip(start_indices, end_indices):
+            padded_ids.extend(input_ids[last_idx : start_idx + 1])
+
+            if input_ids[start_idx] == data_start_token:
+                data_idx += 1
+                image_inputs.image_offsets += [start_idx]
+
+            num_tokens = end_idx - start_idx - 1
+            pad_value = pad_values[data_idx]
+            padded_ids.extend([pad_value] * num_tokens)
+
+            last_idx = end_idx
+
+        padded_ids.extend(input_ids[last_idx:])
+
+        assert len(input_ids) == len(padded_ids)
+        return padded_ids
+
+
+class MultModalityDataPaddingPatternSingleToken(MultiModalityDataPaddingPattern):
+    """In this pattern, data is represented with a special token_id ( image_inputs.im_token_id ),
+    which needs first to be expanded to multiple tokens, then replaced with their padding values
+
+    This strategy should be used when a single data token represents content that should
+    be expanded to multiple tokens during processing.
+    """
+
+    def __init__(
+        self, num_data_token_calc_func: Callable[[Tuple[int, int, int]], int]
+    ) -> None:
+        self.num_data_token_calc_func = num_data_token_calc_func
+
+    def pad_input_tokens(
+        self, input_ids: List[int], image_inputs: ImageInputs
+    ) -> List[int]:
+        """
+        This function will follow the procedure of:
+        1. the data token will be expanded, of which the final number will be calculated by `num_data_token_calc_func`
+        2. the padded data tokens will be replaced with their pad_values
+        """
+        image_grid_thws = image_inputs.image_grid_thws
+        pad_values = image_inputs.pad_values
+
+        image_indices = [
+            idx
+            for idx, token in enumerate(input_ids)
+            if token == image_inputs.im_token_id
+        ]
+
+        image_inputs.image_offsets = []
+
+        input_ids_with_image = []
+        for image_cnt, _ in enumerate(image_grid_thws):
+            print(f"image_cnt {image_cnt}")
+            num_image_tokens = self.num_data_token_calc_func(image_grid_thws[image_cnt])
+            if image_cnt == 0:
+                non_image_tokens = input_ids[: image_indices[image_cnt]]
+            else:
+                non_image_tokens = input_ids[
+                    image_indices[image_cnt - 1] + 1 : image_indices[image_cnt]
+                ]
+            input_ids_with_image.extend(non_image_tokens)
+            image_inputs.image_offsets.append(len(input_ids_with_image))
+            pad_ids = pad_values * (
+                (num_image_tokens + len(pad_values)) // len(pad_values)
+            )
+            input_ids_with_image.extend(pad_ids[:num_image_tokens])
+        input_ids_with_image.extend(input_ids[image_indices[-1] + 1 :])
+
+        return input_ids_with_image
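As a usage illustration for the token-pair pattern: constructing a real `ImageInputs` requires the scheduler machinery, so the sketch below substitutes a `SimpleNamespace` carrying only the attributes the pattern reads and writes (`pad_values`, `image_offsets`); every token id is invented for the example.

from types import SimpleNamespace

from sglang.srt.managers.multi_modality_padding import (
    MultiModalityDataPaddingPatternTokenPairs,
)

# Invented ids: 100 = vision start, 101 = vision end; 7777 stands in for the
# image-hash-derived pad value that tells one image from another.
image_inputs = SimpleNamespace(pad_values=[7777])
pattern = MultiModalityDataPaddingPatternTokenPairs(data_token_pairs=[(100, 101)])

input_ids = [1, 2, 100, 51, 52, 53, 101, 3]
padded = pattern.pad_input_tokens(input_ids, image_inputs)
print(padded)                      # [1, 2, 100, 7777, 7777, 7777, 101, 3]
print(image_inputs.image_offsets)  # [2]: index of the start token

The text and the start/end markers survive unchanged; only the tokens between each pair are overwritten with that image's pad value, which is what lets RadixAttention-style prefix caching distinguish requests that contain different images.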