PyPI - vllm-cpu - Versions diffs - 0.12.0__cp313-cp313-manylinux_2_17_aarch64.whl - Mend

vllm-cpu 0.12.0__cp313-cp313-manylinux_2_17_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1600) hide show

vllm/transformers_utils/processors/deepseek_vl2.py ADDED Viewed

@@ -0,0 +1,406 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+# coding=utf-8
+# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+import math
+from typing import Any
+import torch
+import torchvision.transforms as T
+from PIL import Image, ImageOps
+from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+class ImageTransform:
+    """Callable that turns a PIL image into a tensor, optionally normalized."""
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        """Build the torchvision pipeline.
+        Args:
+            mean: per-channel mean handed to ``T.Normalize``.
+            std: per-channel std handed to ``T.Normalize``.
+            normalize: when False the pipeline is ``T.ToTensor()`` only.
+        """
+        self.mean, self.std, self.normalize = mean, std, normalize
+        # ToTensor always runs first; Normalize is appended only on request
+        if normalize:
+            steps = [T.ToTensor(), T.Normalize(mean, std)]
+        else:
+            steps = [T.ToTensor()]
+        self.transform = T.Compose(steps)
+    def __call__(self, pil_img: Image.Image):
+        """Apply the composed transform to ``pil_img`` and return the result."""
+        return self.transform(pil_img)
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: tuple[tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+        """Store the image settings and register the special tokens on ``tokenizer``.
+        Note that ``tokenizer`` is modified in place: its padding side is set
+        to "left" and special tokens are added to it.
+        Args:
+            tokenizer: tokenizer to wrap (mutated, see above).
+            candidate_resolutions: (width, height) pairs iterated by
+                ``select_best_resolution``; the first value of the first pair
+                becomes ``self.image_size``.
+            patch_size: stored as ``self.patch_size``.
+            downsample_ratio: stored as ``self.downsample_ratio``.
+            image_mean: mean passed to ``ImageTransform``.
+            image_std: std passed to ``ImageTransform``.
+            normalize: whether ``ImageTransform`` normalizes.
+            image_token: text tag of the image token; added to the tokenizer
+                when it is not already in the vocabulary.
+            pad_token: pad token added only when the tokenizer has none.
+            add_special_token: stored as ``self.add_special_token``.
+            sft_format: stored as ``self.sft_format``.
+            mask_prompt: stored as ``self.mask_prompt``.
+            ignore_id: stored as ``self.ignore_id``.
+            **kwargs: forwarded to ``ProcessorMixin.__init__``.
+        """
+        self.candidate_resolutions = candidate_resolutions
+        # side length of one tile, taken from the first candidate resolution
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer.padding_side = "left"  # must be set: the padding side makes a difference in batch inference
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        # look the id up again so it is also set when the token was just added
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+        # the base-class initializer runs last, after the tokenizer is prepared
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+    def select_best_resolution(self, image_size):
+        """Choose the candidate resolution that fits an image best (used for cropping).
+        The image is scaled, keeping its aspect ratio, to fit inside each
+        candidate. The candidate with the largest effective pixel count wins
+        (capped at the original pixel count); ties go to the candidate that
+        wastes the fewest pixels.
+        Args:
+            image_size: ``(width, height)`` of the original image.
+        Returns:
+            The chosen ``(width, height)`` candidate, or ``None`` when no
+            candidate was selected.
+        """
+        src_w, src_h = image_size
+        chosen = None
+        best_effective = 0
+        least_wasted = float("inf")
+        for cand_w, cand_h in self.candidate_resolutions:
+            # largest uniform scale that keeps the image inside the candidate
+            ratio = min(cand_w / src_w, cand_h / src_h)
+            scaled_area = int(src_w * ratio) * int(src_h * ratio)
+            # upscaling adds no information, so cap at the original area
+            effective = min(scaled_area, src_w * src_h)
+            wasted = cand_w * cand_h - effective
+            is_larger = effective > best_effective
+            is_tighter_tie = effective == best_effective and wasted < least_wasted
+            if is_larger or is_tighter_tie:
+                chosen = (cand_w, cand_h)
+                best_effective = effective
+                least_wasted = wasted
+        return chosen
+    @property
+    def bos_id(self):
+        """Return ``self.tokenizer.bos_token_id``."""
+        return self.tokenizer.bos_token_id
+    @property
+    def eos_id(self):
+        """Return ``self.tokenizer.eos_token_id``."""
+        return self.tokenizer.eos_token_id
+    @property
+    def pad_id(self):
+        """Return ``self.tokenizer.pad_token_id``."""
+        return self.tokenizer.pad_token_id
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        """Tokenize ``text`` without automatic special tokens, then wrap it on request.
+        Args:
+            text: the string to tokenize.
+            bos: prepend ``self.bos_id`` when True.
+            eos: append ``self.eos_id`` when True.
+        Returns:
+            The list of token ids.
+        """
+        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+        leading = [self.bos_id] if bos else []
+        trailing = [self.eos_id] if eos else []
+        return leading + token_ids + trailing
+    def decode(self, t: list[int], **kwargs) -> str:
+        """Decode token ids ``t`` with the tokenizer; ``kwargs`` are passed through."""
+        return self.tokenizer.decode(t, **kwargs)
+    def process_one(
+        self,
+        prompt: str,
+        images: list[Image.Image],
+        inference_mode: bool = True,
+        **kwargs: Any,
+    ):
+        """Tokenize one prompt together with its images and pack the model inputs.
+        Args:
+            prompt (str): the formatted prompt; it must contain one image
+                token per entry in ``images``.
+            images (list[Image.Image]): the images referenced by the prompt;
+                tiled cropping is enabled only when there are at most two.
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs: accepted but not used by this method.
+        Returns:
+            BatchFeature: created with ``tensor_type="pt"`` from
+                - input_ids: token ids with a leading batch dim, [1, N + image tokens]
+                - pixel_values: stacked global and local views, [n_patches, 3, H, W];
+                  a single all-zero view when there are no images
+                - images_seq_mask: bool mask over the sequence (no batch dim),
+                  True at image-token positions
+                - images_spatial_crop: [num_width_tiles, num_height_tiles] per image;
+                  a single all-zero row when there are no images
+                - num_image_tokens: the number of image tokens per image
+        """
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )
+        (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        ) = self.tokenize_with_images(
+            prompt, images, bos=True, eos=True, cropping=len(images) <= 2
+        )
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenized_str's length {len(tokenized_str)} and "
+            f"images_seq_mask's length {len(images_seq_mask)} are not equal"
+        )
+        input_ids = torch.LongTensor(tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+        # replace any negative id with the pad id
+        input_ids[input_ids < 0] = self.pad_id
+        if inference_mode:
+            # Remove the ending eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+        if images_list:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+        else:
+            # no images: emit one all-zero view and one all-zero crop entry
+            pixel_values = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        # only input_ids gets a batch dimension; the mask stays 1-D
+        input_ids = input_ids.unsqueeze(0)
+        return BatchFeature(
+            data=dict(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                images_seq_mask=images_seq_mask,
+                images_spatial_crop=images_spatial_crop,
+                num_image_tokens=num_image_tokens,
+            ),
+            tensor_type="pt",
+        )
+    def __call__(
+        self,
+        *,
+        text: str,
+        images: list[Image.Image],
+        inference_mode: bool = True,
+        **kwargs: Any,
+    ):
+        """Keyword-only entry point that delegates to ``process_one``.
+        Args:
+            text (str): the formatted prompt, passed on as ``prompt``;
+            images (list[Image.Image]): the list of images;
+            inference_mode (bool): if True, then remove the last eos token;
+            **kwargs: accepted but not passed on to ``process_one``.
+        Returns:
+            Whatever ``process_one`` returns for these arguments.
+        """
+        return self.process_one(
+            prompt=text,
+            images=images,
+            inference_mode=inference_mode,
+        )
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: list[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags, expanding each tag into image tokens.
+        Args:
+            conversation: text containing exactly ``len(images)`` image tags.
+            images: one PIL image per tag, in order of appearance.
+            bos: prepend ``self.bos_id`` to the final sequence.
+            eos: append ``self.eos_id`` to the final sequence.
+            cropping: pick a tiled resolution with ``select_best_resolution``;
+                otherwise use a single ``image_size`` x ``image_size`` tile.
+        Returns:
+            A 5-tuple ``(tokenized_str, images_list, images_seq_mask,
+            images_spatial_crop, num_image_tokens)``: the token ids, the
+            transformed global/local views, a per-token flag that is True at
+            image-token positions, ``[num_width_tiles, num_height_tiles]``
+            per image, and the image-token count per image.
+        """
+        assert conversation.count(self.image_token) == len(images)
+        segments = conversation.split(self.image_token)
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        num_image_tokens = []
+        tokenized_str = []
+        for segment, image in zip(segments, images):
+            # text that precedes this image
+            segment_ids = self.encode(segment, bos=False, eos=False)
+            tokenized_str.extend(segment_ids)
+            images_seq_mask.extend([False] * len(segment_ids))
+            # select best resolution for anyres, or fall back to one tile
+            if cropping:
+                best_width, best_height = self.select_best_resolution(image.size)
+            else:
+                best_width = best_height = self.image_size
+            tile = self.image_size
+            fill = tuple(int(x * 255) for x in self.image_transform.mean)
+            # global view: the whole image padded into a single tile
+            global_view = ImageOps.pad(image, (tile, tile), color=fill)
+            images_list.append(self.image_transform(global_view))
+            # local views: pad to the chosen resolution, then cut tiles row by row
+            local_view = ImageOps.pad(image, (best_width, best_height), color=fill)
+            images_list.extend(
+                self.image_transform(
+                    local_view.crop((left, top, left + tile, top + tile))
+                )
+                for top in range(0, best_height, tile)
+                for left in range(0, best_width, tile)
+            )
+            # record width / height crop num
+            num_width_tiles = best_width // tile
+            num_height_tiles = best_height // tile
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+            # side length of the image-token grid produced for one tile
+            h = w = math.ceil((tile // self.patch_size) / self.downsample_ratio)
+            # global view tokens h * (w + 1), 1 is for line separator
+            global_tokens = h * (w + 1)
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)
+            # the extra 1 is the separator between global and local views
+            image_token_count = global_tokens + 1 + local_tokens
+            tokenized_str.extend([self.image_token_id] * image_token_count)
+            images_seq_mask.extend([True] * image_token_count)
+            num_image_tokens.append(image_token_count)
+        # process the last text split
+        tail_ids = self.encode(segments[-1], bos=False, eos=False)
+        tokenized_str.extend(tail_ids)
+        images_seq_mask.extend([False] * len(tail_ids))
+        # add the bos and eos tokens; neither is an image position
+        if bos:
+            tokenized_str.insert(0, self.bos_id)
+            images_seq_mask.insert(0, False)
+        if eos:
+            tokenized_str.append(self.eos_id)
+            images_seq_mask.append(False)
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        )
+        return (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        )
+# Register the class with AutoProcessor under the name "DeepseekVLV2Processor".
+AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

vllm/transformers_utils/processors/hunyuan_vl.py ADDED Viewed

@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# adapted from https://github.com/ManaEstras/transformers/blob/v4.57.1.hyvl/src/transformers/models/hunyuan_vl/processing_hunyuan_vl.py
+import numpy as np
+import torch
+from transformers import AutoProcessor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.video_utils import VideoInput
+class HunYuanVLProcessor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"  # ("AutoTokenizer", None)
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        video_processor=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        """Cache special-token ids and strings, then initialise ``ProcessorMixin``.
+        Args:
+            image_processor: forwarded to ``ProcessorMixin.__init__``.
+            tokenizer: used right away to map the hard-coded ids below to
+                token strings, so leaving it as ``None`` fails on the first
+                ``convert_ids_to_tokens`` call.
+            video_processor: forwarded to ``ProcessorMixin.__init__``.
+            chat_template: forwarded to ``ProcessorMixin.__init__``.
+            **kwargs: accepted but not used in this method.
+        """
+        # TODO Fix the init
+        self.tokenizer = tokenizer
+        # NOTE(review): the ids below are hard-coded; the trailing comments name
+        # the tokenizer attributes they presumably mirror — confirm against the
+        # checkpoint's tokenizer.
+        self.image_token_id = 120120  # self.tokenizer.image_token_id
+        self.image_token = self.tokenizer.convert_ids_to_tokens(self.image_token_id)
+        self.im_start_token_id = 120118  # self.tokenizer.im_start_id
+        self.im_start_token = self.tokenizer.convert_ids_to_tokens(
+            self.im_start_token_id
+        )
+        self.im_end_token_id = 120119  # self.tokenizer.im_end_id
+        self.im_end_token = self.tokenizer.convert_ids_to_tokens(self.im_end_token_id)
+        # token string of the last vocabulary id; __call__ uses it as a
+        # temporary stand-in while expanding image tokens
+        self.placeholder_token = self.tokenizer.convert_ids_to_tokens(
+            self.tokenizer.vocab_size - 1
+        )
+        self.pad_id = 120002  # self.tokenizer.pad_token_id
+        super().__init__(
+            image_processor, tokenizer, video_processor, chat_template=chat_template
+        )
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: TextInput
+        | PreTokenizedInput
+        | list[TextInput]
+        | list[PreTokenizedInput] = None,
+        videos: VideoInput = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """Expand image tokens in ``text``, tokenize, and build 4-row position ids.
+        Args:
+            images: when not None, passed to ``self.image_processor``; the
+                result must provide ``"image_grid_thw"``.
+            text: one prompt or a list of prompts; a non-list value is wrapped
+                in a list.
+            videos: not used by this method.
+            **kwargs: forwarded to the tokenizer. ``return_tensors`` is also
+                read from it and used as the ``BatchFeature`` tensor type.
+        Returns:
+            BatchFeature holding the tokenizer outputs plus ``position_ids``
+            (shape [1, 4, len(input_ids[0])]), ``attention_mask``,
+            ``imgs_pos`` and, when images were given, the image processor
+            outputs.
+        """
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images)
+            image_grid_thw = image_inputs["image_grid_thw"]
+        if not isinstance(text, list):
+            text = [text]
+        text = text.copy()  # below lines change text in-place
+        # running total of image tokens; entry i is the offset (counted in
+        # image tokens) at which image i starts
+        image_tokens_cumsum = [0]
+        if images is not None:
+            index = 0
+            for i in range(len(text)):
+                while self.image_token in text[i]:
+                    # the last two entries of the grid are read as height and width
+                    grid_h, grid_w = image_grid_thw[index][-2:]
+                    patch_h = grid_h // self.image_processor.merge_size
+                    patch_w = grid_w // self.image_processor.merge_size
+                    # patch_h rows of (patch_w + 1) tokens, plus 2 extra tokens
+                    num_image_tokens = patch_h * (patch_w + 1) + 2
+                    image_tokens_cumsum.append(
+                        image_tokens_cumsum[-1] + num_image_tokens
+                    )
+                    # text[i] = text[i].replace(self.image_token, self.im_start_token + self.placeholder_token * num_image_tokens + self.im_end_token, 1) # noqa: E501
+                    # expand through a placeholder so the `while` test above
+                    # does not match the tokens that were just inserted
+                    text[i] = text[i].replace(
+                        self.image_token, self.placeholder_token * num_image_tokens, 1
+                    )
+                    index += 1
+                # turn every placeholder back into the real image token
+                text[i] = text[i].replace(self.placeholder_token, self.image_token)
+                # text[i] = self.tokenizer.bos_token + text[i]
+        # kwargs still contains return_tensors here, so the tokenizer gets it too
+        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+        input_ids = text_inputs["input_ids"]
+        # NOTE(review): only input_ids[0] is used for the position ids, so this
+        # presumably expects a single sequence per call — confirm with callers
+        position_ids = torch.arange(len(input_ids[0]))
+        position_ids_w = torch.arange(len(input_ids[0]))
+        position_ids_h = torch.arange(len(input_ids[0]))
+        position_ids_t = torch.arange(len(input_ids[0]))
+        if images is not None:
+            # NOTE(review): this comparison and input_ids.ne(...) further down
+            # presumably need tensor input_ids (e.g. return_tensors="pt") — confirm
+            image_token_pos_indices = torch.where(input_ids[0] == self.image_token_id)[
+                0
+            ]
+            for i in range(len(image_grid_thw)):
+                grid_h, grid_w = image_grid_thw[i][-2:]
+                patch_h = grid_h // self.image_processor.merge_size
+                patch_w = grid_w // self.image_processor.merge_size
+                # skip the first token of this image's run; replace_num is 2
+                # less than the run length, so the last token is left alone too
+                start_pos = image_token_pos_indices[image_tokens_cumsum[i]].item() + 1
+                replace_num = (patch_w + 1) * patch_h
+                # width row: column index 0..patch_w repeated for every row
+                position_ids_w[start_pos : start_pos + replace_num] = torch.tensor(
+                    list(range(patch_w + 1)) * patch_h, dtype=torch.int64
+                )
+                # height row: row index h repeated (patch_w + 1) times
+                patch_h_list = []
+                for h in range(patch_h):
+                    patch_h_list += [h] * (patch_w + 1)
+                position_ids_h[start_pos : start_pos + replace_num] = torch.tensor(
+                    patch_h_list, dtype=torch.int64
+                )
+                position_ids_t[start_pos : start_pos + replace_num] = 0
+        # rows are stacked as (sequential, w, h, t), then a leading dim is added
+        position_ids = torch.stack(
+            [position_ids, position_ids_w, position_ids_h, position_ids_t]
+        ).unsqueeze(0)
+        text_inputs["position_ids"] = position_ids
+        # every position whose id differs from the hard-coded pad id is attended
+        attention_mask = input_ids.ne(self.pad_id)
+        text_inputs["attention_mask"] = attention_mask
+        # NOTE(review): input_ids is passed whole here although it is indexed
+        # with [0] above; get_imgs_pos keeps only the first np.where index
+        # array — confirm the dimensionality it should receive
+        text_inputs["imgs_pos"] = [self.get_imgs_pos(input_ids)]
+        # image_inputs["imgs"] = [[image_inputs["pixel_values"]]]
+        return_tensors = kwargs.pop("return_tensors", None)
+        return BatchFeature(
+            data={**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
+    def batch_decode(self, *args, **kwargs):
+        """Forward all arguments to ``self.tokenizer.batch_decode`` and return its result."""
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    def decode(self, *args, **kwargs):
+        """Forward all arguments to ``self.tokenizer.decode`` and return its result."""
+        return self.tokenizer.decode(*args, **kwargs)
+    def post_process_image_text_to_text(
+        self,
+        generated_outputs,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        """Not implemented for this processor; none of the arguments are used.
+        Raises:
+            NotImplementedError: always. An explicit raise is used instead of
+                ``assert 0`` because assertions are removed under
+                ``python -O``, which would make this method return ``None``
+                silently.
+        """
+        raise NotImplementedError(
+            "HunYuanVLProcessor does not implement post_process_image_text_to_text"
+        )
+    def apply_chat_template(self, *args, **kwargs):
+        """Return the result of ``self.tokenizer.apply_chat_template`` for the same arguments."""
+        return self.tokenizer.apply_chat_template(*args, **kwargs)
+    def get_imgs_pos(self, doc_ids):
+        """Pair up image start/end token occurrences in ``doc_ids``.
+        Args:
+            doc_ids: token ids, converted to an int64 numpy array.
+        Returns:
+            A list of ``[start + 1, end]`` pairs: the k-th pair combines the
+            k-th occurrence of ``im_start_token_id`` (shifted by one) with the
+            k-th occurrence of ``im_end_token_id``.
+        """
+        ids = np.array(doc_ids, dtype=np.int64)
+        # only the first index array of np.where is kept, i.e. positions for 1-D input
+        starts = np.where(ids == self.im_start_token_id)[0] + 1
+        ends = np.where(ids == self.im_end_token_id)[0]
+        return np.column_stack((starts, ends)).tolist()
+    @property
+    def model_input_names(self):
+        """Tokenizer input names followed by image-processor input names, without duplicates."""
+        ordered_unique = {}
+        combined = (
+            self.tokenizer.model_input_names + self.image_processor.model_input_names
+        )
+        for input_name in combined:
+            # a dict keeps first-insertion order, so the first occurrence wins
+            ordered_unique.setdefault(input_name, None)
+        return list(ordered_unique)
+def split_image_into_patch_blocks(
+    pixel_values: torch.Tensor,  # shape: [batch_size, 3, H, W]
+    patch_size: int = 16,  # e.g. 16
+    adaptor_patch_div: int = 4,  # e.g. 4 --> each patch side is split into patch_size // 4 groups of 4 pixels # noqa: E501
+) -> torch.Tensor:
+    """
+    Regroup an image batch into one output entry per `patch_size` x `patch_size` patch.
+    Each patch is cut into (patch_size // adaptor_patch_div)^2 sub-regions of
+    adaptor_patch_div x adaptor_patch_div pixels. Inside one output entry the
+    values are laid out sub-region by sub-region (region row, region column,
+    channel, pixel row, pixel column), and that flat layout is then viewed as
+    [3, patch_size, patch_size]. An output entry is therefore NOT a plain
+    spatial crop of the input.
+    Args:
+        pixel_values: Input image tensor of shape [batch_size, 3, H, W].
+        patch_size: Size of the large patch, e.g., 16. H and W must be
+                    divisible by it.
+        adaptor_patch_div: Side length, in pixels, of each sub-region; the
+                          reshape below fails unless it divides patch_size.
+    Returns:
+        patches: A tensor of shape [N, 3, patch_size, patch_size],
+                 where N = batch_size * (H // patch_size) * (W // patch_size).
+    Raises:
+        AssertionError: if dim 1 is not 3, or H / W is not divisible by patch_size.
+    """  # noqa: E501
+    batch_size, channels, height, width = pixel_values.shape
+    assert channels == 3, "Pixel values must have 3 channels in dim=1"
+    assert height % patch_size == 0 and width % patch_size == 0, (
+        "H and W must be divisible by patch_size"
+    )
+    patch_height_num = height // patch_size
+    patch_width_num = width // patch_size
+    regions_per_side = patch_size // adaptor_patch_div  # ps // aps
+    # One reshape is enough: reshape keeps the row-major element order, so an
+    # intermediate [B, 3, ph, ps, pw, ps] step would not change the result.
+    # Shape: [B, 3, ph, ps//aps, aps, pw, ps//aps, aps]
+    img = pixel_values.reshape(
+        batch_size,
+        channels,
+        patch_height_num,
+        regions_per_side,
+        adaptor_patch_div,
+        patch_width_num,
+        regions_per_side,
+        adaptor_patch_div,
+    )
+    # Permute to group the small regions: [B, ph, pw, ps//aps, ps//aps, 3, aps, aps]
+    img = img.permute(0, 2, 5, 3, 6, 1, 4, 7)
+    # Flatten to [B * ph * pw, 3, patch_size, patch_size]
+    return img.reshape(-1, channels, patch_size, patch_size)
+# Register the class with AutoProcessor under the name "HunYuanVLProcessor".
+AutoProcessor.register("HunYuanVLProcessor", HunYuanVLProcessor)