paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +11 -59
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +7 -1
- paddlex/inference/models/formula_recognition/processors.py +92 -79
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +19 -16
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +8 -1
- paddlex/inference/utils/hpi_model_info_collection.json +81 -2
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/official_models.py +14 -0
- paddlex/inference/utils/pp_option.py +29 -8
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +2 -2
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +1 -1
- paddlex/utils/device.py +15 -8
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +2 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py:

```diff
@@ -15,7 +15,6 @@
 import math
 import os
 from dataclasses import dataclass
-from functools import partial
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import paddle
@@ -1983,74 +1982,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @classmethod
-    def _get_tensor_parallel_mappings(cls, config: Qwen2VLConfig, is_split=True):
-
-        logging.info("Qwen2 inference model _get_tensor_parallel_mappings")
-
-        from paddlenlp.transformers.conversion_utils import split_or_merge_func
-
-        fn = split_or_merge_func(
-            is_split=is_split,
-            tensor_parallel_degree=config.tensor_parallel_degree,
-            tensor_parallel_rank=config.tensor_parallel_rank,
-            num_attention_heads=config.num_attention_heads,
-        )
-
-        def get_tensor_parallel_split_mappings(num_layers):
-            final_actions = {}
-
-            base_actions = {
-                "lm_head.weight": partial(fn, is_column=True),
-                # Row Linear
-                "embed_tokens.weight": partial(fn, is_column=False),
-                "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
-                "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
-            }
-
-            base_actions["layers.0.self_attn.q_proj.weight"] = partial(
-                fn, is_column=True
-            )
-            base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True)
-            # if we have enough num_key_value_heads to split, then split it.
-            if config.num_key_value_heads % config.tensor_parallel_degree == 0:
-                base_actions["layers.0.self_attn.k_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.v_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.k_proj.bias"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.self_attn.v_proj.bias"] = partial(
-                    fn, is_column=True
-                )
-
-            if config.fuse_attention_ffn:
-                base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial(
-                    fn, is_column=True, is_naive_2fuse=True
-                )
-            else:
-                base_actions["layers.0.mlp.gate_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-                base_actions["layers.0.mlp.up_proj.weight"] = partial(
-                    fn, is_column=True
-                )
-
-            for key, action in base_actions.items():
-                if "layers.0." in key:
-                    for i in range(num_layers):
-                        final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
-                final_actions[key] = action
-
-            return final_actions
-
-        mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
-
-        return mappings
-
     @staticmethod
     def get_rope_index(
         spatial_merge_size,
```
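The removed `_get_tensor_parallel_mappings` helper generated one split/merge action per transformer layer by expanding `layers.0.*` template keys. A minimal standalone sketch of that expansion, with a stand-in `fn` in place of paddlenlp's `split_or_merge_func` (not reproduced here):

```python
from functools import partial

def fn(weight, *, is_column):
    # Stand-in for the callable returned by paddlenlp's split_or_merge_func;
    # the real one shards or merges a weight tensor across tensor-parallel ranks.
    return ("column" if is_column else "row", weight)

base_actions = {
    "lm_head.weight": partial(fn, is_column=True),
    "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
}

num_layers = 3  # config.num_hidden_layers in the removed code
final_actions = {}
for key, action in base_actions.items():
    if "layers.0." in key:
        for i in range(num_layers):
            final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
    final_actions[key] = action

# One action per layer plus the non-layer keys:
# layers.0/1/2.self_attn.o_proj.weight and lm_head.weight
print(sorted(final_actions))
```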
paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py (continued):

```diff
@@ -2276,42 +2207,6 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel):
 
         return model_kwargs
 
-    def vision_forward(
-        self,
-        input_ids: paddle.Tensor,
-        inputs_embeds: Optional[paddle.Tensor] = None,
-        attention_mask: Optional[paddle.Tensor] = None,
-        position_ids: Optional[paddle.Tensor] = None,
-        pixel_values: Optional[paddle.Tensor] = None,
-        pixel_values_videos: Optional[paddle.Tensor] = None,
-        image_grid_thw: Optional[paddle.Tensor] = None,
-        video_grid_thw: Optional[paddle.Tensor] = None,
-        rope_deltas: Optional[paddle.Tensor] = None,
-    ):
-
-        if inputs_embeds is None:
-            from paddlenlp.experimental.transformers.qwen2.modeling import (
-                Qwen2VLForConditionalGenerationBlockInferenceModel,
-            )
-
-            assert isinstance(
-                self.model, Qwen2VLForConditionalGenerationBlockInferenceModel
-            ), "model is not an instance of Qwen2VLForConditionalGenerationBlockInferenceModel"
-
-            inputs_embeds = self.model.qwen2.embed_tokens(input_ids)
-            if pixel_values is not None:
-                pixel_values = paddle.cast(pixel_values, paddle.bfloat16)
-                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
-                image_mask = input_ids == self.config.image_token_id
-
-                inputs_embeds[image_mask] = image_embeds
-            if pixel_values_videos is not None:
-                pixel_values_videos = paddle.cast(pixel_values_videos, paddle.bfloat16)
-                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
-                video_mask = input_ids == self.config.video_token_id
-                inputs_embeds[video_mask] = video_embeds
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: paddle.Tensor = None,
```
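The heart of the removed `vision_forward` is a boolean-mask scatter: embedding rows at image/video placeholder positions are overwritten with the vision tower's output. A minimal NumPy sketch of that substitution (token id 5 is a hypothetical placeholder id):

```python
import numpy as np

hidden = 4
input_ids = np.array([101, 5, 5, 5, 102])   # three placeholder tokens (id 5, assumed)
inputs_embeds = np.zeros((len(input_ids), hidden))
image_embeds = np.ones((3, hidden))         # one row per placeholder, from the vision encoder

image_mask = input_ids == 5                 # cf. input_ids == self.config.image_token_id
inputs_embeds[image_mask] = image_embeds    # matching rows are replaced in sequence order

assert inputs_embeds[0].sum() == 0 and inputs_embeds[1:4].sum() == 3 * hidden
```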
paddlex/inference/models/doc_vlm/predictor.py:

```diff
@@ -14,6 +14,7 @@
 
 import copy
 import os
+import warnings
 from typing import List
 
 from ....modules.doc_vlm.model_list import MODELS
@@ -27,6 +28,11 @@ from .result import DocVLMResult
 class DocVLMPredictor(BasePredictor):
 
     entities = MODELS
+    model_group = {
+        "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
+        "PP-DocBee2": {"PP-DocBee2-3B"},
+        "PP-Chart2Table": {"PP-Chart2Table"},
+    }
 
     def __init__(self, *args, **kwargs):
         """Initializes DocVLMPredictor.
@@ -34,8 +40,17 @@ class DocVLMPredictor(BasePredictor):
             *args: Arbitrary positional arguments passed to the superclass.
             **kwargs: Arbitrary keyword arguments passed to the superclass.
         """
+        import paddle
+
         super().__init__(*args, **kwargs)
         self.device = kwargs.get("device", None)
+        self.dtype = (
+            "bfloat16"
+            if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
+            and (self.device is None or "cpu" not in self.device)
+            else "float32"
+        )
+
         self.infer, self.processor = self._build(**kwargs)
 
     def _build_batch_sampler(self):
```
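The new `self.dtype` expression prefers bfloat16 when the device type advertises it (or is an NPU) and the requested device is not CPU. A sketch of the same rule as a standalone function, assuming a working Paddle install; `device_type` stands in for PaddleX's `get_device_type()` helper:

```python
from typing import Optional

import paddle

def pick_dtype(device: Optional[str], device_type: str) -> str:
    # Mirrors the predictor's rule: bfloat16 on capable hardware, unless the
    # user explicitly asked for CPU; float32 otherwise.
    bf16_capable = "npu" in device_type or paddle.amp.is_bfloat16_supported()
    not_cpu = device is None or "cpu" not in device
    return "bfloat16" if bf16_capable and not_cpu else "float32"

print(pick_dtype(None, paddle.device.get_device()))
```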
paddlex/inference/models/doc_vlm/predictor.py (continued):

```diff
@@ -44,7 +59,7 @@ class DocVLMPredictor(BasePredictor):
         Returns:
             DocVLMBatchSampler: An instance of DocVLMBatchSampler.
         """
-        return DocVLMBatchSampler()
+        return DocVLMBatchSampler(self.model_name)
 
     def _get_result_class(self):
         """Returns the result class, DocVLMResult.
@@ -61,28 +76,49 @@ class DocVLMPredictor(BasePredictor):
             model: An instance of Paddle model, could be either a dynamic model or a static model.
             processor: The correspounding processor for the model.
         """
-        import paddle
+        from .modeling import (
+            PPChart2TableInference,
+            PPDocBee2Inference,
+            PPDocBeeInference,
+        )
 
-        from .modeling import PPDocBeeInference
+        # build processor
+        processor = self.build_processor()
 
         # build model
-        if "PP-DocBee" in self.model_name:
+        if self.model_name in self.model_group["PP-DocBee"]:
             if kwargs.get("use_hpip", False):
-                logging.warning(
-                    "The PP-DocBee series does not support `use_hpip=True` for now."
+                warnings.warn(
+                    "The PP-DocBee series does not support `use_hpip=True` for now."
+                )
+            with TemporaryDeviceChanger(self.device):
+                model = PPDocBeeInference.from_pretrained(
+                    self.model_dir, dtype=self.dtype
+                )
+        elif self.model_name in self.model_group["PP-Chart2Table"]:
+            if kwargs.get("use_hpip", False):
+                warnings.warn(
+                    "The PP-Chart2Table series does not support `use_hpip=True` for now."
                 )
-            dtype = (
-                "bfloat16"
-                if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
-                else "float32"
-            )
             with TemporaryDeviceChanger(self.device):
-                model = PPDocBeeInference.from_pretrained(self.model_dir, dtype=dtype)
+                model = PPChart2TableInference.from_pretrained(
+                    self.model_dir,
+                    dtype=self.dtype,
+                    pad_token_id=processor.tokenizer.eos_token_id,
+                )
+        elif self.model_name in self.model_group["PP-DocBee2"]:
+            if kwargs.get("use_hpip", False):
+                warnings.warn(
+                    "The PP-Chart2Table series does not support `use_hpip=True` for now."
+                )
+            with TemporaryDeviceChanger(self.device):
+                model = PPDocBee2Inference.from_pretrained(
+                    self.model_dir,
+                    dtype=self.dtype,
+                )
         else:
             raise NotImplementedError(f"Model {self.model_name} is not supported.")
 
-        # build processor
-        processor = self.build_processor()
         return model, processor
 
     def process(self, data: List[dict], **kwargs):
```
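`_build` now routes on exact membership in `model_group` instead of substring matching on the model name. A compact sketch of the dispatch; `resolve_group` is hypothetical (the shipped code uses an explicit if/elif chain):

```python
model_group = {
    "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
    "PP-DocBee2": {"PP-DocBee2-3B"},
    "PP-Chart2Table": {"PP-Chart2Table"},
}

def resolve_group(model_name: str) -> str:
    for group, members in model_group.items():
        if model_name in members:
            return group
    raise NotImplementedError(f"Model {model_name} is not supported.")

# Exact-set membership avoids the pitfall of the old substring test, where
# "PP-DocBee" in name would also match the newer "PP-DocBee2-3B".
assert resolve_group("PP-DocBee2-3B") == "PP-DocBee2"
assert resolve_group("PP-DocBee-7B") == "PP-DocBee"
```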
paddlex/inference/models/doc_vlm/predictor.py (continued):

```diff
@@ -96,15 +132,11 @@
         Returns:
             dict: A dictionary containing the raw sample information and prediction results for every instance of the batch.
         """
-        assert (
-            isinstance(data, List) and len(data) == 1
-        ), "data must be a list of length 1"
-        assert isinstance(data[0], dict)
+        assert all(isinstance(i, dict) for i in data)
 
-        data = data[0]
         src_data = copy.copy(data)
         # preprocess
-        data = self.processor.preprocess(**data)
+        data = self.processor.preprocess(data)
         data = self._switch_inputs_to_device(data)
 
         # do infer
@@ -118,15 +150,38 @@
         return result_dict
 
     def build_processor(self, **kwargs):
-        from ..common.tokenizer import MIXQwen2Tokenizer
-        from .processors import PPDocBeeProcessor, Qwen2VLImageProcessor
-
-        if "PP-DocBee" in self.model_name:
+        from ..common.tokenizer import (
+            MIXQwen2_5_Tokenizer,
+            MIXQwen2Tokenizer,
+            QWenTokenizer,
+        )
+        from .processors import (
+            GOTImageProcessor,
+            PPChart2TableProcessor,
+            PPDocBee2Processor,
+            PPDocBeeProcessor,
+            Qwen2_5_VLImageProcessor,
+            Qwen2VLImageProcessor,
+        )
+
+        if self.model_name in self.model_group["PP-DocBee"]:
             image_processor = Qwen2VLImageProcessor()
             tokenizer = MIXQwen2Tokenizer.from_pretrained(self.model_dir)
             return PPDocBeeProcessor(
                 image_processor=image_processor, tokenizer=tokenizer
             )
+        elif self.model_name in self.model_group["PP-Chart2Table"]:
+            image_processor = GOTImageProcessor(1024)
+            tokenizer = QWenTokenizer.from_pretrained(self.model_dir)
+            return PPChart2TableProcessor(
+                image_processor=image_processor, tokenizer=tokenizer, dtype=self.dtype
+            )
+        elif self.model_name in self.model_group["PP-DocBee2"]:
+            image_processor = Qwen2_5_VLImageProcessor()
+            tokenizer = MIXQwen2_5_Tokenizer.from_pretrained(self.model_dir)
+            return PPDocBee2Processor(
+                image_processor=image_processor, tokenizer=tokenizer
+            )
         else:
             raise NotImplementedError
 
```
paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py (new file):

```diff
@@ -0,0 +1,97 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Union
+
+import numpy as np
+import paddle
+import requests
+from paddle.vision import transforms
+from PIL import Image
+
+from ....utils.benchmark import benchmark
+
+MEAN = (0.48145466, 0.4578275, 0.40821073)
+STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+class GOTImageProcessor(object):
+    def __init__(self, image_size=1024):
+
+        self.transform = transforms.Compose(
+            [
+                transforms.Resize((image_size, image_size), interpolation="bicubic"),
+                transforms.ToTensor(),
+                transforms.Normalize(MEAN, STD),
+            ]
+        )
+
+    def __call__(self, image):
+        return self.transform(image)
+
+
+class PPChart2TableProcessor(object):
+    def __init__(self, image_processor, tokenizer, dtype, **kwargs):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        self.dtype = dtype
+
+        prompt = (
+            "<|im_start|>system\n"
+            "You should follow the instructions carefully and explain your answers in detail.<|im_end|><|im_start|>user\n"
+            "<img>" + "<imgpad>" * 256 + "</img>\n"
+            "Chart to table<|im_end|><|im_start|>assistant\n"
+        )
+        self.input_ids = paddle.to_tensor(self.tokenizer([prompt]).input_ids)
+
+    @benchmark.timeit
+    def preprocess(self, image: Union[str, Image.Image, np.ndarray, Dict, List]):
+        if isinstance(image, (str, Image.Image, np.ndarray)):
+            image = [image]
+        elif isinstance(image, dict):
+            image = [image["image"]]
+
+        assert isinstance(image, list)
+        images = [
+            image_["image"] if isinstance(image_, dict) else image_ for image_ in image
+        ]
+        images = [
+            self.image_processor(self._load_image(image)).unsqueeze(0).to(self.dtype)
+            for image in images
+        ]
+        img_cnt = len(images)
+
+        input_ids = paddle.tile(self.input_ids, [img_cnt, 1])
+
+        return {"input_ids": input_ids, "images": images}
+
+    @benchmark.timeit
+    def postprocess(self, model_pred, *args, **kwargs):
+        return self.tokenizer.batch_decode(
+            model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+    def _load_image(self, image_file):
+        from io import BytesIO
+
+        if isinstance(image_file, Image.Image):
+            image = image_file.convert("RGB")
+        elif isinstance(image_file, np.ndarray):
+            image = Image.fromarray(image_file)
+        elif image_file.startswith("http") or image_file.startswith("https"):
+            response = requests.get(image_file)
+            image = Image.open(BytesIO(response.content)).convert("RGB")
+        else:
+            image = Image.open(image_file).convert("RGB")
+        return image
```
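A hedged usage sketch for the new processor classes above (assumes paddlex 3.0.1 with Paddle installed; the dummy image and expected shape are illustrative):

```python
import numpy as np
from PIL import Image

from paddlex.inference.models.doc_vlm.processors import GOTImageProcessor

processor = GOTImageProcessor(image_size=1024)
dummy = Image.fromarray(np.zeros((600, 800, 3), dtype=np.uint8))
tensor = processor(dummy)  # bicubic resize to 1024x1024, ToTensor, CLIP-style normalization
print(tensor.shape)        # expected: [3, 1024, 1024]
```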
paddlex/inference/models/doc_vlm/processors/__init__.py:

```diff
@@ -12,4 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .GOT_ocr_2_0 import GOTImageProcessor, PPChart2TableProcessor
+from .qwen2_5_vl import PPDocBee2Processor, Qwen2_5_VLImageProcessor
 from .qwen2_vl import PPDocBeeProcessor, Qwen2VLImageProcessor
```
paddlex/inference/models/doc_vlm/processors/common.py:

```diff
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import base64
+import math
 from collections import UserDict
+from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import paddle
 import PIL.Image
+import requests
 from packaging import version
+from PIL import Image
 
 from ...common.tokenizer.tokenizer_utils_base import ExplicitEnum
 
@@ -370,3 +375,187 @@ class BatchFeature(UserDict):
         )
 
         return self
+
+
+class PaddingStrategy(ExplicitEnum):
+    """
+    Possible values for the `padding` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in an
+    IDE.
+    """
+
+    LONGEST = "longest"
+    MAX_LENGTH = "max_length"
+    DO_NOT_PAD = "do_not_pad"
+
+
+def extract_vision_info(
+    conversations: Union[List[dict], List[List[dict]]]
+) -> List[dict]:
+    vision_infos = []
+    if isinstance(conversations[0], dict):
+        conversations = [conversations]
+    for conversation in conversations:
+        for message in conversation:
+            if isinstance(message["content"], list):
+                for ele in message["content"]:
+                    if (
+                        "image" in ele
+                        or "image_url" in ele
+                        or ele["type"] in ("image", "image_url")
+                    ):
+                        vision_infos.append(ele)
+    return vision_infos
+
+
+def process_vision_info(
+    conversations: Union[List[dict], List[List[dict]]],
+) -> Tuple[
+    Union[List[Image.Image], None, List[Union[paddle.Tensor, List[Image.Image]]], None]
+]:
+    vision_infos = extract_vision_info(conversations)
+    image_inputs = []
+    for vision_info in vision_infos:
+        if "image" in vision_info or "image_url" in vision_info:
+            image_inputs.append(fetch_image(vision_info))
+        else:
+            raise ValueError("image, image_url should in content.")
+    if len(image_inputs) == 0:
+        image_inputs = None
+    return image_inputs
+
+
+def fetch_image(
+    ele: Dict[str, Union[str, Image.Image]],
+    size_factor: int,
+    min_pixels: int,
+    max_pixels: int,
+    max_ratio: float,
+) -> Image.Image:
+    if not isinstance(ele, dict):
+        ele = {"image": ele}
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, Image.Image):
+        image_obj = image
+    elif isinstance(image, np.ndarray):
+        image_obj = Image.fromarray(image)
+    elif image.startswith("http://") or image.startswith("https://"):
+        image_obj = Image.open(requests.get(image, stream=True).raw)
+    elif image.startswith("file://"):
+        image_obj = Image.open(image[7:])
+    elif image.startswith("data:image"):
+        data = image.split(";", 1)[1]
+        if data.startswith("base64,"):
+            data = base64.b64decode(data[7:])
+        image_obj = Image.open(BytesIO(data))
+    else:
+        image_obj = Image.open(image)
+    if image_obj is None:
+        raise ValueError(
+            f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
+        )
+    image = image_obj.convert("RGB")
+    # resize
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            max_ratio=max_ratio,
+        )
+    else:
+        width, height = image.size  # Image, not tensor
+        min_pixels = ele.get("min_pixels", min_pixels)
+        max_pixels = ele.get("max_pixels", max_pixels)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            max_ratio=max_ratio,
+        )
+    image = image.resize((resized_width, resized_height))
+
+    return image
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    """Returns the closest integer to 'number' that is divisible by 'factor'."""
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(
+    height: int,
+    width: int,
+    factor: int,
+    min_pixels: int,
+    max_pixels: int,
+    max_ratio: float,
+) -> Tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+
+    1. Both dimensions (height and width) are divisible by 'factor'.
+
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > max_ratio:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(height / beta, factor)
+        w_bar = floor_by_factor(width / beta, factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(height * beta, factor)
+        w_bar = ceil_by_factor(width * beta, factor)
+    return h_bar, w_bar
+
+
+def make_batched_images(images) -> List[List[ImageInput]]:
+    """
+    Accepts images in list or nested list format, and makes a list of images for preprocessing.
+
+    Args:
+        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
+            The input image.
+
+    Returns:
+        list: A list of images.
+    """
+    if (
+        isinstance(images, (list, tuple))
+        and isinstance(images[0], (list, tuple))
+        and is_valid_image(images[0][0])
+    ):
+        return [img for img_list in images for img in img_list]
+
+    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
+        return images
+
+    elif is_valid_image(images):
+        return [images]
+
+    raise ValueError(f"Could not make batched images from {images}")
```
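A worked example of the `smart_resize` arithmetic above, with assumed Qwen2-VL-style parameters (factor 28; pixel bounds chosen so neither rescaling branch fires):

```python
def round_by_factor(number: int, factor: int) -> int:
    return round(number / factor) * factor

height, width, factor = 1000, 700, 28
h_bar = max(factor, round_by_factor(height, factor))  # round(35.71) * 28 = 1008
w_bar = max(factor, round_by_factor(width, factor))   # 25 * 28 = 700
assert (h_bar, w_bar) == (1008, 700)
# 1008 * 700 = 705,600 pixels; with, e.g., min_pixels = 4 * 28 * 28 and
# max_pixels = 16384 * 28 * 28 (both assumed values), the product is in range,
# so neither the floor_by_factor nor the ceil_by_factor branch runs.
```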