PyPI - paddlex - Versions diffs - 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl - Mend

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (824) hide show

paddlex/inference/models/open_vocabulary_detection/predictor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,21 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Union, Dict, List, Tuple, Optional, Callable
-import numpy as np
 import inspect
+from typing import Any, Callable, Dict, List, Optional, Union
-from ....utils.func_register import FuncRegister
 from ....modules.open_vocabulary_detection.model_list import MODELS
+from ....utils.func_register import FuncRegister
 from ...common.batch_sampler import ImageBatchSampler
 from ...common.reader import ReadImage
-from .processors import GroundingDINOProcessor, GroundingDINOPostProcessor
-from ..common import StaticInfer
-from ..base import BasicPredictor
+from ..base import BasePredictor
 from ..object_detection.result import DetResult
+from .processors import (
+    GroundingDINOPostProcessor,
+    GroundingDINOProcessor,
+    YOLOWorldPostProcessor,
+    YOLOWorldProcessor,
+)
-class OVDetPredictor(BasicPredictor):
+class OVDetPredictor(BasePredictor):
     entities = MODELS
@@ -68,11 +71,7 @@ class OVDetPredictor(BasicPredictor):
                 pre_ops.append(op)
         # build infer
-        infer = StaticInfer(
-            model_dir=self.model_dir,
-            model_prefix=self.MODEL_FILE_PREFIX,
-            option=self.pp_option,
-        )
+        infer = self.create_static_infer()
         # build postprocess op
         post_op = self.build_postprocess(pre_ops=pre_ops)
@@ -97,7 +96,7 @@ class OVDetPredictor(BasicPredictor):
         image_paths = batch_data.input_paths
         src_images = self.pre_ops[0](batch_data.instances)
         datas = src_images
-        # preprocess
+        # preprocess for image only
         for pre_op in self.pre_ops[1:-1]:
             datas = pre_op(datas)
@@ -141,6 +140,10 @@ class OVDetPredictor(BasicPredictor):
                 box_threshold=self.config["box_threshold"],
                 text_threshold=self.config["text_threshold"],
             )
+        elif "YOLO-World" in self.model_name:
+            return YOLOWorldPostProcessor(
+                threshold=self.config["threshold"],
+            )
         else:
             raise NotImplementedError
@@ -153,3 +156,17 @@ class OVDetPredictor(BasicPredictor):
             text_max_words=text_max_words,
             target_size=target_size,
         )
+    @register("YOLOWorldProcessor")
+    def build_yoloworld_preprocessor(
+        self,
+        image_target_size=(640, 640),
+        image_mean=[0.0, 0.0, 0.0],
+        image_std=[1.0, 1.0, 1.0],
+    ):
+        return YOLOWorldProcessor(
+            model_dir=self.model_dir,
+            image_target_size=image_target_size,
+            image_mean=image_mean,
+            image_std=image_std,
+        )

paddlex/inference/models/open_vocabulary_detection/processors/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,4 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .groundingdino_processors import GroundingDINOProcessor, GroundingDINOPostProcessor
+from .groundingdino_processors import GroundingDINOPostProcessor, GroundingDINOProcessor
+from .yoloworld_processors import YOLOWorldPostProcessor, YOLOWorldProcessor

paddlex/inference/models/open_vocabulary_detection/processors/common.py ADDED Viewed

@@ -0,0 +1,114 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, List, Tuple
+import numpy as np
+from .....utils.deps import class_requires_deps, is_dep_available
+if is_dep_available("opencv-contrib-python"):
+    import cv2
+@class_requires_deps("opencv-contrib-python")
+class LetterResize(object):
+    def __init__(
+        self,
+        scale=[640, 640],
+        pad_val=144,
+        use_mini_pad=False,
+        stretch_only=False,
+        allow_scale_up=False,
+    ):
+        super(LetterResize, self).__init__()
+        self.scale = scale
+        self.pad_val = pad_val
+        self.use_mini_pad = use_mini_pad
+        self.stretch_only = stretch_only
+        self.allow_scale_up = allow_scale_up
+    def _resize_img(self, image: np.ndarray) -> Dict:
+        scale = self.scale
+        image_shape = image.shape[:2]
+        ratio = min(scale[0] / image_shape[0], scale[1] / image_shape[1])
+        if not self.allow_scale_up:
+            ratio = min(ratio, 1.0)
+        ratio = [ratio, ratio]
+        no_pad_shape = (
+            int(round(image_shape[0] * ratio[0])),
+            int(round(image_shape[1] * ratio[1])),
+        )
+        padding_h, padding_w = [scale[0] - no_pad_shape[0], scale[1] - no_pad_shape[1]]
+        if self.use_mini_pad:
+            padding_w, padding_h = np.mod(padding_w, 32), np.mod(padding_h, 32)
+        elif self.stretch_only:
+            padding_h, padding_w = 0.0, 0.0
+            no_pad_shape = (scale[0], scale[1])
+            ratio = [scale[0] / image_shape[0], scale[1] / image_shape[1]]
+        if image_shape != no_pad_shape:
+            image = cv2.resize(
+                image,
+                (no_pad_shape[1], no_pad_shape[0]),
+                interpolation=cv2.INTER_LINEAR,
+            )
+        scale_factor = (
+            no_pad_shape[1] / image_shape[1],
+            no_pad_shape[0] / image_shape[0],
+        )
+        top_padding, left_padding = int(round(padding_h // 2 - 0.1)), int(
+            round(padding_w // 2 - 0.1)
+        )
+        bottom_padding = padding_h - top_padding
+        right_padding = padding_w - left_padding
+        padding_list = [top_padding, bottom_padding, left_padding, right_padding]
+        if (
+            top_padding != 0
+            or bottom_padding != 0
+            or left_padding != 0
+            or right_padding != 0
+        ):
+            pad_val = self.pad_val
+            if isinstance(pad_val, int) and image.ndim == 3:
+                pad_val = tuple(pad_val for _ in range(image.shape[2]))
+            top, bottom, left, right = padding_list
+            image = cv2.copyMakeBorder(
+                image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=pad_val
+            )
+        result = dict()
+        result["image"] = image
+        result["scale_factor"] = np.array(scale_factor, dtype=np.float32)
+        result["pad_param"] = np.array(padding_list, dtype=np.float32)
+        return result
+    def __call__(self, images: List[np.ndarray]) -> List[Dict]:
+        if not isinstance(images, (List, Tuple)):
+            images = [images]
+        rst_images = [self._resize_img(image) for image in images]
+        return rst_images

paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,18 +13,13 @@
 # limitations under the License.
 import os
-from typing import Dict, List, Optional, Union, Tuple
+from typing import Dict, List, Optional, Tuple, Union
 import numpy as np
 import PIL
+from ....utils.benchmark import benchmark
 from ...common.tokenizer.bert_tokenizer import BertTokenizer
-from .....utils.lazy_loader import LazyLoader
-# NOTE: LazyLoader is used to avoid conflicts between ultra-infer and Paddle
-paddle = LazyLoader("lazy_paddle", globals(), "paddle")
-T = LazyLoader("T", globals(), "paddle.vision.transforms")
-F = LazyLoader("F", globals(), "paddle.nn.functional")
 def _max_by_axis(the_list):
@@ -97,6 +92,7 @@ def _text_pad_batch_data(
     return return_list if len(return_list) > 1 else return_list[0]
+@benchmark.timeit
 class GroundingDINOPostProcessor(object):
     """PostProcessors for GroundingDINO"""
@@ -127,6 +123,7 @@ class GroundingDINOPostProcessor(object):
         text_threshold=None,
         **kwargs,
     ):
+        import paddle
         box_threshold = self.box_threshold if box_threshold is None else box_threshold
         text_threshold = (
@@ -166,6 +163,8 @@ class GroundingDINOPostProcessor(object):
         text_threshold,
     ):
         """Post Process for prediction result of single image."""
+        import paddle
+        import paddle.nn.functional as F
         logits = F.sigmoid(pred_logits)
         boxes = pred_boxes
@@ -206,6 +205,7 @@ class GroundingDINOPostProcessor(object):
             raise NotImplementedError("posmap must be 1-dim")
+@benchmark.timeit
 class GroundingDINOProcessor(object):
     """Image and Text Processors for GroundingDINO"""
@@ -261,6 +261,7 @@ class GroundingDINOProcessor(object):
         return [arr.numpy() for arr in paddle_rst]
+@benchmark.timeit
 class GroundingDinoTextProcessor(object):
     """Constructs a GroundingDino text processor."""
@@ -276,6 +277,8 @@ class GroundingDinoTextProcessor(object):
         special_tokens_list,
     ):
         """Preprocess the text with tokenization."""
+        import paddle
         tokenized_out = {}
         input_ids = _text_pad_batch_data(input_ids)
         input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64).squeeze(-1)
@@ -321,6 +324,8 @@ class GroundingDinoTextProcessor(object):
         Returns:
             torch.Tensor: attention mask between each special tokens.
         """
+        import paddle
         input_ids = tokenized["input_ids"]
         bs, num_token = input_ids.shape
         special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool)
@@ -363,6 +368,7 @@ class GroundingDinoTextProcessor(object):
         return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list
+@benchmark.timeit
 class GroundingDinoImageProcessor(object):
     """Constructs a GroundingDino image processor."""
@@ -393,6 +399,7 @@ class GroundingDinoImageProcessor(object):
     def resize(self, image, size=None, max_size=1333):
         """Officially aligned Image resize."""
+        import paddle.vision.transforms as T
         def get_size_with_aspect_ratio(image_size, size, max_size=None):
             w, h = image_size
@@ -426,6 +433,8 @@ class GroundingDinoImageProcessor(object):
         return rescaled_image
     def nested_tensor_from_tensor_list(self, tensor_list):
+        import paddle
         if tensor_list[0].ndim == 3:
             max_size = _max_by_axis([list(img.shape) for img in tensor_list])
             batch_shape = [len(tensor_list)] + max_size
@@ -455,6 +464,8 @@ class GroundingDinoImageProcessor(object):
         **kwargs,
     ):
         """Preprocess an image or batch of images."""
+        import paddle.vision.transforms as T
         do_resize = do_resize if do_resize is not None else self.do_resize
         do_normalize = do_normalize if do_normalize is not None else self.do_normalize
         do_nested = do_nested if do_nested is not None else self.do_nested

paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py ADDED Viewed

@@ -0,0 +1,209 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import List, Tuple, Union
+import numpy as np
+from ...common.tokenizer.clip_tokenizer import CLIPTokenizer
+from .common import LetterResize
+class YOLOWorldProcessor(object):
+    """Image and Text Processors for YOLO-World"""
+    def __init__(
+        self,
+        model_dir,
+        image_target_size: Union[Tuple[int], int] = (640, 640),
+        image_mean: Union[float, List[float]] = [0.0, 0.0, 0.0],
+        image_std: Union[float, List[float]] = [1.0, 1.0, 1.0],
+        **kwargs,
+    ):
+        if isinstance(image_target_size, int):
+            image_target_size = (image_target_size, image_target_size)
+        if isinstance(image_mean, float):
+            image_mean = [image_mean, image_mean, image_mean]
+        if isinstance(image_std, float):
+            image_std = [image_std, image_std, image_std]
+        self.image_target_size = image_target_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        tokenizer_dir = os.path.join(model_dir, "tokenizer")
+        assert os.path.isdir(tokenizer_dir), f"{tokenizer_dir} not exists."
+        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_dir)
+        self.resize_op = LetterResize(self.image_target_size, allow_scale_up=True)
+        if isinstance(image_mean, (tuple, list)):
+            self.image_mean = np.array(image_mean)
+        if self.image_mean.ndim < 4:
+            self.image_mean = self.image_mean.reshape(1, -1, 1, 1)
+        if isinstance(image_std, (tuple, list)):
+            self.image_std = np.array(image_std)
+        if self.image_std.ndim < 4:
+            self.image_std = self.image_std.reshape(1, -1, 1, 1)
+    def __call__(
+        self,
+        images: List[np.ndarray],
+        text: str,
+        **kwargs,
+    ):
+        preprocess_results = self.process_image(images)
+        preprocess_results.update(self.process_text(text))
+        static_input_orders = [
+            "attention_mask",
+            "image",
+            "input_ids",
+            "pad_param",
+            "scale_factor",
+        ]
+        result = [preprocess_results[k] for k in static_input_orders]
+        return result
+    def process_image(self, images):
+        """Image preprocess for YOLO-World"""
+        rescaled_images = self.resize_op(images)
+        images = np.stack(
+            [rescaled_image["image"] for rescaled_image in rescaled_images], axis=0
+        )
+        scale_factors = np.stack(
+            [rescaled_image["scale_factor"] for rescaled_image in rescaled_images],
+            axis=0,
+        )
+        pad_params = np.stack(
+            [rescaled_image["pad_param"] for rescaled_image in rescaled_images], axis=0
+        )
+        images = np.transpose(images, (0, 3, 1, 2)).astype(np.float32) / 255.0
+        images -= self.image_mean
+        images /= self.image_std
+        image_results = {
+            "image": images,
+            "scale_factor": scale_factors,
+            "pad_param": pad_params[:, [3, 0]],
+        }
+        return image_results
+    def process_text(self, text):
+        text = text.strip().lower()
+        words = [word.strip() for word in text.split(",")]
+        words += [" "]
+        tokenized_text = self.tokenizer(text=words, return_tensors="pd", padding=True)
+        text_results = {
+            "input_ids": tokenized_text["input_ids"].numpy(),
+            "attention_mask": tokenized_text["attention_mask"].numpy(),
+        }
+        return text_results
+class YOLOWorldPostProcessor(object):
+    """PostProcessors for YOLO-World"""
+    def __init__(
+        self,
+        threshold: float = 0.05,
+        **kwargs,
+    ):
+        """Init Function for YOLO-World PostProcessor
+        Args:
+            threshold (float): threshold for low confidence bbox filtering.
+        """
+        self.threshold = threshold
+    def __call__(
+        self,
+        pred_boxes,
+        pred_nums,
+        prompt,
+        src_images,
+        threshold=None,
+        **kwargs,
+    ):
+        threshold = self.threshold if threshold is None else threshold
+        split_index = np.cumsum(pred_nums)[:-1]
+        pred_boxes = np.split(pred_boxes, split_index, axis=0)
+        assert len(pred_boxes) == len(src_images)
+        classnames = self.prompt_to_classnames(prompt)
+        rst_boxes = []
+        for pred_box, src_image in zip(pred_boxes, src_images):
+            rst_boxes.append(
+                self.postprocess(
+                    pred_box,
+                    classnames,
+                    src_image,
+                    threshold,
+                )
+            )
+        return rst_boxes
+    def postprocess(
+        self,
+        pred_boxes,
+        classnames,
+        src_image,
+        threshold,
+    ):
+        """Post Process for prediction result of single image."""
+        pred_boxes = pred_boxes[pred_boxes[:, 1] > threshold]
+        H, W, *_ = src_image.shape
+        pred_labels = pred_boxes[:, 0].astype(np.int32)
+        pred_scores = pred_boxes[:, 1]
+        pred_bboxes = pred_boxes[:, 2:]
+        pred_bboxes[:, ::2] = np.clip(pred_bboxes[:, ::2], a_min=0, a_max=W)
+        pred_bboxes[:, 1::2] = np.clip(pred_bboxes[:, 1::2], a_min=0, a_max=H)
+        rst_bboxes = []
+        for pred_label, pred_score, pred_bbox in zip(
+            pred_labels, pred_scores, pred_bboxes
+        ):
+            rst_bboxes.append(
+                {
+                    "coordinate": pred_bbox.tolist(),
+                    "label": classnames[pred_label],
+                    "score": pred_score,
+                }
+            )
+        return rst_bboxes
+    def prompt_to_classnames(self, text):
+        text = text.strip().lower()
+        words = [word.strip() for word in text.split(",")]
+        words += [" "]
+        return words

paddlex/inference/models/open_vocabulary_segmentation/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex/inference/models/open_vocabulary_segmentation/predictor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,21 +13,18 @@
 # limitations under the License.
-from typing import Any, Union, Dict, List, Tuple, Optional, Callable
-import numpy as np
-import inspect
+from typing import Any, Dict, List
-from ....utils.func_register import FuncRegister
 from ....modules.open_vocabulary_segmentation.model_list import MODELS
+from ....utils.func_register import FuncRegister
 from ...common.batch_sampler import ImageBatchSampler
 from ...common.reader import ReadImage
+from ..base import BasePredictor
 from .processors import SAMProcessor
-from ..common import StaticInfer
-from ..base import BasicPredictor
 from .results import SAMSegResult
-class OVSegPredictor(BasicPredictor):
+class OVSegPredictor(BasePredictor):
     entities = MODELS
@@ -62,11 +59,7 @@ class OVSegPredictor(BasicPredictor):
                 pre_ops.append(op)
         # build infer
-        infer = StaticInfer(
-            model_dir=self.model_dir,
-            model_prefix=self.MODEL_FILE_PREFIX,
-            option=self.pp_option,
-        )
+        infer = self.create_static_infer()
         # build model specific processor, it's required for a OV model.
         processor_cfg = self.config["Processor"]

paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-from typing import Dict, List, Optional, Union, Tuple
+from copy import deepcopy
+from typing import List, Optional, Tuple, Union
 import numpy as np
 import PIL
-from copy import deepcopy
-from .....utils.lazy_loader import LazyLoader
-# NOTE: LazyLoader is used to avoid conflicts between ultra-infer and Paddle
-paddle = LazyLoader("lazy_paddle", globals(), "paddle")
-T = LazyLoader("T", globals(), "paddle.vision.transforms")
-F = LazyLoader("F", globals(), "paddle.nn.functional")
+from ....utils.benchmark import benchmark
 def _get_preprocess_shape(
@@ -106,6 +100,8 @@ class SAMProcessor(object):
         return image_seg, prompt
     def postprocess(self, low_res_masks, mask_threshold: float = 0.0):
+        import paddle
+        import paddle.nn.functional as F
         if isinstance(low_res_masks, list):
             assert len(low_res_masks) == 1
@@ -126,6 +122,7 @@ class SAMProcessor(object):
         return [masks]
+@benchmark.timeit
 class SamPromptProcessor(object):
     """Constructs a Sam prompt processor."""
@@ -180,6 +177,7 @@ class SamPromptProcessor(object):
             return box.astype(np.float32)
+@benchmark.timeit
 class SamImageProcessor(object):
     """Constructs a Sam image processor."""
@@ -207,6 +205,8 @@ class SamImageProcessor(object):
     def apply_image(self, image: np.ndarray) -> np.ndarray:
         """Expects a numpy array with shape HxWxC in uint8 format."""
+        import paddle.vision.transforms as T
         target_size = _get_preprocess_shape(image.shape[0], image.shape[1], self.size)
         if isinstance(image, np.ndarray):
             image = PIL.Image.fromarray(image)
@@ -223,8 +223,8 @@ class SamImageProcessor(object):
         images,
     ):
         """Preprocess an image or a batch of images with a same shape."""
-        size = self.size
+        import paddle
+        import paddle.nn.functional as F
         input_image = [self.apply_image(image) for image in images]

paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.1__py3-none-any.whl