PyPI - paddlex - Versions diffs - 3.0.0b2__py3-none-any.whl → 3.0.0rc0__py3-none-any.whl - Mend

paddlex 3.0.0b2__py3-none-any.whl → 3.0.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (940) hide show

paddlex/inference/models/video_detection/processors.py ADDED Viewed

@@ -0,0 +1,450 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import os.path as osp
+from typing import List, Sequence, Union, Optional, Tuple
+import numpy as np
+import cv2
+import lazy_paddle as paddle
+class ResizeVideo:
+    """Resizes frames of a video to a specified target size.
+    This class provides functionality to resize each frame of a video to
+    a specified square dimension (height and width are equal).
+    Attributes:
+        target_size (int): The desired size (in pixels) for both the height
+            and width of each frame in the video.
+    """
+    def __init__(self, target_size: int = 224) -> None:
+        """Initializes the ResizeVideo with a target size.
+        Args:
+            target_size (int): The desired size in pixels for the output
+                frames. Defaults to 224.
+        """
+        super().__init__()
+        self.target_size = target_size
+    def resize(self, video: List) -> List:
+        """Resizes all frames of a single video.
+        Args:
+            video (list): A list of segments, where each segment is a list
+                of frames represented as numpy arrays.
+        Returns:
+            list: The input video with each frame resized to the target size.
+        Raises:
+            NotImplementedError: If a frame is not an instance of numpy.ndarray.
+        """
+        num_seg = len(video)
+        seg_len = len(video[0])
+        for i in range(num_seg):
+            for j in range(seg_len):
+                img = video[i][j]
+                if isinstance(img, np.ndarray):
+                    h, w, _ = img.shape
+                else:
+                    raise NotImplementedError(
+                        "Currently, only numpy.ndarray frames are supported."
+                    )
+                video[i][j] = cv2.resize(
+                    img,
+                    (self.target_size, self.target_size),
+                    interpolation=cv2.INTER_LINEAR,
+                )
+        return video
+    def __call__(self, videos: List) -> List:
+        """Resizes frames of multiple videos.
+        Args:
+            videos (list): A list containing multiple videos, where each video
+                is a list of segments, and each segment is a list of frames.
+        Returns:
+            list: A list of videos with each frame resized to the target size.
+        """
+        return [self.resize(video) for video in videos]
+class Image2Array:
+    """Convert a sequence of images to a numpy array with optional transposition."""
+    def __init__(self, data_format: str = "tchw") -> None:
+        """
+        Initializes the Image2Array class.
+        Args:
+            data_format (str): The format to transpose to, either 'tchw' or 'cthw'.
+        Raises:
+            AssertionError: If data_format is not one of the allowed values.
+        """
+        super().__init__()
+        assert data_format in [
+            "tchw",
+            "cthw",
+        ], f"Target format must be in ['tchw', 'cthw'], but got {data_format}"
+        self.data_format = data_format
+    def img2array(self, video: List) -> List:
+        """
+        Converts a list of video frames to a numpy array, with frames transposed.
+        Args:
+            video (List): A list of frames represented as numpy arrays.
+        Returns:
+            List: A numpy array with the video frames transposed and concatenated.
+        """
+        # Transpose each image from HWC to CHW format
+        num_seg = len(video)
+        for i in range(num_seg):
+            video_one = video[i]
+            video_one = [img.transpose([2, 0, 1]) for img in video_one]
+            video_one = np.concatenate(
+                [np.expand_dims(img, axis=1) for img in video_one], axis=1
+            )
+            video[i] = video_one
+        return video
+    def __call__(self, videos: List[List[np.ndarray]]) -> List[np.ndarray]:
+        """
+        Process videos by converting each video to a transposed numpy array.
+        Args:
+            videos (List[List[np.ndarray]]): A list of videos, where each video is a list
+                of frames represented as numpy arrays.
+        Returns:
+            List[np.ndarray]: A list of processed videos with transposed frames.
+        """
+        return [self.img2array(video) for video in videos]
+class NormalizeVideo:
+    """
+    A class to normalize video frames by scaling the pixel values.
+    """
+    def __init__(self, scale: float = 255.0) -> None:
+        """
+        Initializes the NormalizeVideo class.
+        Args:
+            scale (float): The scale factor to normalize the frames, usually the max pixel value.
+        """
+        super().__init__()
+        self.scale = scale
+    def normalize_video(self, video: List[np.ndarray]) -> List[np.ndarray]:
+        """
+        Normalizes a sequence of images by scaling the pixel values.
+        Args:
+            video (List[np.ndarray]): A list of frames, where each frame is a numpy array to be normalized.
+        Returns:
+            List[np.ndarray]: The normalized video frames as a list of numpy arrays.
+        """
+        num_seg = len(video)  # Number of frames in the video
+        for i in range(num_seg):
+            # Convert frame to float32 and scale pixel values
+            video[i] = video[i].astype(np.float32) / self.scale
+            # Expand dimensions if needed
+            video[i] = np.expand_dims(video[i], axis=0)
+        return video
+    def __call__(self, videos: List[List[np.ndarray]]) -> List[List[np.ndarray]]:
+        """
+        Apply normalization to a list of videos.
+        Args:
+            videos (List[List[np.ndarray]]): A list of videos, where each video is a list of frames
+                represented as numpy arrays.
+        Returns:
+            List[List[np.ndarray]]: A list of normalized videos, each represented as a list of normalized frames.
+        """
+        return [self.normalize_video(video) for video in videos]
+def convert2cpu(gpu_matrix):
+    float_32_g = gpu_matrix.astype("float32")
+    return float_32_g.cpu()
+def convert2cpu_long(gpu_matrix):
+    int_64_g = gpu_matrix.astype("int64")
+    return int_64_g.cpu()
+def get_region_boxes(
+    output,
+    conf_thresh=0.005,
+    num_classes=24,
+    anchors=[
+        0.70458,
+        1.18803,
+        1.26654,
+        2.55121,
+        1.59382,
+        4.08321,
+        2.30548,
+        4.94180,
+        3.52332,
+        5.91979,
+    ],
+    num_anchors=5,
+    only_objectness=1,
+):
+    """
+    Decode a raw anchor-based detection output into candidate boxes.
+    Args:
+        output (Tensor): The network output, either 4-D with
+            ``(5 + num_classes) * num_anchors`` entries on axis 1, or 3-D
+            (a batch axis is then added in front).
+        conf_thresh (float): Boxes are kept only when their score is strictly
+            greater than this value. Default is 0.005.
+        num_classes (int): The number of classes for classification. Default is 24.
+        anchors (List[float]): Flat list of anchor values; it is reshaped to
+            ``[num_anchors, len(anchors) // num_anchors]`` and column 0 / column 1
+            are used as anchor width / height. Default is a list of 10 values.
+            The list is only read here, never modified.
+        num_anchors (int): The number of anchor boxes used in the model. Default is 5.
+        only_objectness (int): If truthy, the score compared with ``conf_thresh``
+            is the objectness alone; otherwise it is objectness multiplied by
+            the best class confidence. Default is 1.
+    Returns:
+        all_boxes (list): One list per batch item. Each box is
+            ``[cx / w, cy / h, bw / w, bh / h, det_conf, cls_max_conf, cls_max_id]``
+            where ``w`` and ``h`` are the sizes of axes 3 and 2 of ``output``.
+            The entries are the values obtained by indexing the converted
+            tensors (presumably single-element tensors, not Python floats --
+            TODO confirm).
+    Raises:
+        AssertionError: If axis 1 of ``output`` does not have
+            ``(5 + num_classes) * num_anchors`` entries.
+    """
+    # Number of values stored per anchor in the flat ``anchors`` list.
+    anchor_step = len(anchors) // num_anchors
+    # Add a leading batch axis to a 3-D input.
+    if output.dim() == 3:
+        output = output.unsqueeze(0)
+    batch = output.shape[0]
+    assert output.shape[1] == (5 + num_classes) * num_anchors
+    h = output.shape[2]
+    w = output.shape[3]
+    all_boxes = []
+    # Rearrange so that output[k] is prediction channel k, flattened in
+    # (batch, anchor, row, column) order.
+    output = paddle.reshape(output, [batch * num_anchors, 5 + num_classes, h * w])
+    output = paddle.transpose(output, (1, 0, 2))
+    output = paddle.reshape(output, [5 + num_classes, batch * num_anchors * h * w])
+    # Column index of every cell, flattened in the same order as output[k].
+    # NOTE(review): ``.cuda()`` is called unconditionally here and on the
+    # anchors below -- this looks like it requires a CUDA device; confirm the
+    # behaviour on CPU-only installations.
+    grid_x = paddle.linspace(0, w - 1, w)
+    grid_x = paddle.tile(grid_x, [h, 1])
+    grid_x = paddle.tile(grid_x, [batch * num_anchors, 1, 1])
+    grid_x = paddle.reshape(grid_x, [batch * num_anchors * h * w]).cuda()
+    # Row index of every cell (built as w x h, then transposed to h x w).
+    grid_y = paddle.linspace(0, h - 1, h)
+    grid_y = paddle.tile(grid_y, [w, 1]).t()
+    grid_y = paddle.tile(grid_y, [batch * num_anchors, 1, 1])
+    grid_y = paddle.reshape(grid_y, [batch * num_anchors * h * w]).cuda()
+    sigmoid = paddle.nn.Sigmoid()
+    # Box centres in grid-cell units: sigmoid of channels 0/1 plus the cell index.
+    xs = sigmoid(output[0]) + grid_x
+    ys = sigmoid(output[1]) + grid_y
+    # Column 0 of the reshaped anchors is used as width, column 1 as height.
+    anchor_w = paddle.to_tensor(anchors)
+    anchor_w = paddle.reshape(anchor_w, [num_anchors, anchor_step])
+    anchor_w = paddle.index_select(
+        anchor_w, index=paddle.to_tensor(np.array([0]).astype("int32")), axis=1
+    )
+    anchor_h = paddle.to_tensor(anchors)
+    anchor_h = paddle.reshape(anchor_h, [num_anchors, anchor_step])
+    anchor_h = paddle.index_select(
+        anchor_h, index=paddle.to_tensor(np.array([1]).astype("int32")), axis=1
+    )
+    # Repeat the per-anchor sizes over the batch and over all h * w cells, then
+    # flatten to the same length as output[k].
+    anchor_w = paddle.tile(anchor_w, [batch, 1])
+    anchor_w = paddle.tile(anchor_w, [1, 1, h * w])
+    anchor_w = paddle.reshape(anchor_w, [batch * num_anchors * h * w]).cuda()
+    anchor_h = paddle.tile(anchor_h, [batch, 1])
+    anchor_h = paddle.tile(anchor_h, [1, 1, h * w])
+    anchor_h = paddle.reshape(anchor_h, [batch * num_anchors * h * w]).cuda()
+    # Box sizes in grid-cell units: exp of channels 2/3 times the anchor size.
+    ws = paddle.exp(output[2]) * anchor_w
+    hs = paddle.exp(output[3]) * anchor_h
+    # Objectness score: sigmoid of channel 4.
+    det_confs = sigmoid(output[4])
+    # Class scores: softmax over the remaining num_classes channels, then the
+    # best score and its class index for every position.
+    cls_confs = paddle.to_tensor(output[5 : 5 + num_classes], stop_gradient=True)
+    cls_confs = paddle.transpose(cls_confs, [1, 0])
+    s = paddle.nn.Softmax()
+    cls_confs = paddle.to_tensor(s(cls_confs))
+    cls_max_confs = paddle.max(cls_confs, axis=1)
+    cls_max_ids = paddle.argmax(cls_confs, axis=1)
+    cls_max_confs = paddle.reshape(cls_max_confs, [-1])
+    cls_max_ids = paddle.reshape(cls_max_ids, [-1])
+    sz_hw = h * w
+    sz_hwa = sz_hw * num_anchors
+    # Cast and pass everything through ``.cpu()`` before the element-wise
+    # Python loop below.
+    det_confs = convert2cpu(det_confs)
+    cls_max_confs = convert2cpu(cls_max_confs)
+    cls_max_ids = convert2cpu_long(cls_max_ids)
+    xs = convert2cpu(xs)
+    ys = convert2cpu(ys)
+    ws = convert2cpu(ws)
+    hs = convert2cpu(hs)
+    for b in range(batch):
+        boxes = []
+        for cy in range(h):
+            for cx in range(w):
+                for i in range(num_anchors):
+                    # Flat position of (batch b, anchor i, row cy, column cx).
+                    ind = b * sz_hwa + i * sz_hw + cy * w + cx
+                    det_conf = det_confs[ind]
+                    if only_objectness:
+                        conf = det_confs[ind]
+                    else:
+                        conf = det_confs[ind] * cls_max_confs[ind]
+                    if conf > conf_thresh:
+                        bcx = xs[ind]
+                        bcy = ys[ind]
+                        bw = ws[ind]
+                        bh = hs[ind]
+                        cls_max_conf = cls_max_confs[ind]
+                        cls_max_id = cls_max_ids[ind]
+                        # Centre and size are divided by the grid dimensions.
+                        box = [
+                            bcx / w,
+                            bcy / h,
+                            bw / w,
+                            bh / h,
+                            det_conf,
+                            cls_max_conf,
+                            cls_max_id,
+                        ]
+                        boxes.append(box)
+        all_boxes.append(boxes)
+    return all_boxes
+def nms(boxes, nms_thresh):
+    """
+    Performs non-maximum suppression on the input boxes based on their IoUs.
+    """
+    if len(boxes) == 0:
+        return boxes
+    det_confs = paddle.zeros([len(boxes)])
+    for i in range(len(boxes)):
+        det_confs[i] = 1 - boxes[i][4]
+    sortIds = paddle.argsort(det_confs)
+    out_boxes = []
+    for i in range(len(boxes)):
+        box_i = boxes[sortIds[i]]
+        if box_i[4] > 0:
+            out_boxes.append(box_i)
+            for j in range(i + 1, len(boxes)):
+                box_j = boxes[sortIds[j]]
+                if bbox_iou(box_i, box_j, x1y1x2y2=False) > nms_thresh:
+                    box_j[4] = 0
+    return out_boxes
+def bbox_iou(box1, box2, x1y1x2y2=True):
+    """
+    Returns the Intersection over Union (IoU) of two bounding boxes.
+    """
+    if x1y1x2y2:
+        mx = min(box1[0], box2[0])
+        Mx = max(box1[2], box2[2])
+        my = min(box1[1], box2[1])
+        My = max(box1[3], box2[3])
+        w1 = box1[2] - box1[0]
+        h1 = box1[3] - box1[1]
+        w2 = box2[2] - box2[0]
+        h2 = box2[3] - box2[1]
+    else:
+        mx = min(float(box1[0] - box1[2] / 2.0), float(box2[0] - box2[2] / 2.0))
+        Mx = max(float(box1[0] + box1[2] / 2.0), float(box2[0] + box2[2] / 2.0))
+        my = min(float(box1[1] - box1[3] / 2.0), float(box2[1] - box2[3] / 2.0))
+        My = max(float(box1[1] + box1[3] / 2.0), float(box2[1] + box2[3] / 2.0))
+        w1 = box1[2]
+        h1 = box1[3]
+        w2 = box2[2]
+        h2 = box2[3]
+    uw = Mx - mx
+    uh = My - my
+    cw = w1 + w2 - uw
+    ch = h1 + h2 - uh
+    carea = 0
+    if cw <= 0 or ch <= 0:
+        return paddle.to_tensor(0.0)
+    area1 = w1 * h1
+    area2 = w2 * h2
+    carea = cw * ch
+    uarea = area1 + area2 - carea
+    return carea / uarea
+class DetVideoPostProcess:
+    """
+    A class used to perform post-processing on detection results in videos.
+    """
+    def __init__(
+        self,
+        label_list: List[str] = [],
+    ) -> None:
+        """
+        Args:
+            labels : List[str]
+                A list of labels or class names associated with the detection results.
+        """
+        super().__init__()
+        self.labels = label_list
+    def postprocess(self, pred: List, nms_thresh: float, score_thresh: float) -> List:
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        num_seg = len(pred)
+        pred_all = []
+        for i in range(num_seg):
+            outputs = pred[i]
+            for out in outputs:
+                preds = []
+                out = paddle.to_tensor(out)
+                all_boxes = get_region_boxes(out, num_classes=len(self.labels))
+                for i in range(out.shape[0]):
+                    boxes = all_boxes[i]
+                    boxes = nms(boxes, nms_thresh)
+                    for box in boxes:
+                        x1 = round(float(box[0] - box[2] / 2.0) * 320.0)
+                        y1 = round(float(box[1] - box[3] / 2.0) * 240.0)
+                        x2 = round(float(box[0] + box[2] / 2.0) * 320.0)
+                        y2 = round(float(box[1] + box[3] / 2.0) * 240.0)
+                        det_conf = float(box[4])
+                        for j in range((len(box) - 5) // 2):
+                            cls_conf = float(box[5 + 2 * j].item())
+                            prob = det_conf * cls_conf
+                        if prob > score_thresh:
+                            preds.append(
+                                [[x1, y1, x2, y2], prob, self.labels[int(box[6])]]
+                            )
+            pred_all.append(preds)
+        return pred_all
+    def __call__(self, preds: List, nms_thresh, score_thresh) -> List:
+        return [self.postprocess(pred, nms_thresh, score_thresh) for pred in preds]

paddlex/inference/models/video_detection/result.py ADDED Viewed

@@ -0,0 +1,104 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import numpy as np
+import random
+import PIL
+from PIL import Image, ImageDraw, ImageFont
+from ....utils.fonts import PINGFANG_FONT_FILE_PATH
+from ...utils.color_map import get_colormap
+from ...utils.io import VideoReader
+from ...common.result import BaseVideoResult
+class DetVideoResult(BaseVideoResult):
+    def _to_video(self):
+        """Draw label on image"""
+        video_reader = VideoReader(backend="decord")
+        video = video_reader.read(self["input_path"])
+        video = list(video)
+        write_fps = video_reader.get_fps()
+        label2color = {}
+        catid2fontcolor = {}
+        color_list = get_colormap(rgb=True)
+        video_list = []
+        for i in range(len(video)):
+            image = Image.fromarray(video[i].asnumpy())
+            image_size = image.size
+            font_size = int(0.018 * int(image.width)) + 2
+            font = ImageFont.truetype(
+                PINGFANG_FONT_FILE_PATH, font_size, encoding="utf-8"
+            )
+            draw_thickness = int(max(image.size) * 0.002)
+            draw = ImageDraw.Draw(image)
+            results = self["result"][i]
+            for result in results:
+                bbox, score, class_name = result
+                if class_name not in label2color:
+                    random_index = random.randint(0, len(color_list) - 1)
+                    label2color[class_name] = color_list[random_index]
+                    catid2fontcolor[class_name] = self._get_font_colormap(random_index)
+                color = tuple(label2color[class_name])
+                font_color = tuple(catid2fontcolor[class_name])
+                xmin, ymin, xmax, ymax = bbox
+                rectangle = [
+                    (xmin, ymin),
+                    (xmin, ymax),
+                    (xmax, ymax),
+                    (xmax, ymin),
+                    (xmin, ymin),
+                ]
+                draw.line(
+                    rectangle,
+                    width=draw_thickness,
+                    fill=color,
+                )
+                text = "{} {:.2f}".format(class_name, score)
+                if tuple(map(int, PIL.__version__.split("."))) <= (10, 0, 0):
+                    tw, th = draw.textsize(text, font=font)
+                else:
+                    left, top, right, bottom = draw.textbbox((0, 0), text, font)
+                    tw, th = right - left, bottom - top + 4
+                if ymin < th:
+                    draw.rectangle(
+                        [(xmin, ymin), (xmin + tw + 4, ymin + th + 1)], fill=color
+                    )
+                    draw.text((xmin + 2, ymin - 2), text, fill=font_color, font=font)
+                else:
+                    draw.rectangle(
+                        [(xmin, ymin - th), (xmin + tw + 4, ymin + 1)], fill=color
+                    )
+                    draw.text(
+                        (xmin + 2, ymin - th - 2), text, fill=font_color, font=font
+                    )
+            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+            video_list.append(image)
+        return {"res": (np.array(video_list), write_fps)}
+    def _get_font_colormap(self, color_index):
+        """
+        Get font colormap
+        """
+        dark = np.array([0x14, 0x0E, 0x35])
+        light = np.array([0xFF, 0xFF, 0xFF])
+        light_indexs = [0, 3, 4, 8, 9, 13, 14, 18, 19]
+        if color_index in light_indexs:
+            return light.astype("int32")
+        else:
+            return dark.astype("int32")

paddlex/inference/pipelines/3d_bev_detection/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .pipeline import BEVDet3DPipeline

paddlex/inference/pipelines/3d_bev_detection/pipeline.py ADDED Viewed

@@ -0,0 +1,67 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Union, List
+import numpy as np
+from importlib import import_module
+from ...utils.pp_option import PaddlePredictorOption
+from ..base import BasePipeline
+# The package name "3d_bev_detection" starts with a digit, so it cannot be
+# written in a regular ``import`` / ``from ... import`` statement; the module
+# is loaded by its dotted name instead.
+module_3d_bev_detection_result = import_module(
+    ".result", "paddlex.inference.models.3d_bev_detection"
+)
+# Result class used as the return annotation of the pipeline's ``predict``.
+BEV3DDetResult = getattr(module_3d_bev_detection_result, "BEV3DDetResult")
+class BEVDet3DPipeline(BasePipeline):
+    """3D Detection Pipeline"""
+    entities = "3d_bev_detection"
+    def __init__(
+        self,
+        config: Dict,
+        device: str = None,
+        pp_option: PaddlePredictorOption = None,
+        use_hpip: bool = False,
+    ) -> None:
+        """
+        Initializes the class with given configurations and options.
+        Args:
+            config (Dict): Configuration dictionary containing model and other parameters.
+            device (str): The device to run the prediction on. Default is None.
+            pp_option (PaddlePredictorOption): Options for PaddlePaddle predictor. Default is None.
+            use_hpip (bool): Whether to use high-performance inference (hpip) for prediction. Defaults to False.
+        """
+        super().__init__(device=device, pp_option=pp_option, use_hpip=use_hpip)
+        bev_detection_3d_model_config = config["SubModules"]["3DBEVDetection"]
+        self.bev_detection_3d_model = self.create_model(bev_detection_3d_model_config)
+    def predict(
+        self,
+        input: Union[str, List[str], np.ndarray, List[np.ndarray]],
+        **kwargs,
+    ) -> BEV3DDetResult:
+        """Predicts 3D detection results for the given input.
+        Args:
+            input (str | list[str] | np.ndarray | list[np.ndarray]): The input path(s) to the 3d annotation pickle file.
+            **kwargs: Additional keyword arguments that can be passed to the function.
+        Returns:
+            BEV3DDetResult: The predicted 3d detection results.
+        """
+        yield from self.bev_detection_3d_model(input)

paddlex 3.0.0b2__py3-none-any.whl → 3.0.0rc0__py3-none-any.whl

paddlex 3.0.0b2__py3-none-any.whl → 3.0.0rc0__py3-none-any.whl