python-doctr 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. doctr/datasets/__init__.py +2 -0
  2. doctr/datasets/cord.py +6 -4
  3. doctr/datasets/datasets/base.py +3 -2
  4. doctr/datasets/datasets/pytorch.py +4 -2
  5. doctr/datasets/datasets/tensorflow.py +4 -2
  6. doctr/datasets/detection.py +6 -3
  7. doctr/datasets/doc_artefacts.py +2 -1
  8. doctr/datasets/funsd.py +7 -8
  9. doctr/datasets/generator/base.py +3 -2
  10. doctr/datasets/generator/pytorch.py +3 -1
  11. doctr/datasets/generator/tensorflow.py +3 -1
  12. doctr/datasets/ic03.py +3 -2
  13. doctr/datasets/ic13.py +2 -1
  14. doctr/datasets/iiit5k.py +6 -4
  15. doctr/datasets/iiithws.py +2 -1
  16. doctr/datasets/imgur5k.py +3 -2
  17. doctr/datasets/loader.py +4 -2
  18. doctr/datasets/mjsynth.py +2 -1
  19. doctr/datasets/ocr.py +2 -1
  20. doctr/datasets/orientation.py +40 -0
  21. doctr/datasets/recognition.py +3 -2
  22. doctr/datasets/sroie.py +2 -1
  23. doctr/datasets/svhn.py +2 -1
  24. doctr/datasets/svt.py +3 -2
  25. doctr/datasets/synthtext.py +2 -1
  26. doctr/datasets/utils.py +27 -11
  27. doctr/datasets/vocabs.py +26 -1
  28. doctr/datasets/wildreceipt.py +111 -0
  29. doctr/file_utils.py +3 -1
  30. doctr/io/elements.py +52 -35
  31. doctr/io/html.py +5 -3
  32. doctr/io/image/base.py +5 -4
  33. doctr/io/image/pytorch.py +12 -7
  34. doctr/io/image/tensorflow.py +11 -6
  35. doctr/io/pdf.py +5 -4
  36. doctr/io/reader.py +13 -5
  37. doctr/models/_utils.py +30 -53
  38. doctr/models/artefacts/barcode.py +4 -3
  39. doctr/models/artefacts/face.py +4 -2
  40. doctr/models/builder.py +58 -43
  41. doctr/models/classification/__init__.py +1 -0
  42. doctr/models/classification/magc_resnet/pytorch.py +5 -2
  43. doctr/models/classification/magc_resnet/tensorflow.py +5 -2
  44. doctr/models/classification/mobilenet/pytorch.py +16 -4
  45. doctr/models/classification/mobilenet/tensorflow.py +29 -20
  46. doctr/models/classification/predictor/pytorch.py +3 -2
  47. doctr/models/classification/predictor/tensorflow.py +2 -1
  48. doctr/models/classification/resnet/pytorch.py +23 -13
  49. doctr/models/classification/resnet/tensorflow.py +33 -26
  50. doctr/models/classification/textnet/__init__.py +6 -0
  51. doctr/models/classification/textnet/pytorch.py +275 -0
  52. doctr/models/classification/textnet/tensorflow.py +267 -0
  53. doctr/models/classification/vgg/pytorch.py +4 -2
  54. doctr/models/classification/vgg/tensorflow.py +5 -2
  55. doctr/models/classification/vit/pytorch.py +9 -3
  56. doctr/models/classification/vit/tensorflow.py +9 -3
  57. doctr/models/classification/zoo.py +7 -2
  58. doctr/models/core.py +1 -1
  59. doctr/models/detection/__init__.py +1 -0
  60. doctr/models/detection/_utils/pytorch.py +7 -1
  61. doctr/models/detection/_utils/tensorflow.py +7 -3
  62. doctr/models/detection/core.py +9 -3
  63. doctr/models/detection/differentiable_binarization/base.py +37 -25
  64. doctr/models/detection/differentiable_binarization/pytorch.py +80 -104
  65. doctr/models/detection/differentiable_binarization/tensorflow.py +74 -55
  66. doctr/models/detection/fast/__init__.py +6 -0
  67. doctr/models/detection/fast/base.py +256 -0
  68. doctr/models/detection/fast/pytorch.py +442 -0
  69. doctr/models/detection/fast/tensorflow.py +428 -0
  70. doctr/models/detection/linknet/base.py +12 -5
  71. doctr/models/detection/linknet/pytorch.py +28 -15
  72. doctr/models/detection/linknet/tensorflow.py +68 -88
  73. doctr/models/detection/predictor/pytorch.py +16 -6
  74. doctr/models/detection/predictor/tensorflow.py +13 -5
  75. doctr/models/detection/zoo.py +19 -16
  76. doctr/models/factory/hub.py +20 -10
  77. doctr/models/kie_predictor/base.py +2 -1
  78. doctr/models/kie_predictor/pytorch.py +28 -36
  79. doctr/models/kie_predictor/tensorflow.py +27 -27
  80. doctr/models/modules/__init__.py +1 -0
  81. doctr/models/modules/layers/__init__.py +6 -0
  82. doctr/models/modules/layers/pytorch.py +166 -0
  83. doctr/models/modules/layers/tensorflow.py +175 -0
  84. doctr/models/modules/transformer/pytorch.py +24 -22
  85. doctr/models/modules/transformer/tensorflow.py +6 -4
  86. doctr/models/modules/vision_transformer/pytorch.py +2 -4
  87. doctr/models/modules/vision_transformer/tensorflow.py +2 -4
  88. doctr/models/obj_detection/faster_rcnn/pytorch.py +4 -2
  89. doctr/models/predictor/base.py +14 -3
  90. doctr/models/predictor/pytorch.py +26 -29
  91. doctr/models/predictor/tensorflow.py +25 -22
  92. doctr/models/preprocessor/pytorch.py +14 -9
  93. doctr/models/preprocessor/tensorflow.py +10 -5
  94. doctr/models/recognition/core.py +4 -1
  95. doctr/models/recognition/crnn/pytorch.py +23 -16
  96. doctr/models/recognition/crnn/tensorflow.py +25 -17
  97. doctr/models/recognition/master/base.py +4 -1
  98. doctr/models/recognition/master/pytorch.py +20 -9
  99. doctr/models/recognition/master/tensorflow.py +20 -8
  100. doctr/models/recognition/parseq/base.py +4 -1
  101. doctr/models/recognition/parseq/pytorch.py +28 -22
  102. doctr/models/recognition/parseq/tensorflow.py +22 -11
  103. doctr/models/recognition/predictor/_utils.py +3 -2
  104. doctr/models/recognition/predictor/pytorch.py +3 -2
  105. doctr/models/recognition/predictor/tensorflow.py +2 -1
  106. doctr/models/recognition/sar/pytorch.py +14 -7
  107. doctr/models/recognition/sar/tensorflow.py +23 -14
  108. doctr/models/recognition/utils.py +5 -1
  109. doctr/models/recognition/vitstr/base.py +4 -1
  110. doctr/models/recognition/vitstr/pytorch.py +22 -13
  111. doctr/models/recognition/vitstr/tensorflow.py +21 -10
  112. doctr/models/recognition/zoo.py +4 -2
  113. doctr/models/utils/pytorch.py +24 -6
  114. doctr/models/utils/tensorflow.py +22 -3
  115. doctr/models/zoo.py +21 -3
  116. doctr/transforms/functional/base.py +8 -3
  117. doctr/transforms/functional/pytorch.py +23 -6
  118. doctr/transforms/functional/tensorflow.py +25 -5
  119. doctr/transforms/modules/base.py +12 -5
  120. doctr/transforms/modules/pytorch.py +10 -12
  121. doctr/transforms/modules/tensorflow.py +17 -9
  122. doctr/utils/common_types.py +1 -1
  123. doctr/utils/data.py +4 -2
  124. doctr/utils/fonts.py +3 -2
  125. doctr/utils/geometry.py +95 -26
  126. doctr/utils/metrics.py +36 -22
  127. doctr/utils/multithreading.py +5 -3
  128. doctr/utils/repr.py +3 -1
  129. doctr/utils/visualization.py +31 -8
  130. doctr/version.py +1 -1
  131. {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/METADATA +67 -31
  132. python_doctr-0.8.1.dist-info/RECORD +173 -0
  133. {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/WHEEL +1 -1
  134. python_doctr-0.7.0.dist-info/RECORD +0 -161
  135. {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/LICENSE +0 -0
  136. {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/top_level.txt +0 -0
  137. {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/zip-safe +0 -0

doctr/models/kie_predictor/pytorch.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -13,7 +13,7 @@ from doctr.io.elements import Document
 from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 
 from .base import _KIEPredictor
 
@@ -24,6 +24,7 @@ class KIEPredictor(nn.Module, _KIEPredictor):
     """Implements an object able to localize and identify text elements in a set of documents
 
     Args:
+    ----
         det_predictor: detection module
         reco_predictor: recognition module
         assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
@@ -35,7 +36,7 @@ class KIEPredictor(nn.Module, _KIEPredictor):
            page. Doing so will slightly deteriorate the overall latency.
        detect_language: if True, the language prediction will be added to the predictions for each
            page. Doing so will slightly deteriorate the overall latency.
-       kwargs: keyword args of `DocumentBuilder`
+       **kwargs: keyword args of `DocumentBuilder`
    """
 
    def __init__(
@@ -59,7 +60,7 @@ class KIEPredictor(nn.Module, _KIEPredictor):
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
 
-    @torch.no_grad()
+    @torch.inference_mode()
     def forward(
         self,
         pages: List[Union[np.ndarray, torch.Tensor]],
@@ -71,11 +72,20 @@ class KIEPredictor(nn.Module, _KIEPredictor):
 
         origin_page_shapes = [page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:] for page in pages]
 
+        # Localize text elements
+        loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
+
         # Detect document rotation and rotate pages
+        seg_maps = [
+            np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
+                np.uint8
+            )
+            for out_map in out_maps
+        ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(page) for page in pages]  # type: ignore[arg-type]
+            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
             orientations = [
-                {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
             ]
         else:
             orientations = None
@@ -83,29 +93,28 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             origin_page_orientations = (
                 origin_page_orientations
                 if self.detect_orientation
-                else [estimate_orientation(page) for page in pages]  # type: ignore[arg-type]
+                else [estimate_orientation(seq_map) for seq_map in seg_maps]
             )
-            pages = [
-                rotate_image(page, -angle, expand=True)  # type: ignore[arg-type]
-                for page, angle in zip(pages, origin_page_orientations)
-            ]
+            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            # Forward again to get predictions on straight pages
+            loc_preds = self.det_predictor(pages, **kwargs)
 
-        # Localize text elements
-        loc_preds = self.det_predictor(pages, **kwargs)
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore[assignment]
         # Check whether crop mode should be switched to channels first
         channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray)
 
         # Rectify crops if aspect ratio
-        dict_loc_preds = {
-            k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()  # type: ignore[arg-type]
-        }
+        dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
+
+        # Apply hooks to loc_preds if any
+        for hook in self.hooks:
+            dict_loc_preds = hook(dict_loc_preds)
 
         # Crop images
         crops = {}
         for class_name in dict_loc_preds.keys():
             crops[class_name], dict_loc_preds[class_name] = self._prepare_crops(
-                pages,  # type: ignore[arg-type]
+                pages,
                 dict_loc_preds[class_name],
                 channels_last=channels_last,
                 assume_straight_pages=self.assume_straight_pages,
@@ -136,29 +145,12 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes_per_page = [
-                {
-                    k: rotate_boxes(
-                        page_boxes,
-                        angle,
-                        orig_shape=page.shape[:2]
-                        if isinstance(page, np.ndarray)
-                        else page.shape[1:],  # type: ignore[arg-type]
-                        target_shape=mask,  # type: ignore[arg-type]
-                    )
-                    for k, page_boxes in page_boxes_dict.items()
-                }
-                for page_boxes_dict, page, angle, mask in zip(
-                    boxes_per_page, pages, origin_page_orientations, origin_page_shapes
-                )
-            ]
 
         out = self.doc_builder(
+            pages,
             boxes_per_page,
             text_preds_per_page,
-            [page.shape[:2] if channels_last else page.shape[-2:] for page in pages],  # type: ignore[misc]
+            origin_page_shapes,
             orientations,
             languages_dict,
         )
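
Both backends of the KIE predictor now run detection first with return_maps=True, binarize the returned probability maps into segmentation maps, and estimate page orientation from those maps rather than from the raw page images (with the orientation confidence reported as None instead of a hard-coded 1.0). A minimal sketch of that binarization step, with a random array standing in for a real detection output map; the (H, W, C) layout and the 0.3 default for bin_thresh are taken from the diff above:

import numpy as np

# Stand-in for a detection head output: per-pixel class probabilities in [0, 1]
out_map = np.random.rand(512, 512, 1).astype(np.float32)

# Collapse classes with a per-pixel max, then binarize at bin_thresh:
# pixels above the threshold map to 255, everything else to 0
seg_map = np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > 0.3, 255, 0).astype(np.uint8)

assert seg_map.shape == out_map.shape
assert set(np.unique(seg_map)) <= {0, 255}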

doctr/models/kie_predictor/tensorflow.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -12,7 +12,7 @@ from doctr.io.elements import Document
 from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_boxes, rotate_image
+from doctr.utils.geometry import rotate_image
 from doctr.utils.repr import NestedObject
 
 from .base import _KIEPredictor
@@ -24,6 +24,7 @@ class KIEPredictor(NestedObject, _KIEPredictor):
     """Implements an object able to localize and identify text elements in a set of documents
 
     Args:
+    ----
         det_predictor: detection module
         reco_predictor: recognition module
         assume_straight_pages: if True, speeds up the inference by assuming you only pass straight pages
@@ -35,7 +36,7 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             page. Doing so will slightly deteriorate the overall latency.
         detect_language: if True, the language prediction will be added to the predictions for each
             page. Doing so will slightly deteriorate the overall latency.
-        kwargs: keyword args of `DocumentBuilder`
+        **kwargs: keyword args of `DocumentBuilder`
     """
 
     _children_names = ["det_predictor", "reco_predictor", "doc_builder"]
@@ -71,27 +72,41 @@ class KIEPredictor(NestedObject, _KIEPredictor):
 
         origin_page_shapes = [page.shape[:2] for page in pages]
 
+        # Localize text elements
+        loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
+
         # Detect document rotation and rotate pages
+        seg_maps = [
+            np.where(np.expand_dims(np.amax(out_map, axis=-1), axis=-1) > kwargs.get("bin_thresh", 0.3), 255, 0).astype(
+                np.uint8
+            )
+            for out_map in out_maps
+        ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(page) for page in pages]
+            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
             orientations = [
-                {"value": orientation_page, "confidence": 1.0} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
             ]
         else:
             orientations = None
         if self.straighten_pages:
             origin_page_orientations = (
-                origin_page_orientations if self.detect_orientation else [estimate_orientation(page) for page in pages]
+                origin_page_orientations
+                if self.detect_orientation
+                else [estimate_orientation(seq_map) for seq_map in seg_maps]
             )
-            pages = [rotate_image(page, -angle, expand=True) for page, angle in zip(pages, origin_page_orientations)]
-
-        # Localize text elements
-        loc_preds = self.det_predictor(pages, **kwargs)
+            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            # Forward again to get predictions on straight pages
+            loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
 
-        dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore[assignment]
+        dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore
         # Rectify crops if aspect ratio
         dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
 
+        # Apply hooks to loc_preds if any
+        for hook in self.hooks:
+            dict_loc_preds = hook(dict_loc_preds)
+
         # Crop images
         crops = {}
         for class_name in dict_loc_preds.keys():
@@ -126,24 +141,9 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
         else:
             languages_dict = None
-        # Rotate back pages and boxes while keeping original image size
-        if self.straighten_pages:
-            boxes_per_page = [
-                {
-                    k: rotate_boxes(
-                        page_boxes,
-                        angle,
-                        orig_shape=page.shape[:2] if isinstance(page, np.ndarray) else page.shape[-2:],
-                        target_shape=mask,  # type: ignore[arg-type]
-                    )
-                    for k, page_boxes in page_boxes_dict.items()
-                }
-                for page_boxes_dict, page, angle, mask in zip(
-                    boxes_per_page, pages, origin_page_orientations, origin_page_shapes
-                )
-            ]
 
         out = self.doc_builder(
+            pages,
            boxes_per_page,
            text_preds_per_page,
            origin_page_shapes,  # type: ignore[arg-type]
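
The TensorFlow variant mirrors the PyTorch changes, including the new hook mechanism: every callable in self.hooks is applied to the localization predictions before cropping, and must return the same structure it received. A hedged sketch of a conforming hook; this diff does not show how hooks get registered, and the per-class layout in the comment is inferred from the dict_loc_preds usage above:

def count_boxes_hook(dict_loc_preds):
    # A KIE hook receives {class_name: [boxes_for_page_0, boxes_for_page_1, ...]},
    # where each entry is an array of relative boxes, and returns the same structure
    for class_name, boxes_per_page in dict_loc_preds.items():
        print(class_name, [len(boxes) for boxes in boxes_per_page])
    return dict_loc_preds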

doctr/models/modules/__init__.py

@@ -1,2 +1,3 @@
+from .layers import *
 from .transformer import *
 from .vision_transformer import *

doctr/models/modules/layers/__init__.py

@@ -0,0 +1,6 @@
+from doctr.file_utils import is_tf_available, is_torch_available
+
+if is_tf_available():
+    from .tensorflow import *
+elif is_torch_available():
+    from .pytorch import *  # type: ignore[assignment]
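
This is the framework-dispatch pattern used throughout doctr: user code imports a backend-agnostic name and gets whichever implementation matches the installed framework. A one-line sketch, assuming at least one backend is installed and relying on the __all__ exports shown below:

from doctr.models.modules.layers import FASTConvLayer  # TensorFlow class if TF is available, else the PyTorch one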

doctr/models/modules/layers/pytorch.py

@@ -0,0 +1,166 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+__all__ = ["FASTConvLayer"]
+
+
+class FASTConvLayer(nn.Module):
+    """Convolutional layer used in the TextNet and FAST architectures"""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.groups = groups
+        self.in_channels = in_channels
+        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+
+        self.hor_conv, self.hor_bn = None, None
+        self.ver_conv, self.ver_bn = None, None
+
+        padding = (int(((self.converted_ks[0] - 1) * dilation) / 2), int(((self.converted_ks[1] - 1) * dilation) / 2))
+
+        self.activation = nn.ReLU(inplace=True)
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=self.converted_ks,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+        )
+
+        self.bn = nn.BatchNorm2d(out_channels)
+
+        if self.converted_ks[1] != 1:
+            self.ver_conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=(self.converted_ks[0], 1),
+                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
+                stride=stride,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+            )
+            self.ver_bn = nn.BatchNorm2d(out_channels)
+
+        if self.converted_ks[0] != 1:
+            self.hor_conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=(1, self.converted_ks[1]),
+                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
+                stride=stride,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
+            )
+            self.hor_bn = nn.BatchNorm2d(out_channels)
+
+        self.rbr_identity = nn.BatchNorm2d(in_channels) if out_channels == in_channels and stride == 1 else None
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "fused_conv"):
+            return self.activation(self.fused_conv(x))
+
+        main_outputs = self.bn(self.conv(x))
+        vertical_outputs = self.ver_bn(self.ver_conv(x)) if self.ver_conv is not None and self.ver_bn is not None else 0
+        horizontal_outputs = (
+            self.hor_bn(self.hor_conv(x)) if self.hor_bn is not None and self.hor_conv is not None else 0
+        )
+        id_out = self.rbr_identity(x) if self.rbr_identity is not None and self.ver_bn is not None else 0
+
+        return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+
+    # The following logic is used to reparametrize the layer
+    # Borrowed from: https://github.com/czczup/FAST/blob/main/models/utils/nas_utils.py
+    def _identity_to_conv(
+        self, identity: Union[nn.BatchNorm2d, None]
+    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
+        if identity is None or identity.running_var is None:
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+            id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        std = (identity.running_var + identity.eps).sqrt()  # type: ignore[attr-defined]
+        t = (identity.weight / std).reshape(-1, 1, 1, 1)
+        return kernel * t, identity.bias - identity.running_mean * identity.weight / std
+
+    def _fuse_bn_tensor(self, conv: nn.Conv2d, bn: nn.BatchNorm2d) -> Tuple[torch.Tensor, torch.Tensor]:
+        kernel = conv.weight
+        kernel = self._pad_to_mxn_tensor(kernel)
+        std = (bn.running_var + bn.eps).sqrt()  # type: ignore
+        t = (bn.weight / std).reshape(-1, 1, 1, 1)
+        return kernel * t, bn.bias - bn.running_mean * bn.weight / std
+
+    def _get_equivalent_kernel_bias(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
+        if self.ver_conv is not None:
+            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)  # type: ignore[arg-type]
+        else:
+            kernel_mx1, bias_mx1 = 0, 0  # type: ignore[assignment]
+        if self.hor_conv is not None:
+            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)  # type: ignore[arg-type]
+        else:
+            kernel_1xn, bias_1xn = 0, 0  # type: ignore[assignment]
+        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
+        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
+        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
+        return kernel_mxn, bias_mxn
+
+    def _pad_to_mxn_tensor(self, kernel: torch.Tensor) -> torch.Tensor:
+        kernel_height, kernel_width = self.converted_ks
+        height, width = kernel.shape[2:]
+        pad_left_right = (kernel_width - width) // 2
+        pad_top_down = (kernel_height - height) // 2
+        return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down], value=0)
+
+    def reparameterize_layer(self):
+        if hasattr(self, "fused_conv"):
+            return
+        kernel, bias = self._get_equivalent_kernel_bias()
+        self.fused_conv = nn.Conv2d(
+            in_channels=self.conv.in_channels,
+            out_channels=self.conv.out_channels,
+            kernel_size=self.conv.kernel_size,  # type: ignore[arg-type]
+            stride=self.conv.stride,  # type: ignore[arg-type]
+            padding=self.conv.padding,  # type: ignore[arg-type]
+            dilation=self.conv.dilation,  # type: ignore[arg-type]
+            groups=self.conv.groups,
+            bias=True,
+        )
+        self.fused_conv.weight.data = kernel
+        self.fused_conv.bias.data = bias  # type: ignore[union-attr]
+        self.deploy = True
+        for para in self.parameters():
+            para.detach_()
+        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
+            if hasattr(self, attr):
+                self.__delattr__(attr)
+
+        if hasattr(self, "rbr_identity"):
+            self.__delattr__("rbr_identity")
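
FASTConvLayer is an over-parameterized block in the RepVGG style: during training it sums a main convolution, a vertical kx1 branch, a horizontal 1xk branch, and a BatchNorm identity branch, and reparameterize_layer() folds all of them, BatchNorm statistics included, into one fused convolution for inference. A sketch of an equivalence check, assuming a PyTorch-only environment so the dispatched import resolves to this module; the fusion only matches in eval mode, where BatchNorm uses its running statistics:

import torch

from doctr.models.modules.layers import FASTConvLayer

layer = FASTConvLayer(in_channels=16, out_channels=16, kernel_size=3).eval()
x = torch.rand(1, 16, 32, 32)

with torch.inference_mode():
    y_ref = layer(x)  # multi-branch training-time graph

layer.reparameterize_layer()  # fold all branches into a single fused_conv

with torch.inference_mode():
    y_fused = layer(x)  # single-convolution fast path

# The fusion is exact up to floating-point error
assert torch.allclose(y_ref, y_fused, atol=1e-5)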

doctr/models/modules/layers/tensorflow.py

@@ -0,0 +1,175 @@
+# Copyright (C) 2021-2024, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+from typing import Any, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras import layers
+
+from doctr.utils.repr import NestedObject
+
+__all__ = ["FASTConvLayer"]
+
+
+class FASTConvLayer(layers.Layer, NestedObject):
+    """Convolutional layer used in the TextNet and FAST architectures"""
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: int = 1,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.groups = groups
+        self.in_channels = in_channels
+        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+
+        self.hor_conv, self.hor_bn = None, None
+        self.ver_conv, self.ver_bn = None, None
+
+        padding = ((self.converted_ks[0] - 1) * dilation // 2, (self.converted_ks[1] - 1) * dilation // 2)
+
+        self.activation = layers.ReLU()
+        self.conv_pad = layers.ZeroPadding2D(padding=padding)
+
+        self.conv = layers.Conv2D(
+            filters=out_channels,
+            kernel_size=self.converted_ks,
+            strides=stride,
+            dilation_rate=dilation,
+            groups=groups,
+            use_bias=bias,
+        )
+
+        self.bn = layers.BatchNormalization()
+
+        if self.converted_ks[1] != 1:
+            self.ver_pad = layers.ZeroPadding2D(
+                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
+            )
+            self.ver_conv = layers.Conv2D(
+                filters=out_channels,
+                kernel_size=(self.converted_ks[0], 1),
+                strides=stride,
+                dilation_rate=dilation,
+                groups=groups,
+                use_bias=bias,
+            )
+            self.ver_bn = layers.BatchNormalization()
+
+        if self.converted_ks[0] != 1:
+            self.hor_pad = layers.ZeroPadding2D(
+                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
+            )
+            self.hor_conv = layers.Conv2D(
+                filters=out_channels,
+                kernel_size=(1, self.converted_ks[1]),
+                strides=stride,
+                dilation_rate=dilation,
+                groups=groups,
+                use_bias=bias,
+            )
+            self.hor_bn = layers.BatchNormalization()
+
+        self.rbr_identity = layers.BatchNormalization() if out_channels == in_channels and stride == 1 else None
+
+    def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
+        if hasattr(self, "fused_conv"):
+            return self.activation(self.fused_conv(self.conv_pad(x, **kwargs), **kwargs))
+
+        main_outputs = self.bn(self.conv(self.conv_pad(x, **kwargs), **kwargs), **kwargs)
+        vertical_outputs = (
+            self.ver_bn(self.ver_conv(self.ver_pad(x, **kwargs), **kwargs), **kwargs)
+            if self.ver_conv is not None and self.ver_bn is not None
+            else 0
+        )
+        horizontal_outputs = (
+            self.hor_bn(self.hor_conv(self.hor_pad(x, **kwargs), **kwargs), **kwargs)
+            if self.hor_bn is not None and self.hor_conv is not None
+            else 0
+        )
+        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None and self.ver_bn is not None else 0
+
+        return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+
+    # The following logic is used to reparametrize the layer
+    # Adapted from: https://github.com/mindee/doctr/blob/main/doctr/models/modules/layers/pytorch.py
+    def _identity_to_conv(
+        self, identity: layers.BatchNormalization
+    ) -> Union[Tuple[tf.Tensor, tf.Tensor], Tuple[int, int]]:
+        if identity is None or not hasattr(identity, "moving_mean") or not hasattr(identity, "moving_variance"):
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+            id_tensor = tf.constant(kernel_value, dtype=tf.float32)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        std = tf.sqrt(identity.moving_variance + identity.epsilon)
+        t = tf.reshape(identity.gamma / std, (-1, 1, 1, 1))
+        return kernel * t, identity.beta - identity.moving_mean * identity.gamma / std
+
+    def _fuse_bn_tensor(self, conv: layers.Conv2D, bn: layers.BatchNormalization) -> Tuple[tf.Tensor, tf.Tensor]:
+        kernel = conv.kernel
+        kernel = self._pad_to_mxn_tensor(kernel)
+        std = tf.sqrt(bn.moving_variance + bn.epsilon)
+        t = tf.reshape(bn.gamma / std, (1, 1, 1, -1))
+        return kernel * t, bn.beta - bn.moving_mean * bn.gamma / std
+
+    def _get_equivalent_kernel_bias(self):
+        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
+        if self.ver_conv is not None:
+            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)
+        else:
+            kernel_mx1, bias_mx1 = 0, 0
+        if self.hor_conv is not None:
+            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)
+        else:
+            kernel_1xn, bias_1xn = 0, 0
+        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
+        if not isinstance(kernel_id, int):
+            kernel_id = tf.transpose(kernel_id, (2, 3, 0, 1))
+        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
+        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
+        return kernel_mxn, bias_mxn
+
+    def _pad_to_mxn_tensor(self, kernel: tf.Tensor) -> tf.Tensor:
+        kernel_height, kernel_width = self.converted_ks
+        height, width = kernel.shape[2:]
+        pad_left_right = tf.maximum(0, (kernel_width - width) // 2)
+        pad_top_down = tf.maximum(0, (kernel_height - height) // 2)
+        return tf.pad(kernel, [[0, 0], [0, 0], [pad_top_down, pad_top_down], [pad_left_right, pad_left_right]])
+
+    def reparameterize_layer(self):
+        kernel, bias = self._get_equivalent_kernel_bias()
+        self.fused_conv = layers.Conv2D(
+            filters=self.conv.filters,
+            kernel_size=self.conv.kernel_size,
+            strides=self.conv.strides,
+            padding=self.conv.padding,
+            dilation_rate=self.conv.dilation_rate,
+            groups=self.conv.groups,
+            use_bias=True,
+        )
+        # build layer to initialize weights and biases
+        self.fused_conv.build(input_shape=(None, None, None, kernel.shape[-2]))
+        self.fused_conv.set_weights([kernel.numpy(), bias.numpy()])
+        for para in self.trainable_variables:
+            para._trainable = False
+        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
+            if hasattr(self, attr):
+                delattr(self, attr)
+
+        if hasattr(self, "rbr_identity"):
+            delattr(self, "rbr_identity")

doctr/models/modules/transformer/pytorch.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -30,14 +30,17 @@ class PositionalEncoding(nn.Module):
         self.register_buffer("pe", pe.unsqueeze(0))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """
+        """Forward pass
+
         Args:
+        ----
             x: embeddings (batch, max_len, d_model)
 
-        Returns:
+        Returns
+        -------
             positional embeddings (batch, max_len, d_model)
         """
-        x = x + self.pe[:, : x.size(1)]  # type: ignore
+        x = x + self.pe[:, : x.size(1)]
         return self.dropout(x)
 
 
@@ -45,12 +48,11 @@ def scaled_dot_product_attention(
     query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """Scaled Dot-Product Attention"""
-
     scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
     if mask is not None:
         # NOTE: to ensure the ONNX compatibility, masked_fill works only with int equal condition
-        scores = scores.masked_fill(mask == 0, float("-inf"))
-    p_attn = torch.softmax(scores, dim=-1)
+        scores = scores.masked_fill(mask == 0, float("-inf"))  # type: ignore[attr-defined]
+    p_attn = torch.softmax(scores, dim=-1)  # type: ignore[call-overload]
     return torch.matmul(p_attn, value), p_attn
 
 
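
The mask == 0 comparison noted above is what keeps masked_fill ONNX-exportable, so the contract is an integer (or boolean) mask in which 0 marks positions to hide. A minimal, self-contained restatement of the function with a causal mask, for illustration only:

import math

import torch

def scaled_dot_product_attention(query, key, value, mask=None):
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    p_attn = torch.softmax(scores, dim=-1)
    return torch.matmul(p_attn, value), p_attn

q = k = v = torch.rand(2, 4, 8)  # (batch, seq_len, d_model)
causal_mask = torch.tril(torch.ones(4, 4, dtype=torch.int64))  # 0 = future position, masked out
out, attn = scaled_dot_product_attention(q, k, v, mask=causal_mask)

# The first query position can only attend to itself
assert torch.all(attn[:, 0, 1:] == 0)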
121
123
  self.layer_norm_output = nn.LayerNorm(d_model, eps=1e-5)
122
124
  self.dropout = nn.Dropout(dropout)
123
125
 
124
- self.attention = nn.ModuleList(
125
- [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
126
- )
127
- self.position_feed_forward = nn.ModuleList(
128
- [PositionwiseFeedForward(d_model, dff, dropout, activation_fct) for _ in range(self.num_layers)]
129
- )
126
+ self.attention = nn.ModuleList([
127
+ MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)
128
+ ])
129
+ self.position_feed_forward = nn.ModuleList([
130
+ PositionwiseFeedForward(d_model, dff, dropout, activation_fct) for _ in range(self.num_layers)
131
+ ])
130
132
 
131
133
  def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
132
134
  output = x
@@ -167,15 +169,15 @@ class Decoder(nn.Module):
167
169
  self.embed = nn.Embedding(vocab_size, d_model)
168
170
  self.positional_encoding = PositionalEncoding(d_model, dropout, maximum_position_encoding)
169
171
 
170
- self.attention = nn.ModuleList(
171
- [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
172
- )
173
- self.source_attention = nn.ModuleList(
174
- [MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)]
175
- )
176
- self.position_feed_forward = nn.ModuleList(
177
- [PositionwiseFeedForward(d_model, dff, dropout) for _ in range(self.num_layers)]
178
- )
172
+ self.attention = nn.ModuleList([
173
+ MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)
174
+ ])
175
+ self.source_attention = nn.ModuleList([
176
+ MultiHeadAttention(num_heads, d_model, dropout) for _ in range(self.num_layers)
177
+ ])
178
+ self.position_feed_forward = nn.ModuleList([
179
+ PositionwiseFeedForward(d_model, dff, dropout) for _ in range(self.num_layers)
180
+ ])
179
181
 
180
182
  def forward(
181
183
  self,