PyPI - python-doctr - Versions diffs - 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

python-doctr 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

doctr/__init__.py +1 -1
doctr/contrib/__init__.py +0 -0
doctr/contrib/artefacts.py +131 -0
doctr/contrib/base.py +105 -0
doctr/datasets/datasets/pytorch.py +2 -2
doctr/datasets/generator/base.py +6 -5
doctr/datasets/imgur5k.py +1 -1
doctr/datasets/loader.py +1 -6
doctr/datasets/utils.py +2 -1
doctr/datasets/vocabs.py +9 -2
doctr/file_utils.py +26 -12
doctr/io/elements.py +40 -6
doctr/io/html.py +2 -2
doctr/io/image/pytorch.py +6 -8
doctr/io/image/tensorflow.py +1 -1
doctr/io/pdf.py +5 -2
doctr/io/reader.py +6 -0
doctr/models/__init__.py +0 -1
doctr/models/_utils.py +57 -20
doctr/models/builder.py +71 -13
doctr/models/classification/mobilenet/pytorch.py +45 -9
doctr/models/classification/mobilenet/tensorflow.py +38 -7
doctr/models/classification/predictor/pytorch.py +18 -11
doctr/models/classification/predictor/tensorflow.py +16 -10
doctr/models/classification/textnet/pytorch.py +3 -3
doctr/models/classification/textnet/tensorflow.py +3 -3
doctr/models/classification/zoo.py +39 -15
doctr/models/detection/__init__.py +1 -0
doctr/models/detection/_utils/__init__.py +1 -0
doctr/models/detection/_utils/base.py +66 -0
doctr/models/detection/differentiable_binarization/base.py +4 -3
doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
doctr/models/detection/differentiable_binarization/tensorflow.py +14 -18
doctr/models/detection/fast/__init__.py +6 -0
doctr/models/detection/fast/base.py +257 -0
doctr/models/detection/fast/pytorch.py +442 -0
doctr/models/detection/fast/tensorflow.py +428 -0
doctr/models/detection/linknet/base.py +4 -3
doctr/models/detection/predictor/pytorch.py +15 -1
doctr/models/detection/predictor/tensorflow.py +15 -1
doctr/models/detection/zoo.py +21 -4
doctr/models/factory/hub.py +3 -12
doctr/models/kie_predictor/base.py +9 -3
doctr/models/kie_predictor/pytorch.py +41 -20
doctr/models/kie_predictor/tensorflow.py +36 -16
doctr/models/modules/layers/pytorch.py +89 -10
doctr/models/modules/layers/tensorflow.py +88 -10
doctr/models/modules/transformer/pytorch.py +2 -2
doctr/models/predictor/base.py +77 -50
doctr/models/predictor/pytorch.py +31 -20
doctr/models/predictor/tensorflow.py +27 -17
doctr/models/preprocessor/pytorch.py +4 -4
doctr/models/preprocessor/tensorflow.py +3 -2
doctr/models/recognition/master/pytorch.py +2 -2
doctr/models/recognition/parseq/pytorch.py +4 -3
doctr/models/recognition/parseq/tensorflow.py +4 -3
doctr/models/recognition/sar/pytorch.py +7 -6
doctr/models/recognition/sar/tensorflow.py +3 -9
doctr/models/recognition/vitstr/pytorch.py +1 -1
doctr/models/recognition/zoo.py +1 -1
doctr/models/zoo.py +2 -2
doctr/py.typed +0 -0
doctr/transforms/functional/base.py +1 -1
doctr/transforms/functional/pytorch.py +4 -4
doctr/transforms/modules/base.py +37 -15
doctr/transforms/modules/pytorch.py +66 -8
doctr/transforms/modules/tensorflow.py +63 -7
doctr/utils/fonts.py +7 -5
doctr/utils/geometry.py +35 -12
doctr/utils/metrics.py +33 -174
doctr/utils/reconstitution.py +126 -0
doctr/utils/visualization.py +5 -118
doctr/version.py +1 -1
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/METADATA +96 -91
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/RECORD +79 -75
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/WHEEL +1 -1
doctr/models/artefacts/__init__.py +0 -2
doctr/models/artefacts/barcode.py +0 -74
doctr/models/artefacts/face.py +0 -63
doctr/models/obj_detection/__init__.py +0 -1
doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/LICENSE +0 -0
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/top_level.txt +0 -0
{python_doctr-0.8.0.dist-info → python_doctr-0.9.0.dist-info}/zip-safe +0 -0

doctr/models/kie_predictor/pytorch.py CHANGED Viewed

@@ -10,10 +10,10 @@ import torch
 from torch import nn
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_image
+from doctr.utils.geometry import detach_scores
 from .base import _KIEPredictor
@@ -55,7 +55,13 @@ class KIEPredictor(nn.Module, _KIEPredictor):
         self.det_predictor = det_predictor.eval()  # type: ignore[attr-defined]
         self.reco_predictor = reco_predictor.eval()  # type: ignore[attr-defined]
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,29 +89,31 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)  # type: ignore[arg-type]
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)  # type: ignore
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore[assignment]
+        # Detach objectness scores from loc_preds
+        objectness_scores = {}
+        for class_name, det_preds in dict_loc_preds.items():
+            _loc_preds, _scores = detach_scores(det_preds)
+            dict_loc_preds[class_name] = _loc_preds
+            objectness_scores[class_name] = _scores
         # Check whether crop mode should be switched to channels first
         channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray)
-        # Rectify crops if aspect ratio
-        dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
             dict_loc_preds = hook(dict_loc_preds)
@@ -114,32 +122,43 @@ class KIEPredictor(nn.Module, _KIEPredictor):
         crops = {}
         for class_name in dict_loc_preds.keys():
             crops[class_name], dict_loc_preds[class_name] = self._prepare_crops(
-                pages,
+                pages,  # type: ignore[arg-type]
                 dict_loc_preds[class_name],
                 channels_last=channels_last,
                 assume_straight_pages=self.assume_straight_pages,
             )
         # Rectify crop orientation
+        crop_orientations: Any = {}
         if not self.assume_straight_pages:
             for class_name in dict_loc_preds.keys():
-                crops[class_name], dict_loc_preds[class_name] = self._rectify_crops(
+                crops[class_name], dict_loc_preds[class_name], word_orientations = self._rectify_crops(
                     crops[class_name], dict_loc_preds[class_name]
                 )
+                crop_orientations[class_name] = [
+                    {"value": orientation[0], "confidence": orientation[1]} for orientation in word_orientations
+                ]
         # Identify character sequences
         word_preds = {
             k: self.reco_predictor([crop for page_crops in crop_value for crop in page_crops], **kwargs)
             for k, crop_value in crops.items()
         }
+        if not crop_orientations:
+            crop_orientations = {k: [{"value": 0, "confidence": None} for _ in word_preds[k]] for k in word_preds}
         boxes: Dict = {}
         text_preds: Dict = {}
+        word_crop_orientations: Dict = {}
         for class_name in dict_loc_preds.keys():
-            boxes[class_name], text_preds[class_name] = self._process_predictions(
-                dict_loc_preds[class_name], word_preds[class_name]
+            boxes[class_name], text_preds[class_name], word_crop_orientations[class_name] = self._process_predictions(
+                dict_loc_preds[class_name], word_preds[class_name], crop_orientations[class_name]
             )
         boxes_per_page: List[Dict] = invert_data_structure(boxes)  # type: ignore[assignment]
+        objectness_scores_per_page: List[Dict] = invert_data_structure(objectness_scores)  # type: ignore[assignment]
         text_preds_per_page: List[Dict] = invert_data_structure(text_preds)  # type: ignore[assignment]
+        crop_orientations_per_page: List[Dict] = invert_data_structure(word_crop_orientations)  # type: ignore[assignment]
         if self.detect_language:
             languages = [get_language(self.get_text(text_pred)) for text_pred in text_preds_per_page]
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
@@ -147,10 +166,12 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             languages_dict = None
         out = self.doc_builder(
-            pages,
+            pages,  # type: ignore[arg-type]
             boxes_per_page,
+            objectness_scores_per_page,
             text_preds_per_page,
-            origin_page_shapes,
+            origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations_per_page,
             orientations,
             languages_dict,
         )

doctr/models/kie_predictor/tensorflow.py CHANGED Viewed

@@ -9,10 +9,10 @@ import numpy as np
 import tensorflow as tf
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_image
+from doctr.utils.geometry import detach_scores
 from doctr.utils.repr import NestedObject
 from .base import _KIEPredictor
@@ -56,7 +56,13 @@ class KIEPredictor(NestedObject, _KIEPredictor):
         self.det_predictor = det_predictor
         self.reco_predictor = reco_predictor
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,25 +89,27 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore
-        # Rectify crops if aspect ratio
-        dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
+        # Detach objectness scores from loc_preds
+        objectness_scores = {}
+        for class_name, det_preds in dict_loc_preds.items():
+            _loc_preds, _scores = detach_scores(det_preds)
+            dict_loc_preds[class_name] = _loc_preds
+            objectness_scores[class_name] = _scores
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
@@ -113,28 +121,38 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             crops[class_name], dict_loc_preds[class_name] = self._prepare_crops(
                 pages, dict_loc_preds[class_name], channels_last=True, assume_straight_pages=self.assume_straight_pages
             )
         # Rectify crop orientation
+        crop_orientations: Any = {}
         if not self.assume_straight_pages:
             for class_name in dict_loc_preds.keys():
-                crops[class_name], dict_loc_preds[class_name] = self._rectify_crops(
+                crops[class_name], dict_loc_preds[class_name], word_orientations = self._rectify_crops(
                     crops[class_name], dict_loc_preds[class_name]
                 )
+                crop_orientations[class_name] = [
+                    {"value": orientation[0], "confidence": orientation[1]} for orientation in word_orientations
+                ]
         # Identify character sequences
         word_preds = {
             k: self.reco_predictor([crop for page_crops in crop_value for crop in page_crops], **kwargs)
             for k, crop_value in crops.items()
         }
+        if not crop_orientations:
+            crop_orientations = {k: [{"value": 0, "confidence": None} for _ in word_preds[k]] for k in word_preds}
         boxes: Dict = {}
         text_preds: Dict = {}
+        word_crop_orientations: Dict = {}
         for class_name in dict_loc_preds.keys():
-            boxes[class_name], text_preds[class_name] = self._process_predictions(
-                dict_loc_preds[class_name], word_preds[class_name]
+            boxes[class_name], text_preds[class_name], word_crop_orientations[class_name] = self._process_predictions(
+                dict_loc_preds[class_name], word_preds[class_name], crop_orientations[class_name]
             )
         boxes_per_page: List[Dict] = invert_data_structure(boxes)  # type: ignore[assignment]
+        objectness_scores_per_page: List[Dict] = invert_data_structure(objectness_scores)  # type: ignore[assignment]
         text_preds_per_page: List[Dict] = invert_data_structure(text_preds)  # type: ignore[assignment]
+        crop_orientations_per_page: List[Dict] = invert_data_structure(word_crop_orientations)  # type: ignore[assignment]
         if self.detect_language:
             languages = [get_language(self.get_text(text_pred)) for text_pred in text_preds_per_page]
@@ -145,8 +163,10 @@ class KIEPredictor(NestedObject, _KIEPredictor):
         out = self.doc_builder(
             pages,
             boxes_per_page,
+            objectness_scores_per_page,
             text_preds_per_page,
             origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations_per_page,
             orientations,
             languages_dict,
         )

doctr/models/modules/layers/pytorch.py CHANGED Viewed

@@ -5,6 +5,7 @@
 from typing import Tuple, Union
+import numpy as np
 import torch
 import torch.nn as nn
@@ -26,18 +27,20 @@ class FASTConvLayer(nn.Module):
     ) -> None:
         super().__init__()
-        converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        self.groups = groups
+        self.in_channels = in_channels
+        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
         self.hor_conv, self.hor_bn = None, None
         self.ver_conv, self.ver_bn = None, None
-        padding = (int(((converted_ks[0] - 1) * dilation) / 2), int(((converted_ks[1] - 1) * dilation) / 2))
+        padding = (int(((self.converted_ks[0] - 1) * dilation) / 2), int(((self.converted_ks[1] - 1) * dilation) / 2))
         self.activation = nn.ReLU(inplace=True)
         self.conv = nn.Conv2d(
             in_channels,
             out_channels,
-            kernel_size=converted_ks,
+            kernel_size=self.converted_ks,
             stride=stride,
             padding=padding,
             dilation=dilation,
@@ -47,12 +50,12 @@ class FASTConvLayer(nn.Module):
         self.bn = nn.BatchNorm2d(out_channels)
-        if converted_ks[1] != 1:
+        if self.converted_ks[1] != 1:
             self.ver_conv = nn.Conv2d(
                 in_channels,
                 out_channels,
-                kernel_size=(converted_ks[0], 1),
-                padding=(int(((converted_ks[0] - 1) * dilation) / 2), 0),
+                kernel_size=(self.converted_ks[0], 1),
+                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
                 stride=stride,
                 dilation=dilation,
                 groups=groups,
@@ -60,12 +63,12 @@ class FASTConvLayer(nn.Module):
             )
             self.ver_bn = nn.BatchNorm2d(out_channels)
-        if converted_ks[0] != 1:
+        if self.converted_ks[0] != 1:
             self.hor_conv = nn.Conv2d(
                 in_channels,
                 out_channels,
-                kernel_size=(1, converted_ks[1]),
-                padding=(0, int(((converted_ks[1] - 1) * dilation) / 2)),
+                kernel_size=(1, self.converted_ks[1]),
+                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
                 stride=stride,
                 dilation=dilation,
                 groups=groups,
@@ -76,11 +79,87 @@ class FASTConvLayer(nn.Module):
         self.rbr_identity = nn.BatchNorm2d(in_channels) if out_channels == in_channels and stride == 1 else None
     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if hasattr(self, "fused_conv"):
+            return self.activation(self.fused_conv(x))
         main_outputs = self.bn(self.conv(x))
         vertical_outputs = self.ver_bn(self.ver_conv(x)) if self.ver_conv is not None and self.ver_bn is not None else 0
         horizontal_outputs = (
             self.hor_bn(self.hor_conv(x)) if self.hor_bn is not None and self.hor_conv is not None else 0
         )
-        id_out = self.rbr_identity(x) if self.rbr_identity is not None and self.ver_bn is not None else 0
+        id_out = self.rbr_identity(x) if self.rbr_identity is not None else 0
         return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+    # The following logic is used to reparametrize the layer
+    # Borrowed from: https://github.com/czczup/FAST/blob/main/models/utils/nas_utils.py
+    def _identity_to_conv(
+        self, identity: Union[nn.BatchNorm2d, None]
+    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[int, int]]:
+        if identity is None or identity.running_var is None:
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+            id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        std = (identity.running_var + identity.eps).sqrt()
+        t = (identity.weight / std).reshape(-1, 1, 1, 1)
+        return kernel * t, identity.bias - identity.running_mean * identity.weight / std
+    def _fuse_bn_tensor(self, conv: nn.Conv2d, bn: nn.BatchNorm2d) -> Tuple[torch.Tensor, torch.Tensor]:
+        kernel = conv.weight
+        kernel = self._pad_to_mxn_tensor(kernel)
+        std = (bn.running_var + bn.eps).sqrt()  # type: ignore
+        t = (bn.weight / std).reshape(-1, 1, 1, 1)
+        return kernel * t, bn.bias - bn.running_mean * bn.weight / std
+    def _get_equivalent_kernel_bias(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
+        if self.ver_conv is not None:
+            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)  # type: ignore[arg-type]
+        else:
+            kernel_mx1, bias_mx1 = 0, 0  # type: ignore[assignment]
+        if self.hor_conv is not None:
+            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)  # type: ignore[arg-type]
+        else:
+            kernel_1xn, bias_1xn = 0, 0  # type: ignore[assignment]
+        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
+        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
+        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
+        return kernel_mxn, bias_mxn
+    def _pad_to_mxn_tensor(self, kernel: torch.Tensor) -> torch.Tensor:
+        kernel_height, kernel_width = self.converted_ks
+        height, width = kernel.shape[2:]
+        pad_left_right = (kernel_width - width) // 2
+        pad_top_down = (kernel_height - height) // 2
+        return torch.nn.functional.pad(kernel, [pad_left_right, pad_left_right, pad_top_down, pad_top_down], value=0)
+    def reparameterize_layer(self):
+        if hasattr(self, "fused_conv"):
+            return
+        kernel, bias = self._get_equivalent_kernel_bias()
+        self.fused_conv = nn.Conv2d(
+            in_channels=self.conv.in_channels,
+            out_channels=self.conv.out_channels,
+            kernel_size=self.conv.kernel_size,  # type: ignore[arg-type]
+            stride=self.conv.stride,  # type: ignore[arg-type]
+            padding=self.conv.padding,  # type: ignore[arg-type]
+            dilation=self.conv.dilation,  # type: ignore[arg-type]
+            groups=self.conv.groups,
+            bias=True,
+        )
+        self.fused_conv.weight.data = kernel
+        self.fused_conv.bias.data = bias  # type: ignore[union-attr]
+        for para in self.parameters():
+            para.detach_()
+        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
+            if hasattr(self, attr):
+                self.__delattr__(attr)
+        if hasattr(self, "rbr_identity"):
+            self.__delattr__("rbr_identity")

doctr/models/modules/layers/tensorflow.py CHANGED Viewed

@@ -5,6 +5,7 @@
 from typing import Any, Tuple, Union
+import numpy as np
 import tensorflow as tf
 from tensorflow.keras import layers
@@ -28,18 +29,21 @@ class FASTConvLayer(layers.Layer, NestedObject):
     ) -> None:
         super().__init__()
-        converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        self.groups = groups
+        self.in_channels = in_channels
+        self.converted_ks = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
         self.hor_conv, self.hor_bn = None, None
         self.ver_conv, self.ver_bn = None, None
-        padding = ((converted_ks[0] - 1) * dilation // 2, (converted_ks[1] - 1) * dilation // 2)
+        padding = ((self.converted_ks[0] - 1) * dilation // 2, (self.converted_ks[1] - 1) * dilation // 2)
         self.activation = layers.ReLU()
         self.conv_pad = layers.ZeroPadding2D(padding=padding)
         self.conv = layers.Conv2D(
             filters=out_channels,
-            kernel_size=converted_ks,
+            kernel_size=self.converted_ks,
             strides=stride,
             dilation_rate=dilation,
             groups=groups,
@@ -48,13 +52,13 @@ class FASTConvLayer(layers.Layer, NestedObject):
         self.bn = layers.BatchNormalization()
-        if converted_ks[1] != 1:
+        if self.converted_ks[1] != 1:
             self.ver_pad = layers.ZeroPadding2D(
-                padding=(int(((converted_ks[0] - 1) * dilation) / 2), 0),
+                padding=(int(((self.converted_ks[0] - 1) * dilation) / 2), 0),
             )
             self.ver_conv = layers.Conv2D(
                 filters=out_channels,
-                kernel_size=(converted_ks[0], 1),
+                kernel_size=(self.converted_ks[0], 1),
                 strides=stride,
                 dilation_rate=dilation,
                 groups=groups,
@@ -62,13 +66,13 @@ class FASTConvLayer(layers.Layer, NestedObject):
             )
             self.ver_bn = layers.BatchNormalization()
-        if converted_ks[0] != 1:
+        if self.converted_ks[0] != 1:
             self.hor_pad = layers.ZeroPadding2D(
-                padding=(0, int(((converted_ks[1] - 1) * dilation) / 2)),
+                padding=(0, int(((self.converted_ks[1] - 1) * dilation) / 2)),
             )
             self.hor_conv = layers.Conv2D(
                 filters=out_channels,
-                kernel_size=(1, converted_ks[1]),
+                kernel_size=(1, self.converted_ks[1]),
                 strides=stride,
                 dilation_rate=dilation,
                 groups=groups,
@@ -79,6 +83,9 @@ class FASTConvLayer(layers.Layer, NestedObject):
         self.rbr_identity = layers.BatchNormalization() if out_channels == in_channels and stride == 1 else None
     def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
+        if hasattr(self, "fused_conv"):
+            return self.activation(self.fused_conv(self.conv_pad(x, **kwargs), **kwargs))
         main_outputs = self.bn(self.conv(self.conv_pad(x, **kwargs), **kwargs), **kwargs)
         vertical_outputs = (
             self.ver_bn(self.ver_conv(self.ver_pad(x, **kwargs), **kwargs), **kwargs)
@@ -90,6 +97,77 @@ class FASTConvLayer(layers.Layer, NestedObject):
             if self.hor_bn is not None and self.hor_conv is not None
             else 0
         )
-        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None and self.ver_bn is not None else 0
+        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None else 0
         return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
+    # The following logic is used to reparametrize the layer
+    # Adapted from: https://github.com/mindee/doctr/blob/main/doctr/models/modules/layers/pytorch.py
+    def _identity_to_conv(
+        self, identity: layers.BatchNormalization
+    ) -> Union[Tuple[tf.Tensor, tf.Tensor], Tuple[int, int]]:
+        if identity is None or not hasattr(identity, "moving_mean") or not hasattr(identity, "moving_variance"):
+            return 0, 0
+        if not hasattr(self, "id_tensor"):
+            input_dim = self.in_channels // self.groups
+            kernel_value = np.zeros((1, 1, input_dim, self.in_channels), dtype=np.float32)
+            for i in range(self.in_channels):
+                kernel_value[0, 0, i % input_dim, i] = 1
+            id_tensor = tf.constant(kernel_value, dtype=tf.float32)
+            self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
+        kernel = self.id_tensor
+        std = tf.sqrt(identity.moving_variance + identity.epsilon)
+        t = tf.reshape(identity.gamma / std, (1, 1, 1, -1))
+        return kernel * t, identity.beta - identity.moving_mean * identity.gamma / std
+    def _fuse_bn_tensor(self, conv: layers.Conv2D, bn: layers.BatchNormalization) -> Tuple[tf.Tensor, tf.Tensor]:
+        kernel = conv.kernel
+        kernel = self._pad_to_mxn_tensor(kernel)
+        std = tf.sqrt(bn.moving_variance + bn.epsilon)
+        t = tf.reshape(bn.gamma / std, (1, 1, 1, -1))
+        return kernel * t, bn.beta - bn.moving_mean * bn.gamma / std
+    def _get_equivalent_kernel_bias(self):
+        kernel_mxn, bias_mxn = self._fuse_bn_tensor(self.conv, self.bn)
+        if self.ver_conv is not None:
+            kernel_mx1, bias_mx1 = self._fuse_bn_tensor(self.ver_conv, self.ver_bn)
+        else:
+            kernel_mx1, bias_mx1 = 0, 0
+        if self.hor_conv is not None:
+            kernel_1xn, bias_1xn = self._fuse_bn_tensor(self.hor_conv, self.hor_bn)
+        else:
+            kernel_1xn, bias_1xn = 0, 0
+        kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
+        kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
+        bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
+        return kernel_mxn, bias_mxn
+    def _pad_to_mxn_tensor(self, kernel: tf.Tensor) -> tf.Tensor:
+        kernel_height, kernel_width = self.converted_ks
+        height, width = kernel.shape[:2]
+        pad_left_right = tf.maximum(0, (kernel_width - width) // 2)
+        pad_top_down = tf.maximum(0, (kernel_height - height) // 2)
+        return tf.pad(kernel, [[pad_top_down, pad_top_down], [pad_left_right, pad_left_right], [0, 0], [0, 0]])
+    def reparameterize_layer(self):
+        kernel, bias = self._get_equivalent_kernel_bias()
+        self.fused_conv = layers.Conv2D(
+            filters=self.conv.filters,
+            kernel_size=self.conv.kernel_size,
+            strides=self.conv.strides,
+            padding=self.conv.padding,
+            dilation_rate=self.conv.dilation_rate,
+            groups=self.conv.groups,
+            use_bias=True,
+        )
+        # build layer to initialize weights and biases
+        self.fused_conv.build(input_shape=(None, None, None, kernel.shape[-2]))
+        self.fused_conv.set_weights([kernel.numpy(), bias.numpy()])
+        for para in self.trainable_variables:
+            para._trainable = False
+        for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:
+            if hasattr(self, attr):
+                delattr(self, attr)
+        if hasattr(self, "rbr_identity"):
+            delattr(self, "rbr_identity")

doctr/models/modules/transformer/pytorch.py CHANGED Viewed

@@ -51,8 +51,8 @@ def scaled_dot_product_attention(
     scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
     if mask is not None:
         # NOTE: to ensure the ONNX compatibility, masked_fill works only with int equal condition
-        scores = scores.masked_fill(mask == 0, float("-inf"))  # type: ignore[attr-defined]
-    p_attn = torch.softmax(scores, dim=-1)  # type: ignore[call-overload]
+        scores = scores.masked_fill(mask == 0, float("-inf"))
+    p_attn = torch.softmax(scores, dim=-1)
     return torch.matmul(p_attn, value), p_attn

python-doctr 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

python-doctr 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl