PyPI - python-doctr - Versions diffs - 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (107) hide show

doctr/__init__.py +1 -1
doctr/contrib/__init__.py +0 -0
doctr/contrib/artefacts.py +131 -0
doctr/contrib/base.py +105 -0
doctr/datasets/cord.py +10 -1
doctr/datasets/datasets/pytorch.py +2 -2
doctr/datasets/funsd.py +11 -1
doctr/datasets/generator/base.py +6 -5
doctr/datasets/ic03.py +11 -1
doctr/datasets/ic13.py +10 -1
doctr/datasets/iiit5k.py +26 -16
doctr/datasets/imgur5k.py +11 -2
doctr/datasets/loader.py +1 -6
doctr/datasets/sroie.py +11 -1
doctr/datasets/svhn.py +11 -1
doctr/datasets/svt.py +11 -1
doctr/datasets/synthtext.py +11 -1
doctr/datasets/utils.py +9 -3
doctr/datasets/vocabs.py +15 -4
doctr/datasets/wildreceipt.py +12 -1
doctr/file_utils.py +45 -12
doctr/io/elements.py +52 -10
doctr/io/html.py +2 -2
doctr/io/image/pytorch.py +6 -8
doctr/io/image/tensorflow.py +1 -1
doctr/io/pdf.py +5 -2
doctr/io/reader.py +6 -0
doctr/models/__init__.py +0 -1
doctr/models/_utils.py +57 -20
doctr/models/builder.py +73 -15
doctr/models/classification/magc_resnet/tensorflow.py +13 -6
doctr/models/classification/mobilenet/pytorch.py +47 -9
doctr/models/classification/mobilenet/tensorflow.py +51 -14
doctr/models/classification/predictor/pytorch.py +28 -17
doctr/models/classification/predictor/tensorflow.py +26 -16
doctr/models/classification/resnet/tensorflow.py +21 -8
doctr/models/classification/textnet/pytorch.py +3 -3
doctr/models/classification/textnet/tensorflow.py +11 -5
doctr/models/classification/vgg/tensorflow.py +9 -3
doctr/models/classification/vit/tensorflow.py +10 -4
doctr/models/classification/zoo.py +55 -19
doctr/models/detection/_utils/__init__.py +1 -0
doctr/models/detection/_utils/base.py +66 -0
doctr/models/detection/differentiable_binarization/base.py +4 -3
doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
doctr/models/detection/fast/base.py +6 -5
doctr/models/detection/fast/pytorch.py +4 -4
doctr/models/detection/fast/tensorflow.py +15 -12
doctr/models/detection/linknet/base.py +4 -3
doctr/models/detection/linknet/tensorflow.py +23 -11
doctr/models/detection/predictor/pytorch.py +15 -1
doctr/models/detection/predictor/tensorflow.py +17 -3
doctr/models/detection/zoo.py +7 -2
doctr/models/factory/hub.py +8 -18
doctr/models/kie_predictor/base.py +13 -3
doctr/models/kie_predictor/pytorch.py +45 -20
doctr/models/kie_predictor/tensorflow.py +44 -17
doctr/models/modules/layers/pytorch.py +2 -3
doctr/models/modules/layers/tensorflow.py +6 -8
doctr/models/modules/transformer/pytorch.py +2 -2
doctr/models/modules/transformer/tensorflow.py +0 -2
doctr/models/modules/vision_transformer/pytorch.py +1 -1
doctr/models/modules/vision_transformer/tensorflow.py +1 -1
doctr/models/predictor/base.py +97 -58
doctr/models/predictor/pytorch.py +35 -20
doctr/models/predictor/tensorflow.py +35 -18
doctr/models/preprocessor/pytorch.py +4 -4
doctr/models/preprocessor/tensorflow.py +3 -2
doctr/models/recognition/crnn/tensorflow.py +8 -6
doctr/models/recognition/master/pytorch.py +2 -2
doctr/models/recognition/master/tensorflow.py +9 -4
doctr/models/recognition/parseq/pytorch.py +4 -3
doctr/models/recognition/parseq/tensorflow.py +14 -11
doctr/models/recognition/sar/pytorch.py +7 -6
doctr/models/recognition/sar/tensorflow.py +10 -12
doctr/models/recognition/vitstr/pytorch.py +1 -1
doctr/models/recognition/vitstr/tensorflow.py +9 -4
doctr/models/recognition/zoo.py +1 -1
doctr/models/utils/pytorch.py +1 -1
doctr/models/utils/tensorflow.py +15 -15
doctr/models/zoo.py +2 -2
doctr/py.typed +0 -0
doctr/transforms/functional/base.py +1 -1
doctr/transforms/functional/pytorch.py +5 -5
doctr/transforms/modules/base.py +37 -15
doctr/transforms/modules/pytorch.py +73 -14
doctr/transforms/modules/tensorflow.py +78 -19
doctr/utils/fonts.py +7 -5
doctr/utils/geometry.py +141 -31
doctr/utils/metrics.py +34 -175
doctr/utils/reconstitution.py +212 -0
doctr/utils/visualization.py +5 -118
doctr/version.py +1 -1
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
python_doctr-0.10.0.dist-info/RECORD +173 -0
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
doctr/models/artefacts/__init__.py +0 -2
doctr/models/artefacts/barcode.py +0 -74
doctr/models/artefacts/face.py +0 -63
doctr/models/obj_detection/__init__.py +0 -1
doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
python_doctr-0.8.1.dist-info/RECORD +0 -173
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
{python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0

doctr/models/kie_predictor/pytorch.py CHANGED Viewed

@@ -10,10 +10,10 @@ import torch
 from torch import nn
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_image
+from doctr.utils.geometry import detach_scores
 from .base import _KIEPredictor
@@ -55,7 +55,13 @@ class KIEPredictor(nn.Module, _KIEPredictor):
         self.det_predictor = det_predictor.eval()  # type: ignore[attr-defined]
         self.reco_predictor = reco_predictor.eval()  # type: ignore[attr-defined]
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,29 +89,34 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)  # type: ignore[arg-type]
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)  # type: ignore
+            # update page shapes after straightening
+            origin_page_shapes = [page.shape[:2] for page in pages]
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore[assignment]
+        # Detach objectness scores from loc_preds
+        objectness_scores = {}
+        for class_name, det_preds in dict_loc_preds.items():
+            _loc_preds, _scores = detach_scores(det_preds)
+            dict_loc_preds[class_name] = _loc_preds
+            objectness_scores[class_name] = _scores
         # Check whether crop mode should be switched to channels first
         channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray)
-        # Rectify crops if aspect ratio
-        dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
             dict_loc_preds = hook(dict_loc_preds)
@@ -114,32 +125,44 @@ class KIEPredictor(nn.Module, _KIEPredictor):
         crops = {}
         for class_name in dict_loc_preds.keys():
             crops[class_name], dict_loc_preds[class_name] = self._prepare_crops(
-                pages,
+                pages,  # type: ignore[arg-type]
                 dict_loc_preds[class_name],
                 channels_last=channels_last,
                 assume_straight_pages=self.assume_straight_pages,
+                assume_horizontal=self._page_orientation_disabled,
             )
         # Rectify crop orientation
+        crop_orientations: Any = {}
         if not self.assume_straight_pages:
             for class_name in dict_loc_preds.keys():
-                crops[class_name], dict_loc_preds[class_name] = self._rectify_crops(
+                crops[class_name], dict_loc_preds[class_name], word_orientations = self._rectify_crops(
                     crops[class_name], dict_loc_preds[class_name]
                 )
+                crop_orientations[class_name] = [
+                    {"value": orientation[0], "confidence": orientation[1]} for orientation in word_orientations
+                ]
         # Identify character sequences
         word_preds = {
             k: self.reco_predictor([crop for page_crops in crop_value for crop in page_crops], **kwargs)
             for k, crop_value in crops.items()
         }
+        if not crop_orientations:
+            crop_orientations = {k: [{"value": 0, "confidence": None} for _ in word_preds[k]] for k in word_preds}
         boxes: Dict = {}
         text_preds: Dict = {}
+        word_crop_orientations: Dict = {}
         for class_name in dict_loc_preds.keys():
-            boxes[class_name], text_preds[class_name] = self._process_predictions(
-                dict_loc_preds[class_name], word_preds[class_name]
+            boxes[class_name], text_preds[class_name], word_crop_orientations[class_name] = self._process_predictions(
+                dict_loc_preds[class_name], word_preds[class_name], crop_orientations[class_name]
             )
         boxes_per_page: List[Dict] = invert_data_structure(boxes)  # type: ignore[assignment]
+        objectness_scores_per_page: List[Dict] = invert_data_structure(objectness_scores)  # type: ignore[assignment]
         text_preds_per_page: List[Dict] = invert_data_structure(text_preds)  # type: ignore[assignment]
+        crop_orientations_per_page: List[Dict] = invert_data_structure(word_crop_orientations)  # type: ignore[assignment]
         if self.detect_language:
             languages = [get_language(self.get_text(text_pred)) for text_pred in text_preds_per_page]
             languages_dict = [{"value": lang[0], "confidence": lang[1]} for lang in languages]
@@ -147,10 +170,12 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             languages_dict = None
         out = self.doc_builder(
-            pages,
+            pages,  # type: ignore[arg-type]
             boxes_per_page,
+            objectness_scores_per_page,
             text_preds_per_page,
-            origin_page_shapes,
+            origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations_per_page,
             orientations,
             languages_dict,
         )

doctr/models/kie_predictor/tensorflow.py CHANGED Viewed

@@ -9,10 +9,10 @@ import numpy as np
 import tensorflow as tf
 from doctr.io.elements import Document
-from doctr.models._utils import estimate_orientation, get_language, invert_data_structure
+from doctr.models._utils import get_language, invert_data_structure
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import rotate_image
+from doctr.utils.geometry import detach_scores
 from doctr.utils.repr import NestedObject
 from .base import _KIEPredictor
@@ -56,7 +56,13 @@ class KIEPredictor(NestedObject, _KIEPredictor):
         self.det_predictor = det_predictor
         self.reco_predictor = reco_predictor
         _KIEPredictor.__init__(
-            self, assume_straight_pages, straighten_pages, preserve_aspect_ratio, symmetric_pad, **kwargs
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -83,25 +89,30 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             for out_map in out_maps
         ]
         if self.detect_orientation:
-            origin_page_orientations = [estimate_orientation(seq_map) for seq_map in seg_maps]
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in origin_page_orientations
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-            origin_page_orientations = (
-                origin_page_orientations
-                if self.detect_orientation
-                else [estimate_orientation(seq_map) for seq_map in seg_maps]
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
+            # update page shapes after straightening
+            origin_page_shapes = [page.shape[:2] for page in pages]
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
         dict_loc_preds: Dict[str, List[np.ndarray]] = invert_data_structure(loc_preds)  # type: ignore
-        # Rectify crops if aspect ratio
-        dict_loc_preds = {k: self._remove_padding(pages, loc_pred) for k, loc_pred in dict_loc_preds.items()}
+        # Detach objectness scores from loc_preds
+        objectness_scores = {}
+        for class_name, det_preds in dict_loc_preds.items():
+            _loc_preds, _scores = detach_scores(det_preds)
+            dict_loc_preds[class_name] = _loc_preds
+            objectness_scores[class_name] = _scores
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
@@ -111,30 +122,44 @@ class KIEPredictor(NestedObject, _KIEPredictor):
         crops = {}
         for class_name in dict_loc_preds.keys():
             crops[class_name], dict_loc_preds[class_name] = self._prepare_crops(
-                pages, dict_loc_preds[class_name], channels_last=True, assume_straight_pages=self.assume_straight_pages
+                pages,
+                dict_loc_preds[class_name],
+                channels_last=True,
+                assume_straight_pages=self.assume_straight_pages,
+                assume_horizontal=self._page_orientation_disabled,
             )
         # Rectify crop orientation
+        crop_orientations: Any = {}
         if not self.assume_straight_pages:
             for class_name in dict_loc_preds.keys():
-                crops[class_name], dict_loc_preds[class_name] = self._rectify_crops(
+                crops[class_name], dict_loc_preds[class_name], word_orientations = self._rectify_crops(
                     crops[class_name], dict_loc_preds[class_name]
                 )
+                crop_orientations[class_name] = [
+                    {"value": orientation[0], "confidence": orientation[1]} for orientation in word_orientations
+                ]
         # Identify character sequences
         word_preds = {
             k: self.reco_predictor([crop for page_crops in crop_value for crop in page_crops], **kwargs)
             for k, crop_value in crops.items()
         }
+        if not crop_orientations:
+            crop_orientations = {k: [{"value": 0, "confidence": None} for _ in word_preds[k]] for k in word_preds}
         boxes: Dict = {}
         text_preds: Dict = {}
+        word_crop_orientations: Dict = {}
         for class_name in dict_loc_preds.keys():
-            boxes[class_name], text_preds[class_name] = self._process_predictions(
-                dict_loc_preds[class_name], word_preds[class_name]
+            boxes[class_name], text_preds[class_name], word_crop_orientations[class_name] = self._process_predictions(
+                dict_loc_preds[class_name], word_preds[class_name], crop_orientations[class_name]
             )
         boxes_per_page: List[Dict] = invert_data_structure(boxes)  # type: ignore[assignment]
+        objectness_scores_per_page: List[Dict] = invert_data_structure(objectness_scores)  # type: ignore[assignment]
         text_preds_per_page: List[Dict] = invert_data_structure(text_preds)  # type: ignore[assignment]
+        crop_orientations_per_page: List[Dict] = invert_data_structure(word_crop_orientations)  # type: ignore[assignment]
         if self.detect_language:
             languages = [get_language(self.get_text(text_pred)) for text_pred in text_preds_per_page]
@@ -145,8 +170,10 @@ class KIEPredictor(NestedObject, _KIEPredictor):
         out = self.doc_builder(
             pages,
             boxes_per_page,
+            objectness_scores_per_page,
             text_preds_per_page,
             origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations_per_page,
             orientations,
             languages_dict,
         )

doctr/models/modules/layers/pytorch.py CHANGED Viewed

@@ -87,7 +87,7 @@ class FASTConvLayer(nn.Module):
         horizontal_outputs = (
             self.hor_bn(self.hor_conv(x)) if self.hor_bn is not None and self.hor_conv is not None else 0
         )
-        id_out = self.rbr_identity(x) if self.rbr_identity is not None and self.ver_bn is not None else 0
+        id_out = self.rbr_identity(x) if self.rbr_identity is not None else 0
         return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
@@ -106,7 +106,7 @@ class FASTConvLayer(nn.Module):
             id_tensor = torch.from_numpy(kernel_value).to(identity.weight.device)
             self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
         kernel = self.id_tensor
-        std = (identity.running_var + identity.eps).sqrt()  # type: ignore[attr-defined]
+        std = (identity.running_var + identity.eps).sqrt()
         t = (identity.weight / std).reshape(-1, 1, 1, 1)
         return kernel * t, identity.bias - identity.running_mean * identity.weight / std
@@ -155,7 +155,6 @@ class FASTConvLayer(nn.Module):
         )
         self.fused_conv.weight.data = kernel
         self.fused_conv.bias.data = bias  # type: ignore[union-attr]
-        self.deploy = True
         for para in self.parameters():
             para.detach_()
         for attr in ["conv", "bn", "ver_conv", "ver_bn", "hor_conv", "hor_bn"]:

doctr/models/modules/layers/tensorflow.py CHANGED Viewed

@@ -97,7 +97,7 @@ class FASTConvLayer(layers.Layer, NestedObject):
             if self.hor_bn is not None and self.hor_conv is not None
             else 0
         )
-        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None and self.ver_bn is not None else 0
+        id_out = self.rbr_identity(x, **kwargs) if self.rbr_identity is not None else 0
         return self.activation(main_outputs + vertical_outputs + horizontal_outputs + id_out)
@@ -110,14 +110,14 @@ class FASTConvLayer(layers.Layer, NestedObject):
             return 0, 0
         if not hasattr(self, "id_tensor"):
             input_dim = self.in_channels // self.groups
-            kernel_value = np.zeros((self.in_channels, input_dim, 1, 1), dtype=np.float32)
+            kernel_value = np.zeros((1, 1, input_dim, self.in_channels), dtype=np.float32)
             for i in range(self.in_channels):
-                kernel_value[i, i % input_dim, 0, 0] = 1
+                kernel_value[0, 0, i % input_dim, i] = 1
             id_tensor = tf.constant(kernel_value, dtype=tf.float32)
             self.id_tensor = self._pad_to_mxn_tensor(id_tensor)
         kernel = self.id_tensor
         std = tf.sqrt(identity.moving_variance + identity.epsilon)
-        t = tf.reshape(identity.gamma / std, (-1, 1, 1, 1))
+        t = tf.reshape(identity.gamma / std, (1, 1, 1, -1))
         return kernel * t, identity.beta - identity.moving_mean * identity.gamma / std
     def _fuse_bn_tensor(self, conv: layers.Conv2D, bn: layers.BatchNormalization) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -138,18 +138,16 @@ class FASTConvLayer(layers.Layer, NestedObject):
         else:
             kernel_1xn, bias_1xn = 0, 0
         kernel_id, bias_id = self._identity_to_conv(self.rbr_identity)
-        if not isinstance(kernel_id, int):
-            kernel_id = tf.transpose(kernel_id, (2, 3, 0, 1))
         kernel_mxn = kernel_mxn + kernel_mx1 + kernel_1xn + kernel_id
         bias_mxn = bias_mxn + bias_mx1 + bias_1xn + bias_id
         return kernel_mxn, bias_mxn
     def _pad_to_mxn_tensor(self, kernel: tf.Tensor) -> tf.Tensor:
         kernel_height, kernel_width = self.converted_ks
-        height, width = kernel.shape[2:]
+        height, width = kernel.shape[:2]
         pad_left_right = tf.maximum(0, (kernel_width - width) // 2)
         pad_top_down = tf.maximum(0, (kernel_height - height) // 2)
-        return tf.pad(kernel, [[0, 0], [0, 0], [pad_top_down, pad_top_down], [pad_left_right, pad_left_right]])
+        return tf.pad(kernel, [[pad_top_down, pad_top_down], [pad_left_right, pad_left_right], [0, 0], [0, 0]])
     def reparameterize_layer(self):
         kernel, bias = self._get_equivalent_kernel_bias()

doctr/models/modules/transformer/pytorch.py CHANGED Viewed

@@ -51,8 +51,8 @@ def scaled_dot_product_attention(
     scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
     if mask is not None:
         # NOTE: to ensure the ONNX compatibility, masked_fill works only with int equal condition
-        scores = scores.masked_fill(mask == 0, float("-inf"))  # type: ignore[attr-defined]
-    p_attn = torch.softmax(scores, dim=-1)  # type: ignore[call-overload]
+        scores = scores.masked_fill(mask == 0, float("-inf"))
+    p_attn = torch.softmax(scores, dim=-1)
     return torch.matmul(p_attn, value), p_attn

doctr/models/modules/transformer/tensorflow.py CHANGED Viewed

@@ -13,8 +13,6 @@ from doctr.utils.repr import NestedObject
 __all__ = ["Decoder", "PositionalEncoding", "EncoderBlock", "PositionwiseFeedForward", "MultiHeadAttention"]
-tf.config.run_functions_eagerly(True)
 class PositionalEncoding(layers.Layer, NestedObject):
     """Compute positional encoding"""

doctr/models/modules/vision_transformer/pytorch.py CHANGED Viewed

@@ -20,7 +20,7 @@ class PatchEmbedding(nn.Module):
         channels, height, width = input_shape
         self.patch_size = patch_size
         self.interpolate = True if patch_size[0] == patch_size[1] else False
-        self.grid_size = tuple([s // p for s, p in zip((height, width), self.patch_size)])
+        self.grid_size = tuple(s // p for s, p in zip((height, width), self.patch_size))
         self.num_patches = self.grid_size[0] * self.grid_size[1]
         self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

doctr/models/modules/vision_transformer/tensorflow.py CHANGED Viewed

@@ -22,7 +22,7 @@ class PatchEmbedding(layers.Layer, NestedObject):
         height, width, _ = input_shape
         self.patch_size = patch_size
         self.interpolate = True if patch_size[0] == patch_size[1] else False
-        self.grid_size = tuple([s // p for s, p in zip((height, width), self.patch_size)])
+        self.grid_size = tuple(s // p for s, p in zip((height, width), self.patch_size))
         self.num_patches = self.grid_size[0] * self.grid_size[1]
         self.cls_token = self.add_weight(shape=(1, 1, embed_dim), initializer="zeros", trainable=True, name="cls_token")

doctr/models/predictor/base.py CHANGED Viewed

@@ -3,16 +3,16 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 import numpy as np
 from doctr.models.builder import DocumentBuilder
-from doctr.utils.geometry import extract_crops, extract_rcrops
+from doctr.utils.geometry import extract_crops, extract_rcrops, remove_image_padding, rotate_image
-from .._utils import rectify_crops, rectify_loc_preds
-from ..classification import crop_orientation_predictor
-from ..classification.predictor import CropOrientationPredictor
+from .._utils import estimate_orientation, rectify_crops, rectify_loc_preds
+from ..classification import crop_orientation_predictor, page_orientation_predictor
+from ..classification.predictor import OrientationPredictor
 __all__ = ["_OCRPredictor"]
@@ -29,10 +29,13 @@ class _OCRPredictor:
             accordingly. Doing so will improve performances for documents with page-uniform rotations.
         preserve_aspect_ratio: if True, resize preserving the aspect ratio (with padding)
         symmetric_pad: if True and preserve_aspect_ratio is True, pad the image symmetrically.
+        detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
+            page. Doing so will slightly deteriorate the overall latency.
         **kwargs: keyword args of `DocumentBuilder`
     """
-    crop_orientation_predictor: Optional[CropOrientationPredictor]
+    crop_orientation_predictor: Optional[OrientationPredictor]
+    page_orientation_predictor: Optional[OrientationPredictor]
     def __init__(
         self,
@@ -40,29 +43,93 @@ class _OCRPredictor:
         straighten_pages: bool = False,
         preserve_aspect_ratio: bool = True,
         symmetric_pad: bool = True,
+        detect_orientation: bool = False,
         **kwargs: Any,
     ) -> None:
         self.assume_straight_pages = assume_straight_pages
         self.straighten_pages = straighten_pages
-        self.crop_orientation_predictor = None if assume_straight_pages else crop_orientation_predictor(pretrained=True)
+        self._page_orientation_disabled = kwargs.pop("disable_page_orientation", False)
+        self._crop_orientation_disabled = kwargs.pop("disable_crop_orientation", False)
+        self.crop_orientation_predictor = (
+            None
+            if assume_straight_pages
+            else crop_orientation_predictor(pretrained=True, disabled=self._crop_orientation_disabled)
+        )
+        self.page_orientation_predictor = (
+            page_orientation_predictor(pretrained=True, disabled=self._page_orientation_disabled)
+            if detect_orientation or straighten_pages or not assume_straight_pages
+            else None
+        )
         self.doc_builder = DocumentBuilder(**kwargs)
         self.preserve_aspect_ratio = preserve_aspect_ratio
         self.symmetric_pad = symmetric_pad
         self.hooks: List[Callable] = []
+    def _general_page_orientations(
+        self,
+        pages: List[np.ndarray],
+    ) -> List[Tuple[int, float]]:
+        _, classes, probs = zip(self.page_orientation_predictor(pages))  # type: ignore[misc]
+        # Flatten to list of tuples with (value, confidence)
+        page_orientations = [
+            (orientation, prob)
+            for page_classes, page_probs in zip(classes, probs)
+            for orientation, prob in zip(page_classes, page_probs)
+        ]
+        return page_orientations
+    def _get_orientations(
+        self, pages: List[np.ndarray], seg_maps: List[np.ndarray]
+    ) -> Tuple[List[Tuple[int, float]], List[int]]:
+        general_pages_orientations = self._general_page_orientations(pages)
+        origin_page_orientations = [
+            estimate_orientation(seq_map, general_orientation)
+            for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
+        ]
+        return general_pages_orientations, origin_page_orientations
+    def _straighten_pages(
+        self,
+        pages: List[np.ndarray],
+        seg_maps: List[np.ndarray],
+        general_pages_orientations: Optional[List[Tuple[int, float]]] = None,
+        origin_pages_orientations: Optional[List[int]] = None,
+    ) -> List[np.ndarray]:
+        general_pages_orientations = (
+            general_pages_orientations if general_pages_orientations else self._general_page_orientations(pages)
+        )
+        origin_pages_orientations = (
+            origin_pages_orientations
+            if origin_pages_orientations
+            else [
+                estimate_orientation(seq_map, general_orientation)
+                for seq_map, general_orientation in zip(seg_maps, general_pages_orientations)
+            ]
+        )
+        return [
+            # expand if height and width are not equal, then remove the padding
+            remove_image_padding(rotate_image(page, angle, expand=page.shape[0] != page.shape[1]))
+            for page, angle in zip(pages, origin_pages_orientations)
+        ]
     @staticmethod
     def _generate_crops(
         pages: List[np.ndarray],
         loc_preds: List[np.ndarray],
         channels_last: bool,
         assume_straight_pages: bool = False,
+        assume_horizontal: bool = False,
     ) -> List[List[np.ndarray]]:
-        extraction_fn = extract_crops if assume_straight_pages else extract_rcrops
-        crops = [
-            extraction_fn(page, _boxes[:, :4], channels_last=channels_last)  # type: ignore[operator]
-            for page, _boxes in zip(pages, loc_preds)
-        ]
+        if assume_straight_pages:
+            crops = [
+                extract_crops(page, _boxes[:, :4], channels_last=channels_last)
+                for page, _boxes in zip(pages, loc_preds)
+            ]
+        else:
+            crops = [
+                extract_rcrops(page, _boxes[:, :4], channels_last=channels_last, assume_horizontal=assume_horizontal)
+                for page, _boxes in zip(pages, loc_preds)
+            ]
         return crops
     @staticmethod
@@ -71,8 +138,9 @@ class _OCRPredictor:
         loc_preds: List[np.ndarray],
         channels_last: bool,
         assume_straight_pages: bool = False,
+        assume_horizontal: bool = False,
     ) -> Tuple[List[List[np.ndarray]], List[np.ndarray]]:
-        crops = _OCRPredictor._generate_crops(pages, loc_preds, channels_last, assume_straight_pages)
+        crops = _OCRPredictor._generate_crops(pages, loc_preds, channels_last, assume_straight_pages, assume_horizontal)
         # Avoid sending zero-sized crops
         is_kept = [[all(s > 0 for s in crop.shape) for crop in page_crops] for page_crops in crops]
@@ -88,68 +156,39 @@ class _OCRPredictor:
         self,
         crops: List[List[np.ndarray]],
         loc_preds: List[np.ndarray],
-    ) -> Tuple[List[List[np.ndarray]], List[np.ndarray]]:
+    ) -> Tuple[List[List[np.ndarray]], List[np.ndarray], List[Tuple[int, float]]]:
         # Work at a page level
-        orientations = [self.crop_orientation_predictor(page_crops) for page_crops in crops]  # type: ignore[misc]
+        orientations, classes, probs = zip(*[self.crop_orientation_predictor(page_crops) for page_crops in crops])  # type: ignore[misc]
         rect_crops = [rectify_crops(page_crops, orientation) for page_crops, orientation in zip(crops, orientations)]
         rect_loc_preds = [
             rectify_loc_preds(page_loc_preds, orientation) if len(page_loc_preds) > 0 else page_loc_preds
             for page_loc_preds, orientation in zip(loc_preds, orientations)
         ]
-        return rect_crops, rect_loc_preds  # type: ignore[return-value]
-    def _remove_padding(
-        self,
-        pages: List[np.ndarray],
-        loc_preds: List[np.ndarray],
-    ) -> List[np.ndarray]:
-        if self.preserve_aspect_ratio:
-            # Rectify loc_preds to remove padding
-            rectified_preds = []
-            for page, loc_pred in zip(pages, loc_preds):
-                h, w = page.shape[0], page.shape[1]
-                if h > w:
-                    # y unchanged, dilate x coord
-                    if self.symmetric_pad:
-                        if self.assume_straight_pages:
-                            loc_pred[:, [0, 2]] = np.clip((loc_pred[:, [0, 2]] - 0.5) * h / w + 0.5, 0, 1)
-                        else:
-                            loc_pred[:, :, 0] = np.clip((loc_pred[:, :, 0] - 0.5) * h / w + 0.5, 0, 1)
-                    else:
-                        if self.assume_straight_pages:
-                            loc_pred[:, [0, 2]] *= h / w
-                        else:
-                            loc_pred[:, :, 0] *= h / w
-                elif w > h:
-                    # x unchanged, dilate y coord
-                    if self.symmetric_pad:
-                        if self.assume_straight_pages:
-                            loc_pred[:, [1, 3]] = np.clip((loc_pred[:, [1, 3]] - 0.5) * w / h + 0.5, 0, 1)
-                        else:
-                            loc_pred[:, :, 1] = np.clip((loc_pred[:, :, 1] - 0.5) * w / h + 0.5, 0, 1)
-                    else:
-                        if self.assume_straight_pages:
-                            loc_pred[:, [1, 3]] *= w / h
-                        else:
-                            loc_pred[:, :, 1] *= w / h
-                rectified_preds.append(loc_pred)
-            return rectified_preds
-        return loc_preds
+        # Flatten to list of tuples with (value, confidence)
+        crop_orientations = [
+            (orientation, prob)
+            for page_classes, page_probs in zip(classes, probs)
+            for orientation, prob in zip(page_classes, page_probs)
+        ]
+        return rect_crops, rect_loc_preds, crop_orientations  # type: ignore[return-value]
     @staticmethod
     def _process_predictions(
         loc_preds: List[np.ndarray],
         word_preds: List[Tuple[str, float]],
-    ) -> Tuple[List[np.ndarray], List[List[Tuple[str, float]]]]:
+        crop_orientations: List[Dict[str, Any]],
+    ) -> Tuple[List[np.ndarray], List[List[Tuple[str, float]]], List[List[Dict[str, Any]]]]:
         text_preds = []
+        crop_orientation_preds = []
         if len(loc_preds) > 0:
-            # Text
+            # Text & crop orientation predictions at page level
             _idx = 0
             for page_boxes in loc_preds:
                 text_preds.append(word_preds[_idx : _idx + page_boxes.shape[0]])
+                crop_orientation_preds.append(crop_orientations[_idx : _idx + page_boxes.shape[0]])
                 _idx += page_boxes.shape[0]
-        return loc_preds, text_preds
+        return loc_preds, text_preds, crop_orientation_preds
     def add_hook(self, hook: Callable) -> None:
         """Add a hook to the predictor

python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

python-doctr 0.8.1py3-none-any.whl → 0.10.0py3-none-any.whl