python-doctr 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
- doctr/datasets/__init__.py +2 -0
- doctr/datasets/cord.py +6 -4
- doctr/datasets/datasets/base.py +3 -2
- doctr/datasets/datasets/pytorch.py +4 -2
- doctr/datasets/datasets/tensorflow.py +4 -2
- doctr/datasets/detection.py +6 -3
- doctr/datasets/doc_artefacts.py +2 -1
- doctr/datasets/funsd.py +7 -8
- doctr/datasets/generator/base.py +3 -2
- doctr/datasets/generator/pytorch.py +3 -1
- doctr/datasets/generator/tensorflow.py +3 -1
- doctr/datasets/ic03.py +3 -2
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +6 -4
- doctr/datasets/iiithws.py +2 -1
- doctr/datasets/imgur5k.py +3 -2
- doctr/datasets/loader.py +4 -2
- doctr/datasets/mjsynth.py +2 -1
- doctr/datasets/ocr.py +2 -1
- doctr/datasets/orientation.py +40 -0
- doctr/datasets/recognition.py +3 -2
- doctr/datasets/sroie.py +2 -1
- doctr/datasets/svhn.py +2 -1
- doctr/datasets/svt.py +3 -2
- doctr/datasets/synthtext.py +2 -1
- doctr/datasets/utils.py +27 -11
- doctr/datasets/vocabs.py +26 -1
- doctr/datasets/wildreceipt.py +111 -0
- doctr/file_utils.py +3 -1
- doctr/io/elements.py +52 -35
- doctr/io/html.py +5 -3
- doctr/io/image/base.py +5 -4
- doctr/io/image/pytorch.py +12 -7
- doctr/io/image/tensorflow.py +11 -6
- doctr/io/pdf.py +5 -4
- doctr/io/reader.py +13 -5
- doctr/models/_utils.py +30 -53
- doctr/models/artefacts/barcode.py +4 -3
- doctr/models/artefacts/face.py +4 -2
- doctr/models/builder.py +58 -43
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/pytorch.py +5 -2
- doctr/models/classification/magc_resnet/tensorflow.py +5 -2
- doctr/models/classification/mobilenet/pytorch.py +16 -4
- doctr/models/classification/mobilenet/tensorflow.py +29 -20
- doctr/models/classification/predictor/pytorch.py +3 -2
- doctr/models/classification/predictor/tensorflow.py +2 -1
- doctr/models/classification/resnet/pytorch.py +23 -13
- doctr/models/classification/resnet/tensorflow.py +33 -26
- doctr/models/classification/textnet/__init__.py +6 -0
- doctr/models/classification/textnet/pytorch.py +275 -0
- doctr/models/classification/textnet/tensorflow.py +267 -0
- doctr/models/classification/vgg/pytorch.py +4 -2
- doctr/models/classification/vgg/tensorflow.py +5 -2
- doctr/models/classification/vit/pytorch.py +9 -3
- doctr/models/classification/vit/tensorflow.py +9 -3
- doctr/models/classification/zoo.py +7 -2
- doctr/models/core.py +1 -1
- doctr/models/detection/__init__.py +1 -0
- doctr/models/detection/_utils/pytorch.py +7 -1
- doctr/models/detection/_utils/tensorflow.py +7 -3
- doctr/models/detection/core.py +9 -3
- doctr/models/detection/differentiable_binarization/base.py +37 -25
- doctr/models/detection/differentiable_binarization/pytorch.py +80 -104
- doctr/models/detection/differentiable_binarization/tensorflow.py +74 -55
- doctr/models/detection/fast/__init__.py +6 -0
- doctr/models/detection/fast/base.py +256 -0
- doctr/models/detection/fast/pytorch.py +442 -0
- doctr/models/detection/fast/tensorflow.py +428 -0
- doctr/models/detection/linknet/base.py +12 -5
- doctr/models/detection/linknet/pytorch.py +28 -15
- doctr/models/detection/linknet/tensorflow.py +68 -88
- doctr/models/detection/predictor/pytorch.py +16 -6
- doctr/models/detection/predictor/tensorflow.py +13 -5
- doctr/models/detection/zoo.py +19 -16
- doctr/models/factory/hub.py +20 -10
- doctr/models/kie_predictor/base.py +2 -1
- doctr/models/kie_predictor/pytorch.py +28 -36
- doctr/models/kie_predictor/tensorflow.py +27 -27
- doctr/models/modules/__init__.py +1 -0
- doctr/models/modules/layers/__init__.py +6 -0
- doctr/models/modules/layers/pytorch.py +166 -0
- doctr/models/modules/layers/tensorflow.py +175 -0
- doctr/models/modules/transformer/pytorch.py +24 -22
- doctr/models/modules/transformer/tensorflow.py +6 -4
- doctr/models/modules/vision_transformer/pytorch.py +2 -4
- doctr/models/modules/vision_transformer/tensorflow.py +2 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +4 -2
- doctr/models/predictor/base.py +14 -3
- doctr/models/predictor/pytorch.py +26 -29
- doctr/models/predictor/tensorflow.py +25 -22
- doctr/models/preprocessor/pytorch.py +14 -9
- doctr/models/preprocessor/tensorflow.py +10 -5
- doctr/models/recognition/core.py +4 -1
- doctr/models/recognition/crnn/pytorch.py +23 -16
- doctr/models/recognition/crnn/tensorflow.py +25 -17
- doctr/models/recognition/master/base.py +4 -1
- doctr/models/recognition/master/pytorch.py +20 -9
- doctr/models/recognition/master/tensorflow.py +20 -8
- doctr/models/recognition/parseq/base.py +4 -1
- doctr/models/recognition/parseq/pytorch.py +28 -22
- doctr/models/recognition/parseq/tensorflow.py +22 -11
- doctr/models/recognition/predictor/_utils.py +3 -2
- doctr/models/recognition/predictor/pytorch.py +3 -2
- doctr/models/recognition/predictor/tensorflow.py +2 -1
- doctr/models/recognition/sar/pytorch.py +14 -7
- doctr/models/recognition/sar/tensorflow.py +23 -14
- doctr/models/recognition/utils.py +5 -1
- doctr/models/recognition/vitstr/base.py +4 -1
- doctr/models/recognition/vitstr/pytorch.py +22 -13
- doctr/models/recognition/vitstr/tensorflow.py +21 -10
- doctr/models/recognition/zoo.py +4 -2
- doctr/models/utils/pytorch.py +24 -6
- doctr/models/utils/tensorflow.py +22 -3
- doctr/models/zoo.py +21 -3
- doctr/transforms/functional/base.py +8 -3
- doctr/transforms/functional/pytorch.py +23 -6
- doctr/transforms/functional/tensorflow.py +25 -5
- doctr/transforms/modules/base.py +12 -5
- doctr/transforms/modules/pytorch.py +10 -12
- doctr/transforms/modules/tensorflow.py +17 -9
- doctr/utils/common_types.py +1 -1
- doctr/utils/data.py +4 -2
- doctr/utils/fonts.py +3 -2
- doctr/utils/geometry.py +95 -26
- doctr/utils/metrics.py +36 -22
- doctr/utils/multithreading.py +5 -3
- doctr/utils/repr.py +3 -1
- doctr/utils/visualization.py +31 -8
- doctr/version.py +1 -1
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/METADATA +67 -31
- python_doctr-0.8.1.dist-info/RECORD +173 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/WHEEL +1 -1
- python_doctr-0.7.0.dist-info/RECORD +0 -161
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/LICENSE +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/top_level.txt +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/zip-safe +0 -0
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -14,7 +14,7 @@ from torchvision.models._utils import IntermediateLayerGetter
 from doctr.datasets import VOCABS

 from ...classification import vit_b, vit_s
-from ...utils.pytorch import load_pretrained_params
+from ...utils.pytorch import _bf16_to_float32, load_pretrained_params
 from .base import _ViTSTR, _ViTSTRPostProcessor

 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]

@@ -25,14 +25,14 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
 "std": (0.299, 0.296, 0.301),
 "input_shape": (3, 32, 128),
 "vocab": VOCABS["french"],
-"url":
+"url": "https://doctr-static.mindee.com/models?id=v0.7.0/vitstr_small-fcd12655.pt&src=0",
 },
 "vitstr_base": {
 "mean": (0.694, 0.695, 0.693),
 "std": (0.299, 0.296, 0.301),
 "input_shape": (3, 32, 128),
 "vocab": VOCABS["french"],
-"url":
+"url": "https://doctr-static.mindee.com/models?id=v0.7.0/vitstr_base-50b21df2.pt&src=0",
 },
 }

@@ -42,6 +42,7 @@ class ViTSTR(_ViTSTR, nn.Module):
 Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.

 Args:
+----
 feature_extractor: the backbone serving as feature extractor
 vocab: vocabulary used for encoding
 embedding_units: number of embedding units

@@ -95,7 +96,7 @@ class ViTSTR(_ViTSTR, nn.Module):
 B, N, E = features.size()
 features = features.reshape(B * N, E)
 logits = self.head(features).view(B, N, len(self.vocab) + 1)  # (batch_size, max_length, vocab + 1)
-decoded_features = logits[:, 1:]  # remove cls_token
+decoded_features = _bf16_to_float32(logits[:, 1:])  # remove cls_token

 out: Dict[str, Any] = {}
 if self.exportable:

@@ -124,17 +125,19 @@ class ViTSTR(_ViTSTR, nn.Module):
 Sequences are masked after the EOS character.

 Args:
+----
 model_output: predicted logits of the model
 gt: the encoded tensor with gt labels
 seq_len: lengths of each gt word inside the batch

 Returns:
+-------
 The loss of the model on the batch
 """
 # Input length : number of steps
 input_len = model_output.shape[1]
 # Add one for additional <eos> token (sos disappear in shift!)
-seq_len = seq_len + 1
+seq_len = seq_len + 1  # type: ignore[assignment]
 # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
 # The "masked" first gt char is <sos>.
 cce = F.cross_entropy(model_output.permute(0, 2, 1), gt[:, 1:], reduction="none")

@@ -150,6 +153,7 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
 """Post processor for ViTSTR architecture

 Args:
+----
 vocab: string containing the ordered sequence of supported characters
 """

@@ -159,18 +163,19 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
 ) -> List[Tuple[str, float]]:
 # compute pred with argmax for attention models
 out_idxs = logits.argmax(-1)
-
-probs = torch.gather(torch.softmax(logits, -1), -1, out_idxs.unsqueeze(-1)).squeeze(-1)
-# Take the minimum confidence of the sequence
-probs = probs.min(dim=1).values.detach().cpu()
+preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]

 # Manual decoding
 word_values = [
 "".join(self._embedding[idx] for idx in encoded_seq).split("<eos>")[0]
 for encoded_seq in out_idxs.cpu().numpy()
 ]
+# compute probabilties for each word up to the EOS token
+probs = [
+preds_prob[i, : len(word)].clip(0, 1).mean().item() if word else 0.0 for i, word in enumerate(word_values)
+]

-return list(zip(word_values, probs
+return list(zip(word_values, probs))


 def _vitstr(

@@ -223,12 +228,14 @@ def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
 >>> out = model(input_tensor)

 Args:
+----
 pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+kwargs: keyword arguments of the ViTSTR architecture

 Returns:
+-------
 text recognition architecture
 """
-
 return _vitstr(
 "vitstr_small",
 pretrained,

@@ -252,12 +259,14 @@ def vitstr_base(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
 >>> out = model(input_tensor)

 Args:
+----
 pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+kwargs: keyword arguments of the ViTSTR architecture

 Returns:
+-------
 text recognition architecture
 """
-
 return _vitstr(
 "vitstr_base",
 pretrained,
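The post-processor hunks above replace the old gather/min sequence confidence with the per-character softmax maximum, averaged over each decoded word up to the EOS token. A minimal standalone sketch of that logic on random logits (the toy vocab, embedding list, and tensor shapes are illustrative, not doctr's actual objects):

    import torch

    vocab = "abc"
    embedding = list(vocab) + ["<eos>"]  # last class index is reserved for <eos>

    logits = torch.randn(2, 5, len(vocab) + 1)  # (batch_size, max_length, vocab + 1)

    out_idxs = logits.argmax(-1)
    # keep the max softmax probability at every decoding step...
    preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]

    word_values = [
        "".join(embedding[idx] for idx in seq).split("<eos>")[0]
        for seq in out_idxs.cpu().numpy()
    ]
    # ...then average it over the characters kept for each word (up to <eos>),
    # instead of taking the minimum over the whole sequence as in 0.7.0
    probs = [
        preds_prob[i, : len(word)].clip(0, 1).mean().item() if word else 0.0
        for i, word in enumerate(word_values)
    ]
    print(list(zip(word_values, probs)))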
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -12,7 +12,7 @@ from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS

 from ...classification import vit_b, vit_s
-from ...utils.tensorflow import load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
 from .base import _ViTSTR, _ViTSTRPostProcessor

 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]

@@ -40,6 +40,7 @@ class ViTSTR(_ViTSTR, Model):
 Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.

 Args:
+----
 feature_extractor: the backbone serving as feature extractor
 vocab: vocabulary used for encoding
 embedding_units: number of embedding units

@@ -84,11 +85,13 @@ class ViTSTR(_ViTSTR, Model):
 Sequences are masked after the EOS character.

 Args:
+----
 model_output: predicted logits of the model
 gt: the encoded tensor with gt labels
 seq_len: lengths of each gt word inside the batch

 Returns:
+-------
 The loss of the model on the batch
 """
 # Input length : number of steps

@@ -131,7 +134,7 @@ class ViTSTR(_ViTSTR, Model):
 logits = tf.reshape(
 self.head(features, **kwargs), (B, N, len(self.vocab) + 1)
 )  # (batch_size, max_length, vocab + 1)
-decoded_features = logits[:, 1:]  # remove cls_token
+decoded_features = _bf16_to_float32(logits[:, 1:])  # remove cls_token

 out: Dict[str, tf.Tensor] = {}
 if self.exportable:

@@ -155,6 +158,7 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
 """Post processor for ViTSTR architecture

 Args:
+----
 vocab: string containing the ordered sequence of supported characters
 """

@@ -164,10 +168,7 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
 ) -> List[Tuple[str, float]]:
 # compute pred with argmax for attention models
 out_idxs = tf.math.argmax(logits, axis=2)
-
-probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-# Take the minimum confidence of the sequence
-probs = tf.math.reduce_min(probs, axis=1)
+preds_prob = tf.math.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1)

 # decode raw output of the model with tf_label_to_idx
 out_idxs = tf.cast(out_idxs, dtype="int32")

@@ -177,7 +178,13 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
 decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0]
 word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]

-
+# compute probabilties for each word up to the EOS token
+probs = [
+preds_prob[i, : len(word)].numpy().clip(0, 1).mean().item() if word else 0.0
+for i, word in enumerate(word_values)
+]
+
+return list(zip(word_values, probs))


 def _vitstr(

@@ -227,12 +234,14 @@ def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
 >>> out = model(input_tensor)

 Args:
+----
 pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+**kwargs: keyword arguments of the ViTSTR architecture

 Returns:
+-------
 text recognition architecture
 """
-
 return _vitstr(
 "vitstr_small",
 pretrained,

@@ -254,12 +263,14 @@ def vitstr_base(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
 >>> out = model(input_tensor)

 Args:
+----
 pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+**kwargs: keyword arguments of the ViTSTR architecture

 Returns:
+-------
 text recognition architecture
 """
-
 return _vitstr(
 "vitstr_base",
 pretrained,
doctr/models/recognition/zoo.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -63,11 +63,13 @@ def recognition_predictor(arch: Any = "crnn_vgg16_bn", pretrained: bool = False,
 >>> out = model([input_page])

 Args:
+----
 arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
 pretrained: If True, returns a model pre-trained on our text recognition dataset
+**kwargs: optional parameters to be passed to the architecture

 Returns:
+-------
 Recognition predictor
 """
-
 return _predictor(arch, pretrained, **kwargs)
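The recognition_predictor docstring above already sketches the intended call pattern; spelled out as a runnable snippet (assuming doctr and one of its deep learning backends are installed, with a random word crop as input):

    import numpy as np
    from doctr.models import recognition_predictor

    # mirrors the docstring example: a word crop as an HWC uint8 array
    model = recognition_predictor(arch="crnn_vgg16_bn", pretrained=True)
    input_page = (255 * np.random.rand(32, 128, 3)).astype(np.uint8)
    out = model([input_page])
    print(out)  # list of (word, confidence) pairs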
doctr/models/utils/pytorch.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -11,18 +11,29 @@ from torch import nn

 from doctr.utils.data import download_from_url

-__all__ = [
+__all__ = [
+"load_pretrained_params",
+"conv_sequence_pt",
+"set_device_and_dtype",
+"export_model_to_onnx",
+"_copy_tensor",
+"_bf16_to_float32",
+]


 def _copy_tensor(x: torch.Tensor) -> torch.Tensor:
 return x.clone().detach()


+def _bf16_to_float32(x: torch.Tensor) -> torch.Tensor:
+# bfloat16 is not supported in .numpy(): torch/csrc/utils/tensor_numpy.cpp:aten_to_numpy_dtype
+return x.float() if x.dtype == torch.bfloat16 else x
+
+
 def load_pretrained_params(
 model: nn.Module,
 url: Optional[str] = None,
 hash_prefix: Optional[str] = None,
-overwrite: bool = False,
 ignore_keys: Optional[List[str]] = None,
 **kwargs: Any,
 ) -> None:

@@ -32,13 +43,13 @@ def load_pretrained_params(
 >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.zip")

 Args:
+----
 model: the PyTorch model to be loaded
 url: URL of the zipped set of parameters
 hash_prefix: first characters of SHA256 expected hash
-overwrite: should the zip extraction be enforced if the archive has already been extracted
 ignore_keys: list of weights to be ignored from the state_dict
+**kwargs: additional arguments to be passed to `doctr.utils.data.download_from_url`
 """
-
 if url is None:
 logging.warning("Invalid model URL, using default initialization.")
 else:

@@ -73,11 +84,15 @@ def conv_sequence_pt(
 >>> module = Sequential(conv_sequence(3, 32, True, True, kernel_size=3))

 Args:
+----
+in_channels: number of input channels
 out_channels: number of output channels
 relu: whether ReLU should be used
 bn: should a batch normalization layer be added
+**kwargs: additional arguments to be passed to the convolutional layer

 Returns:
+-------
 list of layers
 """
 # No bias before Batch norm

@@ -107,15 +122,16 @@ def set_device_and_dtype(
 >>> model, batches = set_device_and_dtype(model, batches, device="cuda", dtype=torch.float16)

 Args:
+----
 model: the model to be set
 batches: the batches to be set
 device: the device to be used
 dtype: the dtype to be used

 Returns:
+-------
 the model and batches set
 """
-
 return model.to(device=device, dtype=dtype), [batch.to(device=device, dtype=dtype) for batch in batches]


@@ -129,12 +145,14 @@ def export_model_to_onnx(model: nn.Module, model_name: str, dummy_input: torch.T
 >>> export_model_to_onnx(model, "my_model", dummy_input=torch.randn(1, 3, 32, 32))

 Args:
+----
 model: the PyTorch model to be exported
 model_name: the name for the exported model
 dummy_input: the dummy input to the model
 kwargs: additional arguments to be passed to torch.onnx.export

 Returns:
+-------
 the path to the exported model
 """
 torch.onnx.export(
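Both backends gain a _bf16_to_float32 helper because NumPy has no bfloat16 dtype: logits produced in bfloat16 (e.g. under mixed-precision autocast) have to be cast back before the post-processors call .numpy() on them. A small sketch of the PyTorch variant in isolation (a standalone copy, not an import from doctr):

    import torch

    def _bf16_to_float32(x: torch.Tensor) -> torch.Tensor:
        # bfloat16 tensors cannot be converted with .numpy(), so cast them back to float32
        return x.float() if x.dtype == torch.bfloat16 else x

    logits = torch.randn(2, 4, dtype=torch.bfloat16)
    # calling logits.numpy() directly would raise a TypeError for bfloat16 tensors
    arr = _bf16_to_float32(logits).numpy()
    print(arr.dtype)  # float32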
doctr/models/utils/tensorflow.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -17,13 +17,25 @@ from doctr.utils.data import download_from_url
 logging.getLogger("tensorflow").setLevel(logging.DEBUG)


-__all__ = [
+__all__ = [
+"load_pretrained_params",
+"conv_sequence",
+"IntermediateLayerGetter",
+"export_model_to_onnx",
+"_copy_tensor",
+"_bf16_to_float32",
+]


 def _copy_tensor(x: tf.Tensor) -> tf.Tensor:
 return tf.identity(x)


+def _bf16_to_float32(x: tf.Tensor) -> tf.Tensor:
+# Convert bfloat16 to float32 for numpy compatibility
+return tf.cast(x, tf.float32) if x.dtype == tf.bfloat16 else x
+
+
 def load_pretrained_params(
 model: Model,
 url: Optional[str] = None,

@@ -38,13 +50,14 @@ def load_pretrained_params(
 >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.zip")

 Args:
+----
 model: the keras model to be loaded
 url: URL of the zipped set of parameters
 hash_prefix: first characters of SHA256 expected hash
 overwrite: should the zip extraction be enforced if the archive has already been extracted
 internal_name: name of the ckpt files
+**kwargs: additional arguments to be passed to `doctr.utils.data.download_from_url`
 """
-
 if url is None:
 logging.warning("Invalid model URL, using default initialization.")
 else:

@@ -75,13 +88,16 @@ def conv_sequence(
 >>> module = Sequential(conv_sequence(32, 'relu', True, kernel_size=3, input_shape=[224, 224, 3]))

 Args:
+----
 out_channels: number of output channels
 activation: activation to be used (default: no activation)
 bn: should a batch normalization layer be added
 padding: padding scheme
 kernel_initializer: kernel initializer
+**kwargs: additional arguments to be passed to the convolutional layer

 Returns:
+-------
 list of layers
 """
 # No bias before Batch norm

@@ -109,6 +125,7 @@ class IntermediateLayerGetter(Model):
 >>> feat_extractor = IntermediateLayerGetter(ResNet50(include_top=False, pooling=False), target_layers)

 Args:
+----
 model: the model to extract feature maps from
 layer_names: the list of layers to retrieve the feature map from
 """

@@ -134,12 +151,14 @@ def export_model_to_onnx(
 >>> dummy_input=[tf.TensorSpec([None, 32, 32, 3], tf.float32, name="input")])

 Args:
+----
 model: the keras model to be exported
 model_name: the name for the exported model
 dummy_input: the dummy input to the model
 kwargs: additional arguments to be passed to tf2onnx

 Returns:
+-------
 the path to the exported model and a list with the output layer names
 """
 large_model = kwargs.get("large_model", False)
doctr/models/zoo.py
CHANGED

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -24,6 +24,7 @@ def _predictor(
 det_bs: int = 2,
 reco_bs: int = 128,
 detect_orientation: bool = False,
+straighten_pages: bool = False,
 detect_language: bool = False,
 **kwargs,
 ) -> OCRPredictor:

@@ -53,6 +54,7 @@
 preserve_aspect_ratio=preserve_aspect_ratio,
 symmetric_pad=symmetric_pad,
 detect_orientation=detect_orientation,
+straighten_pages=straighten_pages,
 detect_language=detect_language,
 **kwargs,
 )

@@ -68,6 +70,7 @@ def ocr_predictor(
 symmetric_pad: bool = True,
 export_as_straight_boxes: bool = False,
 detect_orientation: bool = False,
+straighten_pages: bool = False,
 detect_language: bool = False,
 **kwargs: Any,
 ) -> OCRPredictor:

@@ -80,6 +83,7 @@
 >>> out = model([input_page])

 Args:
+----
 det_arch: name of the detection architecture or the model itself to use
 (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
 reco_arch: name of the recognition architecture or the model itself to use

@@ -95,14 +99,18 @@ def ocr_predictor(
 (potentially rotated) as straight bounding boxes.
 detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
 page. Doing so will slightly deteriorate the overall latency.
+straighten_pages: if True, estimates the page general orientation
+based on the segmentation map median line orientation.
+Then, rotates page before passing it again to the deep learning detection module.
+Doing so will improve performances for documents with page-uniform rotations.
 detect_language: if True, the language prediction will be added to the predictions for each
 page. Doing so will slightly deteriorate the overall latency.
 kwargs: keyword args of `OCRPredictor`

 Returns:
+-------
 OCR predictor
 """
-
 return _predictor(
 det_arch,
 reco_arch,

@@ -113,6 +121,7 @@ def ocr_predictor(
 symmetric_pad=symmetric_pad,
 export_as_straight_boxes=export_as_straight_boxes,
 detect_orientation=detect_orientation,
+straighten_pages=straighten_pages,
 detect_language=detect_language,
 **kwargs,
 )

@@ -129,6 +138,7 @@ def _kie_predictor(
 det_bs: int = 2,
 reco_bs: int = 128,
 detect_orientation: bool = False,
+straighten_pages: bool = False,
 detect_language: bool = False,
 **kwargs,
 ) -> KIEPredictor:

@@ -158,6 +168,7 @@ def _kie_predictor(
 preserve_aspect_ratio=preserve_aspect_ratio,
 symmetric_pad=symmetric_pad,
 detect_orientation=detect_orientation,
+straighten_pages=straighten_pages,
 detect_language=detect_language,
 **kwargs,
 )

@@ -173,6 +184,7 @@ def kie_predictor(
 symmetric_pad: bool = True,
 export_as_straight_boxes: bool = False,
 detect_orientation: bool = False,
+straighten_pages: bool = False,
 detect_language: bool = False,
 **kwargs: Any,
 ) -> KIEPredictor:

@@ -185,6 +197,7 @@ def kie_predictor(
 >>> out = model([input_page])

 Args:
+----
 det_arch: name of the detection architecture or the model itself to use
 (e.g. 'db_resnet50', 'db_mobilenet_v3_large')
 reco_arch: name of the recognition architecture or the model itself to use

@@ -200,14 +213,18 @@ def kie_predictor(
 (potentially rotated) as straight bounding boxes.
 detect_orientation: if True, the estimated general page orientation will be added to the predictions for each
 page. Doing so will slightly deteriorate the overall latency.
+straighten_pages: if True, estimates the page general orientation
+based on the segmentation map median line orientation.
+Then, rotates page before passing it again to the deep learning detection module.
+Doing so will improve performances for documents with page-uniform rotations.
 detect_language: if True, the language prediction will be added to the predictions for each
 page. Doing so will slightly deteriorate the overall latency.
 kwargs: keyword args of `OCRPredictor`

 Returns:
+-------
 KIE predictor
 """
-
 return _kie_predictor(
 det_arch,
 reco_arch,

@@ -218,6 +235,7 @@ def kie_predictor(
 symmetric_pad=symmetric_pad,
 export_as_straight_boxes=export_as_straight_boxes,
 detect_orientation=detect_orientation,
+straighten_pages=straighten_pages,
 detect_language=detect_language,
 **kwargs,
 )
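ocr_predictor and kie_predictor both gain a straighten_pages flag, which _predictor and _kie_predictor now forward to the underlying predictor. A hedged usage sketch (the PDF path is a placeholder):

    from doctr.io import DocumentFile
    from doctr.models import ocr_predictor

    doc = DocumentFile.from_pdf("path/to/your/doc.pdf")  # placeholder path
    predictor = ocr_predictor(
        det_arch="db_resnet50",
        reco_arch="crnn_vgg16_bn",
        pretrained=True,
        straighten_pages=True,  # estimate page orientation and rotate before detection
    )
    result = predictor(doc)
    json_output = result.export()  # nested dict of pages, blocks, lines and words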
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2024, Mindee.

 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

@@ -20,10 +20,12 @@ def crop_boxes(
 """Crop localization boxes

 Args:
+----
 boxes: ndarray of shape (N, 4) in relative or abs coordinates
 crop_box: box (xmin, ymin, xmax, ymax) to crop the image, in the same coord format that the boxes

 Returns:
+-------
 the cropped boxes
 """
 is_box_rel = boxes.max() <= 1

@@ -52,10 +54,12 @@ def expand_line(line: np.ndarray, target_shape: Tuple[int, int]) -> Tuple[float,
 the same direction until we meet one of the edges.

 Args:
+----
 line: array of shape (2, 2) of the point supposed to be on one edge, and the shadow tip.
 target_shape: the desired mask shape

 Returns:
+-------
 2D coordinates of the first point once we extended the line (on one of the edges)
 """
 if any(coord == 0 or coord == size for coord, size in zip(line[0], target_shape[::-1])):

@@ -116,15 +120,16 @@
 """Creates a random shadow mask

 Args:
+----
 target_shape: the target shape (H, W)
 min_base_width: the relative minimum shadow base width
 max_tip_width: the relative maximum shadow tip width
 max_tip_height: the relative maximum shadow tip height

 Returns:
+-------
 a numpy ndarray of shape (H, W, 1) with values in the range [0, 1]
 """
-
 # Default base is top
 _params = np.random.rand(6)
 base_width = min_base_width + (1 - min_base_width) * _params[0]

@@ -195,4 +200,4 @@
 mask: np.ndarray = np.zeros((*target_shape, 1), dtype=np.uint8)
 mask = cv2.fillPoly(mask, [final_contour], (255,), lineType=cv2.LINE_AA)[..., 0]

-return (mask / 255).astype(np.float32).clip(0, 1) * intensity_mask.astype(np.float32)
+return (mask / 255).astype(np.float32).clip(0, 1) * intensity_mask.astype(np.float32)  # type: ignore[operator]