python-doctr 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/datasets/__init__.py +2 -0
- doctr/datasets/cord.py +6 -4
- doctr/datasets/datasets/base.py +3 -2
- doctr/datasets/datasets/pytorch.py +4 -2
- doctr/datasets/datasets/tensorflow.py +4 -2
- doctr/datasets/detection.py +6 -3
- doctr/datasets/doc_artefacts.py +2 -1
- doctr/datasets/funsd.py +7 -8
- doctr/datasets/generator/base.py +3 -2
- doctr/datasets/generator/pytorch.py +3 -1
- doctr/datasets/generator/tensorflow.py +3 -1
- doctr/datasets/ic03.py +3 -2
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +6 -4
- doctr/datasets/iiithws.py +2 -1
- doctr/datasets/imgur5k.py +3 -2
- doctr/datasets/loader.py +4 -2
- doctr/datasets/mjsynth.py +2 -1
- doctr/datasets/ocr.py +2 -1
- doctr/datasets/orientation.py +40 -0
- doctr/datasets/recognition.py +3 -2
- doctr/datasets/sroie.py +2 -1
- doctr/datasets/svhn.py +2 -1
- doctr/datasets/svt.py +3 -2
- doctr/datasets/synthtext.py +2 -1
- doctr/datasets/utils.py +27 -11
- doctr/datasets/vocabs.py +26 -1
- doctr/datasets/wildreceipt.py +111 -0
- doctr/file_utils.py +3 -1
- doctr/io/elements.py +52 -35
- doctr/io/html.py +5 -3
- doctr/io/image/base.py +5 -4
- doctr/io/image/pytorch.py +12 -7
- doctr/io/image/tensorflow.py +11 -6
- doctr/io/pdf.py +5 -4
- doctr/io/reader.py +13 -5
- doctr/models/_utils.py +30 -53
- doctr/models/artefacts/barcode.py +4 -3
- doctr/models/artefacts/face.py +4 -2
- doctr/models/builder.py +58 -43
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/pytorch.py +5 -2
- doctr/models/classification/magc_resnet/tensorflow.py +5 -2
- doctr/models/classification/mobilenet/pytorch.py +16 -4
- doctr/models/classification/mobilenet/tensorflow.py +29 -20
- doctr/models/classification/predictor/pytorch.py +3 -2
- doctr/models/classification/predictor/tensorflow.py +2 -1
- doctr/models/classification/resnet/pytorch.py +23 -13
- doctr/models/classification/resnet/tensorflow.py +33 -26
- doctr/models/classification/textnet/__init__.py +6 -0
- doctr/models/classification/textnet/pytorch.py +275 -0
- doctr/models/classification/textnet/tensorflow.py +267 -0
- doctr/models/classification/vgg/pytorch.py +4 -2
- doctr/models/classification/vgg/tensorflow.py +5 -2
- doctr/models/classification/vit/pytorch.py +9 -3
- doctr/models/classification/vit/tensorflow.py +9 -3
- doctr/models/classification/zoo.py +7 -2
- doctr/models/core.py +1 -1
- doctr/models/detection/__init__.py +1 -0
- doctr/models/detection/_utils/pytorch.py +7 -1
- doctr/models/detection/_utils/tensorflow.py +7 -3
- doctr/models/detection/core.py +9 -3
- doctr/models/detection/differentiable_binarization/base.py +37 -25
- doctr/models/detection/differentiable_binarization/pytorch.py +80 -104
- doctr/models/detection/differentiable_binarization/tensorflow.py +74 -55
- doctr/models/detection/fast/__init__.py +6 -0
- doctr/models/detection/fast/base.py +256 -0
- doctr/models/detection/fast/pytorch.py +442 -0
- doctr/models/detection/fast/tensorflow.py +428 -0
- doctr/models/detection/linknet/base.py +12 -5
- doctr/models/detection/linknet/pytorch.py +28 -15
- doctr/models/detection/linknet/tensorflow.py +68 -88
- doctr/models/detection/predictor/pytorch.py +16 -6
- doctr/models/detection/predictor/tensorflow.py +13 -5
- doctr/models/detection/zoo.py +19 -16
- doctr/models/factory/hub.py +20 -10
- doctr/models/kie_predictor/base.py +2 -1
- doctr/models/kie_predictor/pytorch.py +28 -36
- doctr/models/kie_predictor/tensorflow.py +27 -27
- doctr/models/modules/__init__.py +1 -0
- doctr/models/modules/layers/__init__.py +6 -0
- doctr/models/modules/layers/pytorch.py +166 -0
- doctr/models/modules/layers/tensorflow.py +175 -0
- doctr/models/modules/transformer/pytorch.py +24 -22
- doctr/models/modules/transformer/tensorflow.py +6 -4
- doctr/models/modules/vision_transformer/pytorch.py +2 -4
- doctr/models/modules/vision_transformer/tensorflow.py +2 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +4 -2
- doctr/models/predictor/base.py +14 -3
- doctr/models/predictor/pytorch.py +26 -29
- doctr/models/predictor/tensorflow.py +25 -22
- doctr/models/preprocessor/pytorch.py +14 -9
- doctr/models/preprocessor/tensorflow.py +10 -5
- doctr/models/recognition/core.py +4 -1
- doctr/models/recognition/crnn/pytorch.py +23 -16
- doctr/models/recognition/crnn/tensorflow.py +25 -17
- doctr/models/recognition/master/base.py +4 -1
- doctr/models/recognition/master/pytorch.py +20 -9
- doctr/models/recognition/master/tensorflow.py +20 -8
- doctr/models/recognition/parseq/base.py +4 -1
- doctr/models/recognition/parseq/pytorch.py +28 -22
- doctr/models/recognition/parseq/tensorflow.py +22 -11
- doctr/models/recognition/predictor/_utils.py +3 -2
- doctr/models/recognition/predictor/pytorch.py +3 -2
- doctr/models/recognition/predictor/tensorflow.py +2 -1
- doctr/models/recognition/sar/pytorch.py +14 -7
- doctr/models/recognition/sar/tensorflow.py +23 -14
- doctr/models/recognition/utils.py +5 -1
- doctr/models/recognition/vitstr/base.py +4 -1
- doctr/models/recognition/vitstr/pytorch.py +22 -13
- doctr/models/recognition/vitstr/tensorflow.py +21 -10
- doctr/models/recognition/zoo.py +4 -2
- doctr/models/utils/pytorch.py +24 -6
- doctr/models/utils/tensorflow.py +22 -3
- doctr/models/zoo.py +21 -3
- doctr/transforms/functional/base.py +8 -3
- doctr/transforms/functional/pytorch.py +23 -6
- doctr/transforms/functional/tensorflow.py +25 -5
- doctr/transforms/modules/base.py +12 -5
- doctr/transforms/modules/pytorch.py +10 -12
- doctr/transforms/modules/tensorflow.py +17 -9
- doctr/utils/common_types.py +1 -1
- doctr/utils/data.py +4 -2
- doctr/utils/fonts.py +3 -2
- doctr/utils/geometry.py +95 -26
- doctr/utils/metrics.py +36 -22
- doctr/utils/multithreading.py +5 -3
- doctr/utils/repr.py +3 -1
- doctr/utils/visualization.py +31 -8
- doctr/version.py +1 -1
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/METADATA +67 -31
- python_doctr-0.8.1.dist-info/RECORD +173 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/WHEEL +1 -1
- python_doctr-0.7.0.dist-info/RECORD +0 -161
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/LICENSE +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/top_level.txt +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/zip-safe +0 -0
doctr/models/recognition/parseq/pytorch.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,7 @@ from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
 
 from ...classification import vit_s
-from ...utils.pytorch import load_pretrained_params
+from ...utils.pytorch import _bf16_to_float32, load_pretrained_params
 from .base import _PARSeq, _PARSeqPostProcessor
 
 __all__ = ["PARSeq", "parseq"]
@@ -29,7 +29,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (3, 32, 128),
         "vocab": VOCABS["french"],
-        "url":
+        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/parseq-56125471.pt&src=0",
     },
 }
 
@@ -38,6 +38,7 @@ class CharEmbedding(nn.Module):
     """Implements the character embedding module
 
     Args:
+    ----
         vocab_size: size of the vocabulary
         d_model: dimension of the model
     """
@@ -55,6 +56,7 @@ class PARSeqDecoder(nn.Module):
     """Implements decoder module of the PARSeq model
 
     Args:
+    ----
         d_model: dimension of the model
         num_heads: number of attention heads
         ffd: dimension of the feed forward layer
@@ -110,6 +112,7 @@ class PARSeq(_PARSeq, nn.Module):
     Slightly modified implementation based on the official Pytorch implementation: <https://github.com/baudm/parseq/tree/main`_.
 
     Args:
+    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         embedding_units: number of embedding units
@@ -197,11 +200,11 @@ class PARSeq(_PARSeq, nn.Module):
             final_perms = torch.stack(perms)
         if len(perm_pool):
             i = self.rng.choice(len(perm_pool), size=num_gen_perms - len(final_perms), replace=False)
-            final_perms = torch.cat([final_perms, perm_pool[i]])
+            final_perms = torch.cat([final_perms, perm_pool[i]])
         else:
-            perms.extend(
-                [torch.randperm(max_num_chars, device=seqlen.device) for _ in range(num_gen_perms - len(perms))]
-            )
+            perms.extend([
+                torch.randperm(max_num_chars, device=seqlen.device) for _ in range(num_gen_perms - len(perms))
+            ])
             final_perms = torch.stack(perms)
 
         comp = final_perms.flip(-1)
@@ -209,7 +212,7 @@ class PARSeq(_PARSeq, nn.Module):
 
         sos_idx = torch.zeros(len(final_perms), 1, device=seqlen.device)
         eos_idx = torch.full((len(final_perms), 1), max_num_chars + 1, device=seqlen.device)
-        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
+        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()  # type: ignore
         if len(combined) > 1:
             combined[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=seqlen.device)
         return combined
@@ -237,7 +240,6 @@ class PARSeq(_PARSeq, nn.Module):
         target_query: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         """Add positional information to the target sequence and pass it through the decoder."""
-
         batch_size, sequence_length = target.shape
         # apply positional information to the target sequence excluding the SOS token
         null_ctx = self.embed(target[:, :1])
@@ -280,7 +282,7 @@ class PARSeq(_PARSeq, nn.Module):
             ys[:, i + 1] = pos_prob.squeeze().argmax(-1)
 
             # Stop decoding if all sequences have reached the EOS token
-            if max_len is None and (ys == self.vocab_size).any(dim=-1).all():
+            if max_len is None and (ys == self.vocab_size).any(dim=-1).all():  # type: ignore[attr-defined]
                 break
 
         logits = torch.cat(pos_logits, dim=1)  # (N, max_length, vocab_size + 1)
@@ -295,7 +297,7 @@ class PARSeq(_PARSeq, nn.Module):
 
         # Create padding mask for refined target input maskes all behind EOS token as False
         # (N, 1, 1, max_length)
-        target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)
+        target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)  # type: ignore[attr-defined]
         mask = (target_pad_mask.bool() & query_mask[:, : ys.shape[1]].bool()).int()
         logits = self.head(self.decode(ys, features, mask, target_query=pos_queries))
 
@@ -329,11 +331,9 @@ class PARSeq(_PARSeq, nn.Module):
         gt_out = gt[:, 1:]  # remove SOS token
         # Create padding mask for target input
         # [True, True, True, ..., False, False, False] -> False is masked
-        padding_mask = (
-            ~(((gt_in == self.vocab_size + 2) | (gt_in == self.vocab_size)).int().cumsum(-1) > 0)
-            .unsqueeze(1)
-            .unsqueeze(1)
-        )  # (N, 1, 1, seq_len)
+        padding_mask = ~(
+            ((gt_in == self.vocab_size + 2) | (gt_in == self.vocab_size)).int().cumsum(-1) > 0
+        ).unsqueeze(1).unsqueeze(1)  # (N, 1, 1, seq_len)
 
         loss = torch.tensor(0.0, device=features.device)
         loss_numel: Union[int, float] = 0
@@ -362,6 +362,8 @@ class PARSeq(_PARSeq, nn.Module):
         else:
             logits = self.decode_autoregressive(features)
 
+        logits = _bf16_to_float32(logits)
+
         out: Dict[str, Any] = {}
         if self.exportable:
             out["logits"] = logits
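Both PARSeq backends now import a `_bf16_to_float32` helper and pass the logits through it before post-processing, so models run under bfloat16 autocast hand full-precision tensors to the numpy-based decoding step. The helper's body is not part of this diff; a minimal sketch of what the import and call site imply (the body is an assumption, only the name and usage come from the diff):

import torch

def _bf16_to_float32(x: torch.Tensor) -> torch.Tensor:
    # upcast bfloat16 activations so numpy post-processing keeps full precision
    return x.float() if x.dtype == torch.bfloat16 else x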
@@ -384,6 +386,7 @@ class PARSeqPostProcessor(_PARSeqPostProcessor):
     """Post processor for PARSeq architecture
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
@@ -393,18 +396,19 @@ class PARSeqPostProcessor(_PARSeqPostProcessor):
     ) -> List[Tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = logits.argmax(-1)
-
-        probs = torch.gather(torch.softmax(logits, -1), -1, out_idxs.unsqueeze(-1)).squeeze(-1)
-        # Take the minimum confidence of the sequence
-        probs = probs.min(dim=1).values.detach().cpu()
+        preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]
 
         # Manual decoding
         word_values = [
             "".join(self._embedding[idx] for idx in encoded_seq).split("<eos>")[0]
             for encoded_seq in out_idxs.cpu().numpy()
         ]
+        # compute probabilties for each word up to the EOS token
+        probs = [
+            preds_prob[i, : len(word)].clip(0, 1).mean().item() if word else 0.0 for i, word in enumerate(word_values)
+        ]
 
-        return list(zip(word_values, probs.numpy().tolist()))
+        return list(zip(word_values, probs))
 
 
 def _parseq(
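The post-processor hunk above changes how recognition confidence is scored: 0.7.0 took the minimum softmax probability over the whole decoded sequence (padding steps included), while 0.8.1 averages the per-step maxima over just the characters preceding the EOS token, so one low-confidence padding step no longer drags a long word's score toward zero. A toy comparison built only from operations visible in the diff (the random logits and stand-in strings are illustrative):

import torch

logits = torch.randn(2, 8, 5)  # (batch, max_length, vocab_size + 1)
preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]  # best per-step probability

# 0.7.0 scheme: minimum step confidence across the full sequence
old_scores = preds_prob.min(dim=1).values

# 0.8.1 scheme: mean step confidence over the decoded word only
word_values = ["ab", "abc"]  # stand-in decoded strings
new_scores = [
    preds_prob[i, : len(w)].clip(0, 1).mean().item() if w else 0.0
    for i, w in enumerate(word_values)
]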
@@ -457,12 +461,14 @@ def parseq(pretrained: bool = False, **kwargs: Any) -> PARSeq:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the PARSeq architecture
 
     Returns:
+    -------
         text recognition architecture
     """
-
     return _parseq(
         "parseq",
         pretrained,
doctr/models/recognition/parseq/tensorflow.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -16,7 +16,7 @@ from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
 
 from ...classification import vit_s
-from ...utils.tensorflow import load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
 from .base import _PARSeq, _PARSeqPostProcessor
 
 __all__ = ["PARSeq", "parseq"]
@@ -36,6 +36,7 @@ class CharEmbedding(layers.Layer):
     """Implements the character embedding module
 
     Args:
+    ----
         vocab_size: size of the vocabulary
         d_model: dimension of the model
     """
@@ -53,6 +54,7 @@ class PARSeqDecoder(layers.Layer):
     """Implements decoder module of the PARSeq model
 
     Args:
+    ----
         d_model: dimension of the model
         num_heads: number of attention heads
         ffd: dimension of the feed forward layer
@@ -113,6 +115,7 @@ class PARSeq(_PARSeq, Model):
     Modified implementation based on the official Pytorch implementation: <https://github.com/baudm/parseq/tree/main`_.
 
     Args:
+    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         embedding_units: number of embedding units
@@ -191,9 +194,9 @@ class PARSeq(_PARSeq, Model):
             i = self.rng.choice(len(perm_pool), size=num_gen_perms - len(final_perms), replace=False)
             final_perms = tf.concat([final_perms, perm_pool[i[0] : i[1]]], axis=0)
         else:
-            perms.extend(
-                [tf.random.shuffle(tf.range(max_num_chars, dtype=tf.int32)) for _ in range(num_gen_perms - len(perms))]
-            )
+            perms.extend([
+                tf.random.shuffle(tf.range(max_num_chars, dtype=tf.int32)) for _ in range(num_gen_perms - len(perms))
+            ])
             final_perms = tf.stack(perms)
 
         comp = tf.reverse(final_perms, axis=[-1])
@@ -390,6 +393,8 @@ class PARSeq(_PARSeq, Model):
         else:
             logits = self.decode_autoregressive(features, **kwargs)
 
+        logits = _bf16_to_float32(logits)
+
         out: Dict[str, tf.Tensor] = {}
         if self.exportable:
             out["logits"] = logits
@@ -412,6 +417,7 @@ class PARSeqPostProcessor(_PARSeqPostProcessor):
     """Post processor for PARSeq architecture
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
@@ -421,10 +427,7 @@ class PARSeqPostProcessor(_PARSeqPostProcessor):
     ) -> List[Tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = tf.math.argmax(logits, axis=2)
-
-        probs = tf.gather(tf.nn.softmax(logits, axis=-1), out_idxs, axis=-1, batch_dims=2)
-        # Take the minimum confidence of the sequence
-        probs = tf.math.reduce_min(probs, axis=1)
+        preds_prob = tf.math.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1)
 
         # decode raw output of the model with tf_label_to_idx
         out_idxs = tf.cast(out_idxs, dtype="int32")
@@ -434,7 +437,13 @@ class PARSeqPostProcessor(_PARSeqPostProcessor):
         decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0]
         word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
 
-        return list(zip(word_values, probs.numpy().tolist()))
+        # compute probabilties for each word up to the EOS token
+        probs = [
+            preds_prob[i, : len(word)].numpy().clip(0, 1).mean().item() if word else 0.0
+            for i, word in enumerate(word_values)
+        ]
+
+        return list(zip(word_values, probs))
 
 
 def _parseq(
@@ -484,12 +493,14 @@ def parseq(pretrained: bool = False, **kwargs: Any) -> PARSeq:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the PARSeq architecture
 
     Returns:
+    -------
         text recognition architecture
     """
-
     return _parseq(
         "parseq",
         pretrained,
doctr/models/recognition/predictor/_utils.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -22,6 +22,7 @@ def split_crops(
     """Chunk crops horizontally to match a given aspect ratio
 
     Args:
+    ----
         crops: list of numpy array of shape (H, W, 3) if channels_last or (3, H, W) otherwise
         max_ratio: the maximum aspect ratio that won't trigger the chunk
         target_ratio: when crops are chunked, they will be chunked to match this aspect ratio
@@ -29,9 +30,9 @@ def split_crops(
         channels_last: whether the numpy array has dimensions in channels last order
 
     Returns:
+    -------
         a tuple with the new crops, their mapping, and a boolean specifying whether any remap is required
     """
-
     _remap_required = False
     crop_map: List[Union[int, Tuple[int, int]]] = []
     new_crops: List[np.ndarray] = []
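For context, `split_crops` is the function these docstring fixes describe: a crop whose width/height ratio exceeds `max_ratio` is chunked into overlapping pieces close to `target_ratio`, and the returned mapping lets the predictor stitch per-chunk transcriptions back together. A hypothetical call (argument order inferred from the docstring above, not guaranteed):

import numpy as np
from doctr.models.recognition.predictor._utils import split_crops

crops = [np.zeros((32, 400, 3), dtype=np.uint8)]  # aspect ratio 12.5, above max_ratio
new_crops, crop_map, remap_required = split_crops(crops, 8, 6, 1.4, True)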
doctr/models/recognition/predictor/pytorch.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -21,6 +21,7 @@ class RecognitionPredictor(nn.Module):
     """Implements an object able to identify character sequences in images
 
     Args:
+    ----
         pre_processor: transform inputs for easier batched model inference
         model: core detection architecture
         split_wide_crops: wether to use crop splitting for high aspect ratio crops
@@ -40,7 +41,7 @@ class RecognitionPredictor(nn.Module):
         self.dil_factor = 1.4  # Dilation factor to overlap the crops
         self.target_ar = 6  # Target aspect ratio
 
-    @torch.no_grad()
+    @torch.inference_mode()
     def forward(
         self,
         crops: Sequence[Union[np.ndarray, torch.Tensor]],
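The only functional change here is the decorator swap on `forward`: `torch.inference_mode()` disables gradient tracking like `torch.no_grad()` but additionally skips view and version-counter bookkeeping, so pure-inference code runs slightly faster; the trade-off is that its outputs cannot later re-enter autograd. A small illustration (not from the diff):

import torch

model = torch.nn.Linear(4, 2).eval()

with torch.no_grad():  # 0.7.0 behaviour: autograd off, tensors still versioned
    _ = model(torch.rand(1, 4))

with torch.inference_mode():  # 0.8.1 behaviour: also skips version tracking
    out = model(torch.rand(1, 4))

# `out` is an inference tensor: using it in an autograd graph raises a RuntimeError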
doctr/models/recognition/predictor/tensorflow.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -21,6 +21,7 @@ class RecognitionPredictor(NestedObject):
     """Implements an object able to identify character sequences in images
 
     Args:
+    ----
         pre_processor: transform inputs for easier batched model inference
         model: core detection architecture
         split_wide_crops: wether to use crop splitting for high aspect ratio crops
doctr/models/recognition/sar/pytorch.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -14,7 +14,7 @@ from torchvision.models._utils import IntermediateLayerGetter
 from doctr.datasets import VOCABS
 
 from ...classification import resnet31
-from ...utils.pytorch import load_pretrained_params
+from ...utils.pytorch import _bf16_to_float32, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["SAR", "sar_resnet31"]
@@ -25,7 +25,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (3, 32, 128),
         "vocab": VOCABS["french"],
-        "url":
+        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/sar_resnet31-9a1deedf.pt&src=0",
     },
 }
 
@@ -80,6 +80,7 @@ class SARDecoder(nn.Module):
     """Implements decoder module of the SAR model
 
     Args:
+    ----
         rnn_units: number of hidden units in recurrent cells
         max_length: maximum length of a sequence
         vocab_size: number of classes in the model alphabet
@@ -164,6 +165,7 @@ class SAR(nn.Module, RecognitionModel):
     Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
 
     Args:
+    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         rnn_units: number of hidden units in both encoder and decoder LSTM
@@ -249,7 +251,7 @@ class SAR(nn.Module, RecognitionModel):
         if self.training and target is None:
             raise ValueError("Need to provide labels during training for teacher forcing")
 
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt)
+        decoded_features = _bf16_to_float32(self.decoder(features, encoded, gt=None if target is None else gt))
 
         out: Dict[str, Any] = {}
         if self.exportable:
@@ -278,17 +280,19 @@ class SAR(nn.Module, RecognitionModel):
         Sequences are masked after the EOS character.
 
         Args:
+        ----
             model_output: predicted logits of the model
             gt: the encoded tensor with gt labels
             seq_len: lengths of each gt word inside the batch
 
         Returns:
+        -------
             The loss of the model on the batch
         """
         # Input length : number of timesteps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1  # type: ignore[assignment]
         # Compute loss
         # (N, L, vocab_size + 1)
         cce = F.cross_entropy(model_output.permute(0, 2, 1), gt, reduction="none")
@@ -303,6 +307,7 @@ class SARPostProcessor(RecognitionPostProcessor):
     """Post processor for SAR architectures
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
@@ -323,7 +328,7 @@ class SARPostProcessor(RecognitionPostProcessor):
             for encoded_seq in out_idxs.detach().cpu().numpy()
         ]
 
-        return list(zip(word_values, probs.numpy().tolist()))
+        return list(zip(word_values, probs.numpy().clip(0, 1).tolist()))
 
 
 def _sar(
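The `.clip(0, 1)` added to the returned probabilities is a guard rather than a behaviour change: under reduced precision, softmax outputs can overshoot 1.0 by a few ulps, which would otherwise surface as a confidence above 100%. For instance:

import numpy as np

probs = np.array([0.98, 1.0000002, 0.87], dtype=np.float32)  # float16/bf16 round-off can exceed 1.0
probs.clip(0, 1)  # -> array([0.98, 1.0, 0.87], dtype=float32)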
@@ -373,12 +378,14 @@ def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
     >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the SAR architecture
 
     Returns:
+    -------
         text recognition architecture
     """
-
     return _sar(
         "sar_resnet31",
         pretrained,
doctr/models/recognition/sar/tensorflow.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -13,7 +13,7 @@ from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
 from ...classification import resnet31
-from ...utils.tensorflow import load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["SAR", "sar_resnet31"]
@@ -33,18 +33,17 @@ class SAREncoder(layers.Layer, NestedObject):
     """Implements encoder module of the SAR model
 
     Args:
+    ----
         rnn_units: number of hidden rnn units
         dropout_prob: dropout probability
     """
 
     def __init__(self, rnn_units: int, dropout_prob: float = 0.0) -> None:
         super().__init__()
-        self.rnn = Sequential(
-            [
-                layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
-                layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
-            ]
-        )
+        self.rnn = Sequential([
+            layers.LSTM(units=rnn_units, return_sequences=True, recurrent_dropout=dropout_prob),
+            layers.LSTM(units=rnn_units, return_sequences=False, recurrent_dropout=dropout_prob),
+        ])
 
     def call(
         self,
@@ -59,6 +58,7 @@ class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
 
     Args:
+    ----
         attention_units: number of hidden attention units
 
     """
@@ -120,6 +120,7 @@ class SARDecoder(layers.Layer, NestedObject):
     """Implements decoder module of the SAR model
 
     Args:
+    ----
         rnn_units: number of hidden units in recurrent cells
         max_length: maximum length of a sequence
         vocab_size: number of classes in the model alphabet
@@ -147,9 +148,9 @@ class SARDecoder(layers.Layer, NestedObject):
         self.embed = layers.Dense(embedding_units, use_bias=False)
         self.embed_tgt = layers.Embedding(embedding_units, self.vocab_size + 1)
 
-        self.lstm_cells = layers.StackedRNNCells(
-            [layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells)]
-        )
+        self.lstm_cells = layers.StackedRNNCells([
+            layers.LSTMCell(rnn_units, implementation=1) for _ in range(num_decoder_cells)
+        ])
         self.attention_module = AttentionModule(attention_units)
         self.output_dense = layers.Dense(self.vocab_size + 1, use_bias=True)
         self.dropout = layers.Dropout(dropout_prob)
@@ -215,6 +216,7 @@ class SAR(Model, RecognitionModel):
     Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
 
     Args:
+    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         rnn_units: number of hidden units in both encoder and decoder LSTM
@@ -273,11 +275,13 @@ class SAR(Model, RecognitionModel):
         Sequences are masked after the EOS character.
 
         Args:
+        ----
             gt: the encoded tensor with gt labels
             model_output: predicted logits of the model
             seq_len: lengths of each gt word inside the batch
 
         Returns:
+        -------
             The loss of the model on the batch
         """
         # Input length : number of timesteps
@@ -316,7 +320,9 @@ class SAR(Model, RecognitionModel):
         if kwargs.get("training", False) and target is None:
             raise ValueError("Need to provide labels during training for teacher forcing")
 
-        decoded_features = self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
+        decoded_features = _bf16_to_float32(
+            self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
+        )
 
         out: Dict[str, tf.Tensor] = {}
         if self.exportable:
@@ -340,6 +346,7 @@ class SARPostProcessor(RecognitionPostProcessor):
     """Post processor for SAR architectures
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
@@ -362,7 +369,7 @@ class SARPostProcessor(RecognitionPostProcessor):
         decoded_strings_pred = tf.sparse.to_dense(decoded_strings_pred.to_sparse(), default_value="not valid")[:, 0]
         word_values = [word.decode() for word in decoded_strings_pred.numpy().tolist()]
 
-        return list(zip(word_values, probs.numpy().tolist()))
+        return list(zip(word_values, probs.numpy().clip(0, 1).tolist()))
 
 
 def _sar(
@@ -409,10 +416,12 @@ def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
    >>> out = model(input_tensor)
 
     Args:
+    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
+        **kwargs: keyword arguments of the SAR architecture
 
     Returns:
+    -------
         text recognition architecture
     """
-
     return _sar("sar_resnet31", pretrained, resnet31, **kwargs)
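Mirroring the PyTorch file, the TensorFlow model imports its own `_bf16_to_float32` from `...utils.tensorflow`. Its body is likewise not shown in this diff; a plausible sketch, assumed from the name and call sites only:

import tensorflow as tf

def _bf16_to_float32(x: tf.Tensor) -> tf.Tensor:
    # assumed mirror of the PyTorch helper: upcast bfloat16 outputs
    # before `.numpy()`-based post-processing
    return tf.cast(x, tf.float32) if x.dtype == tf.bfloat16 else x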
doctr/models/recognition/utils.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -14,12 +14,14 @@ def merge_strings(a: str, b: str, dil_factor: float) -> str:
     """Merges 2 character sequences in the best way to maximize the alignment of their overlapping characters.
 
     Args:
+    ----
         a: first char seq, suffix should be similar to b's prefix.
         b: second char seq, prefix should be similar to a's suffix.
         dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
             only used when the mother sequence is splitted on a character repetition
 
     Returns:
+    -------
         A merged character sequence.
 
     Example::
@@ -63,11 +65,13 @@ def merge_multi_strings(seq_list: List[str], dil_factor: float) -> str:
     """Recursively merges consecutive string sequences with overlapping characters.
 
     Args:
+    ----
         seq_list: list of sequences to merge. Sequences need to be ordered from left to right.
         dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
             only used when the mother sequence is splitted on a character repetition
 
     Returns:
+    -------
         A merged character sequence
 
     Example::
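A usage sketch for the documented helper (hypothetical inputs; the exact merged output depends on how the overlaps align, so the expected value is indicative only):

from doctr.models.recognition.utils import merge_multi_strings

# chunks of one wide crop, ordered left to right, with overlapping characters
merge_multi_strings(["abcdef", "defghi", "ghijkl"], dil_factor=1.4)
# expected to resolve the overlaps into something like 'abcdefghijkl'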
doctr/models/recognition/vitstr/base.py

@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2023, Mindee.
+# Copyright (C) 2021-2024, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -23,9 +23,11 @@ class _ViTSTR:
         sequence lengths.
 
         Args:
+        ----
            gts: list of ground-truth labels
 
        Returns:
+        -------
            A tuple of 2 tensors: Encoded labels and sequence lengths (for each entry of the batch)
        """
        encoded = encode_sequences(
@@ -43,6 +45,7 @@ class _ViTSTRPostProcessor(RecognitionPostProcessor):
     """Abstract class to postprocess the raw output of the model
 
     Args:
+    ----
         vocab: string containing the ordered sequence of supported characters
     """