python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/cord.py +10 -1
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +11 -2
- doctr/datasets/loader.py +1 -6
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +9 -3
- doctr/datasets/vocabs.py +15 -4
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +45 -12
- doctr/io/elements.py +52 -10
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +73 -15
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +47 -9
- doctr/models/classification/mobilenet/tensorflow.py +51 -14
- doctr/models/classification/predictor/pytorch.py +28 -17
- doctr/models/classification/predictor/tensorflow.py +26 -16
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +55 -19
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +15 -12
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +17 -3
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +8 -18
- doctr/models/kie_predictor/base.py +13 -3
- doctr/models/kie_predictor/pytorch.py +45 -20
- doctr/models/kie_predictor/tensorflow.py +44 -17
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +97 -58
- doctr/models/predictor/pytorch.py +35 -20
- doctr/models/predictor/tensorflow.py +35 -18
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +14 -11
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +10 -12
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +73 -14
- doctr/transforms/modules/tensorflow.py +78 -19
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +141 -31
- doctr/utils/metrics.py +34 -175
- doctr/utils/reconstitution.py +212 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
- python_doctr-0.10.0.dist-info/RECORD +173 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- python_doctr-0.8.1.dist-info/RECORD +0 -173
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
--- a/doctr/models/predictor/pytorch.py
+++ b/doctr/models/predictor/pytorch.py
@@ -10,10 +10,10 @@ import torch
 from torch import nn
 
 from doctr.io.elements import Document
-from doctr.models._utils import
+from doctr.models._utils import get_language
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import
+from doctr.utils.geometry import detach_scores
 
 from .base import _OCRPredictor
 
@@ -55,7 +55,13 @@ class OCRPredictor(nn.Module, _OCRPredictor):
         self.det_predictor = det_predictor.eval()  # type: ignore[attr-defined]
         self.reco_predictor = reco_predictor.eval()  # type: ignore[attr-defined]
         _OCRPredictor.__init__(
-            self,
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -81,19 +87,19 @@ class OCRPredictor(nn.Module, _OCRPredictor):
                 for out_map in out_maps
             ]
         if self.detect_orientation:
-
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)  # type: ignore[arg-type]
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-
-
-
-
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)  # type: ignore
+            # update page shapes after straightening
+            origin_page_shapes = [page.shape[:2] for page in pages]
+
             # Forward again to get predictions on straight pages
             loc_preds = self.det_predictor(pages, **kwargs)
 
@@ -102,30 +108,37 @@ class OCRPredictor(nn.Module, _OCRPredictor):
         ), "Detection Model in ocr_predictor should output only one class"
 
         loc_preds = [list(loc_pred.values())[0] for loc_pred in loc_preds]
+        # Detach objectness scores from loc_preds
+        loc_preds, objectness_scores = detach_scores(loc_preds)
         # Check whether crop mode should be switched to channels first
         channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray)
 
-        # Rectify crops if aspect ratio
-        loc_preds = self._remove_padding(pages, loc_preds)
-
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
             loc_preds = hook(loc_preds)
 
         # Crop images
         crops, loc_preds = self._prepare_crops(
-            pages,
+            pages,  # type: ignore[arg-type]
             loc_preds,
             channels_last=channels_last,
             assume_straight_pages=self.assume_straight_pages,
+            assume_horizontal=self._page_orientation_disabled,
         )
-        # Rectify crop orientation
+        # Rectify crop orientation and get crop orientation predictions
+        crop_orientations: Any = []
         if not self.assume_straight_pages:
-            crops, loc_preds = self._rectify_crops(crops, loc_preds)
+            crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
+            crop_orientations = [
+                {"value": orientation[0], "confidence": orientation[1]} for orientation in _crop_orientations
+            ]
+
         # Identify character sequences
         word_preds = self.reco_predictor([crop for page_crops in crops for crop in page_crops], **kwargs)
+        if not crop_orientations:
+            crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]
 
-        boxes, text_preds = self._process_predictions(loc_preds, word_preds)
+        boxes, text_preds, crop_orientations = self._process_predictions(loc_preds, word_preds, crop_orientations)
 
         if self.detect_language:
             languages = [get_language(" ".join([item[0] for item in text_pred])) for text_pred in text_preds]
@@ -134,10 +147,12 @@ class OCRPredictor(nn.Module, _OCRPredictor):
             languages_dict = None
 
         out = self.doc_builder(
-            pages,
+            pages,  # type: ignore[arg-type]
             boxes,
+            objectness_scores,
            text_preds,
-            origin_page_shapes,
+            origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations,
             orientations,
             languages_dict,
         )
--- a/doctr/models/predictor/tensorflow.py
+++ b/doctr/models/predictor/tensorflow.py
@@ -9,10 +9,10 @@ import numpy as np
 import tensorflow as tf
 
 from doctr.io.elements import Document
-from doctr.models._utils import
+from doctr.models._utils import get_language
 from doctr.models.detection.predictor import DetectionPredictor
 from doctr.models.recognition.predictor import RecognitionPredictor
-from doctr.utils.geometry import
+from doctr.utils.geometry import detach_scores
 from doctr.utils.repr import NestedObject
 
 from .base import _OCRPredictor
@@ -56,7 +56,13 @@ class OCRPredictor(NestedObject, _OCRPredictor):
         self.det_predictor = det_predictor
         self.reco_predictor = reco_predictor
         _OCRPredictor.__init__(
-            self,
+            self,
+            assume_straight_pages,
+            straighten_pages,
+            preserve_aspect_ratio,
+            symmetric_pad,
+            detect_orientation,
+            **kwargs,
         )
         self.detect_orientation = detect_orientation
         self.detect_language = detect_language
@@ -81,19 +87,19 @@ class OCRPredictor(NestedObject, _OCRPredictor):
                 for out_map in out_maps
             ]
         if self.detect_orientation:
-
+            general_pages_orientations, origin_pages_orientations = self._get_orientations(pages, seg_maps)
             orientations = [
-                {"value": orientation_page, "confidence": None} for orientation_page in
+                {"value": orientation_page, "confidence": None} for orientation_page in origin_pages_orientations
             ]
         else:
             orientations = None
+            general_pages_orientations = None
+            origin_pages_orientations = None
         if self.straighten_pages:
-
-
-
-
-            )
-            pages = [rotate_image(page, -angle, expand=False) for page, angle in zip(pages, origin_page_orientations)]
+            pages = self._straighten_pages(pages, seg_maps, general_pages_orientations, origin_pages_orientations)
+            # update page shapes after straightening
+            origin_page_shapes = [page.shape[:2] for page in pages]
+
             # forward again to get predictions on straight pages
             loc_preds_dict = self.det_predictor(pages, **kwargs)  # type: ignore[assignment]
 
@@ -101,9 +107,8 @@ class OCRPredictor(NestedObject, _OCRPredictor):
             len(loc_pred) == 1 for loc_pred in loc_preds_dict
         ), "Detection Model in ocr_predictor should output only one class"
         loc_preds: List[np.ndarray] = [list(loc_pred.values())[0] for loc_pred in loc_preds_dict]  # type: ignore[union-attr]
-
-
-        loc_preds = self._remove_padding(pages, loc_preds)
+        # Detach objectness scores from loc_preds
+        loc_preds, objectness_scores = detach_scores(loc_preds)
 
         # Apply hooks to loc_preds if any
         for hook in self.hooks:
@@ -111,16 +116,26 @@ class OCRPredictor(NestedObject, _OCRPredictor):
 
         # Crop images
         crops, loc_preds = self._prepare_crops(
-            pages,
+            pages,
+            loc_preds,
+            channels_last=True,
+            assume_straight_pages=self.assume_straight_pages,
+            assume_horizontal=self._page_orientation_disabled,
         )
-        # Rectify crop orientation
+        # Rectify crop orientation and get crop orientation predictions
+        crop_orientations: Any = []
         if not self.assume_straight_pages:
-            crops, loc_preds = self._rectify_crops(crops, loc_preds)
+            crops, loc_preds, _crop_orientations = self._rectify_crops(crops, loc_preds)
+            crop_orientations = [
+                {"value": orientation[0], "confidence": orientation[1]} for orientation in _crop_orientations
+            ]
 
         # Identify character sequences
         word_preds = self.reco_predictor([crop for page_crops in crops for crop in page_crops], **kwargs)
+        if not crop_orientations:
+            crop_orientations = [{"value": 0, "confidence": None} for _ in word_preds]
 
-        boxes, text_preds = self._process_predictions(loc_preds, word_preds)
+        boxes, text_preds, crop_orientations = self._process_predictions(loc_preds, word_preds, crop_orientations)
 
         if self.detect_language:
             languages = [get_language(" ".join([item[0] for item in text_pred])) for text_pred in text_preds]
@@ -131,8 +146,10 @@ class OCRPredictor(NestedObject, _OCRPredictor):
         out = self.doc_builder(
             pages,
             boxes,
+            objectness_scores,
             text_preds,
             origin_page_shapes,  # type: ignore[arg-type]
+            crop_orientations,
             orientations,
             languages_dict,
         )
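Illustrative sketch (not part of the diff): the two extra arguments now passed to doc_builder above, objectness_scores and crop_orientations, surface as per-word fields on the rendered document. The attribute names objectness_score and crop_orientation below are assumptions to verify against the 0.10.0 doctr.io.elements API rather than something shown in this diff.

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

predictor = ocr_predictor(pretrained=True)
pages = DocumentFile.from_images(["page.jpg"])
result = predictor(pages)

for page in result.pages:
    for block in page.blocks:
        for line in block.lines:
            for word in line.words:
                # per-word outputs added in 0.10.0 (assumed attribute names)
                print(word.value, word.confidence, word.objectness_score, word.crop_orientation)
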
--- a/doctr/models/preprocessor/pytorch.py
+++ b/doctr/models/preprocessor/pytorch.py
@@ -79,7 +79,7 @@ class PreProcessor(nn.Module):
         else:
             x = x.to(dtype=torch.float32)  # type: ignore[union-attr]
 
-        return x
+        return x
 
     def __call__(self, x: Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, np.ndarray]]]) -> List[torch.Tensor]:
         """Prepare document data for model forwarding
@@ -103,7 +103,7 @@ class PreProcessor(nn.Module):
             elif x.dtype not in (torch.uint8, torch.float16, torch.float32):
                 raise TypeError("unsupported data type for torch.Tensor")
             # Resizing
-            if x.shape[-2] != self.resize.size[0] or x.shape[-1] != self.resize.size[1]:
+            if x.shape[-2] != self.resize.size[0] or x.shape[-1] != self.resize.size[1]:
                 x = F.resize(
                     x, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
                 )
@@ -118,11 +118,11 @@ class PreProcessor(nn.Module):
             # Sample transform (to tensor, resize)
             samples = list(multithread_exec(self.sample_transforms, x))
             # Batching
-            batches = self.batch_inputs(samples)
+            batches = self.batch_inputs(samples)
         else:
             raise TypeError(f"invalid input type: {type(x)}")
 
         # Batch transforms (normalize)
         batches = list(multithread_exec(self.normalize, batches))
 
-        return batches
+        return batches
--- a/doctr/models/preprocessor/tensorflow.py
+++ b/doctr/models/preprocessor/tensorflow.py
@@ -41,6 +41,7 @@ class PreProcessor(NestedObject):
         self.resize = Resize(output_size, **kwargs)
         # Perform the division by 255 at the same time
         self.normalize = Normalize(mean, std)
+        self._runs_on_cuda = tf.config.list_physical_devices("GPU") != []
 
     def batch_inputs(self, samples: List[tf.Tensor]) -> List[tf.Tensor]:
         """Gather samples into batches for inference purposes
@@ -113,13 +114,13 @@ class PreProcessor(NestedObject):
 
         elif isinstance(x, list) and all(isinstance(sample, (np.ndarray, tf.Tensor)) for sample in x):
             # Sample transform (to tensor, resize)
-            samples = list(multithread_exec(self.sample_transforms, x))
+            samples = list(multithread_exec(self.sample_transforms, x, threads=1 if self._runs_on_cuda else None))
             # Batching
             batches = self.batch_inputs(samples)
         else:
             raise TypeError(f"invalid input type: {type(x)}")
 
         # Batch transforms (normalize)
-        batches = list(multithread_exec(self.normalize, batches))
+        batches = list(multithread_exec(self.normalize, batches, threads=1 if self._runs_on_cuda else None))
 
         return batches
--- a/doctr/models/recognition/crnn/tensorflow.py
+++ b/doctr/models/recognition/crnn/tensorflow.py
@@ -13,7 +13,7 @@ from tensorflow.keras.models import Model, Sequential
 from doctr.datasets import VOCABS
 
 from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
@@ -24,21 +24,21 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["legacy_french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_vgg16_bn-9c188f45.weights.h5&src=0",
     },
     "crnn_mobilenet_v3_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_small-54850265.weights.h5&src=0",
     },
     "crnn_mobilenet_v3_large": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/crnn_mobilenet_v3_large-c64045e5.weights.h5&src=0",
     },
 }
 
@@ -128,7 +128,7 @@ class CRNN(RecognitionModel, Model):
 
     def __init__(
         self,
-        feature_extractor:
+        feature_extractor: Model,
         vocab: str,
         rnn_units: int = 128,
         exportable: bool = False,
@@ -245,9 +245,11 @@ def _crnn(
 
     # Build the model
     model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
-
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
 
     return model
 
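A minimal sketch of what the skip_mismatch change in _crnn above (and the matching changes in master, parseq and sar below) enables, assuming the TensorFlow backend and that the recognition factory functions still accept a vocab keyword as in 0.8.x: loading pretrained weights with a custom vocab now skips the layers whose shapes no longer match instead of failing, leaving those layers freshly initialized for fine-tuning.

from doctr.models import crnn_vgg16_bn

# Pretrained weights are loaded for every layer except the vocab-dependent head,
# which is skipped (skip_mismatch=True) because the given vocab differs from the default one.
model = crnn_vgg16_bn(pretrained=True, vocab="0123456789")
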
--- a/doctr/models/recognition/master/pytorch.py
+++ b/doctr/models/recognition/master/pytorch.py
@@ -107,7 +107,7 @@ class MASTER(_MASTER, nn.Module):
         # NOTE: nn.TransformerDecoder takes the inverse from this implementation
         # [True, True, True, ..., False, False, False] -> False is masked
         # (N, 1, 1, max_length)
-        target_pad_mask = (target != self.vocab_size + 2).unsqueeze(1).unsqueeze(1)
+        target_pad_mask = (target != self.vocab_size + 2).unsqueeze(1).unsqueeze(1)
         target_length = target.size(1)
         # sub mask filled diagonal with True = see and False = masked (max_length, max_length)
         # NOTE: onnxruntime tril/triu works only with float currently (onnxruntime 1.11.1 - opset 14)
@@ -142,7 +142,7 @@ class MASTER(_MASTER, nn.Module):
         # Input length : number of timesteps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token (sos disappear in shift!)
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1
         # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
         # The "masked" first gt char is <sos>. Delete last logit of the model output.
         cce = F.cross_entropy(model_output[:, :-1, :].permute(0, 2, 1), gt[:, 1:], reduction="none")
--- a/doctr/models/recognition/master/tensorflow.py
+++ b/doctr/models/recognition/master/tensorflow.py
@@ -13,7 +13,7 @@ from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
 from doctr.models.modules.transformer import Decoder, PositionalEncoding
 
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _MASTER, _MASTERPostProcessor
 
 __all__ = ["MASTER", "master"]
@@ -25,7 +25,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/master-d7fdaeff.weights.h5&src=0",
     },
 }
 
@@ -51,7 +51,7 @@ class MASTER(_MASTER, Model):
 
     def __init__(
         self,
-        feature_extractor:
+        feature_extractor: Model,
         vocab: str,
         d_model: int = 512,
         dff: int = 2048,
@@ -290,9 +290,14 @@ def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )
 
     return model
 
--- a/doctr/models/recognition/parseq/pytorch.py
+++ b/doctr/models/recognition/parseq/pytorch.py
@@ -212,7 +212,7 @@ class PARSeq(_PARSeq, nn.Module):
 
         sos_idx = torch.zeros(len(final_perms), 1, device=seqlen.device)
         eos_idx = torch.full((len(final_perms), 1), max_num_chars + 1, device=seqlen.device)
-        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
+        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
         if len(combined) > 1:
             combined[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=seqlen.device)
         return combined
@@ -282,7 +282,8 @@ class PARSeq(_PARSeq, nn.Module):
             ys[:, i + 1] = pos_prob.squeeze().argmax(-1)
 
             # Stop decoding if all sequences have reached the EOS token
-
+            # NOTE: `break` isn't correctly translated to Onnx so we don't break here if we want to export
+            if not self.exportable and max_len is None and (ys == self.vocab_size).any(dim=-1).all():
                 break
 
         logits = torch.cat(pos_logits, dim=1)  # (N, max_length, vocab_size + 1)
@@ -297,7 +298,7 @@ class PARSeq(_PARSeq, nn.Module):
 
         # Create padding mask for refined target input maskes all behind EOS token as False
         # (N, 1, 1, max_length)
-        target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)
+        target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)
         mask = (target_pad_mask.bool() & query_mask[:, : ys.shape[1]].bool()).int()
         logits = self.head(self.decode(ys, features, mask, target_query=pos_queries))
 
--- a/doctr/models/recognition/parseq/tensorflow.py
+++ b/doctr/models/recognition/parseq/tensorflow.py
@@ -16,7 +16,7 @@ from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward
 
 from ...classification import vit_s
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _PARSeq, _PARSeqPostProcessor
 
 __all__ = ["PARSeq", "parseq"]
@@ -27,7 +27,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/parseq-4152a87e.weights.h5&src=0",
     },
 }
 
@@ -43,7 +43,7 @@ class CharEmbedding(layers.Layer):
 
     def __init__(self, vocab_size: int, d_model: int):
         super(CharEmbedding, self).__init__()
-        self.embedding =
+        self.embedding = layers.Embedding(vocab_size, d_model)
         self.d_model = d_model
 
     def call(self, x: tf.Tensor, **kwargs: Any) -> tf.Tensor:
@@ -167,7 +167,6 @@ class PARSeq(_PARSeq, Model):
 
         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
 
-    @tf.function
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -214,7 +213,6 @@ class PARSeq(_PARSeq, Model):
         )
         return combined
 
-    @tf.function
     def generate_permutations_attention_masks(self, permutation: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
         # Generate source and target mask for the decoder attention.
         sz = permutation.shape[0]
@@ -234,11 +232,10 @@ class PARSeq(_PARSeq, Model):
         target_mask = mask[1:, :-1]
         return tf.cast(source_mask, dtype=tf.bool), tf.cast(target_mask, dtype=tf.bool)
 
-    @tf.function
     def decode(
         self,
         target: tf.Tensor,
-        memory: tf,
+        memory: tf.Tensor,
         target_mask: Optional[tf.Tensor] = None,
         target_query: Optional[tf.Tensor] = None,
         **kwargs: Any,
@@ -288,10 +285,11 @@ class PARSeq(_PARSeq, Model):
             )
 
             # Stop decoding if all sequences have reached the EOS token
-            #
+            # NOTE: `break` isn't correctly translated to Onnx so we don't break here if we want to export
             if (
-
-                and
+                not self.exportable
+                and max_len is None
+                and tf.reduce_any(tf.reduce_all(tf.equal(ys, tf.constant(self.vocab_size)), axis=-1))
             ):
                 break
 
@@ -475,9 +473,14 @@ def _parseq(
 
     # Build the model
     model = PARSeq(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )
 
     return model
 
--- a/doctr/models/recognition/sar/pytorch.py
+++ b/doctr/models/recognition/sar/pytorch.py
@@ -125,25 +125,26 @@ class SARDecoder(nn.Module):
             if t == 0:
                 # step to init the first states of the LSTMCell
                 hidden_state_init = cell_state_init = torch.zeros(
-                    features.size(0), features.size(1), device=features.device
+                    features.size(0), features.size(1), device=features.device, dtype=features.dtype
                 )
                 hidden_state, cell_state = hidden_state_init, cell_state_init
                 prev_symbol = holistic
             elif t == 1:
                 # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros
                 # (N, vocab_size + 1) --> (N, embedding_units)
-                prev_symbol = torch.zeros(
+                prev_symbol = torch.zeros(
+                    features.size(0), self.vocab_size + 1, device=features.device, dtype=features.dtype
+                )
                 prev_symbol = self.embed(prev_symbol)
             else:
-                if gt is not None:
+                if gt is not None and self.training:
                     # (N, embedding_units) -2 because of <bos> and <eos> (same)
                     prev_symbol = self.embed(gt_embedding[:, t - 2])
                 else:
                     # -1 to start at timestep where prev_symbol was initialized
                     index = logits_list[t - 1].argmax(-1)
                     # update prev_symbol with ones at the index of the previous logit vector
-
-                    prev_symbol = prev_symbol.scatter_(1, index.unsqueeze(1), 1)
+                    prev_symbol = self.embed(self.embed_tgt(index))
 
             # (N, C), (N, C) take the last hidden state and cell state from current timestep
             hidden_state_init, cell_state_init = self.lstm_cell(prev_symbol, (hidden_state_init, cell_state_init))
@@ -292,7 +293,7 @@ class SAR(nn.Module, RecognitionModel):
         # Input length : number of timesteps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1
         # Compute loss
         # (N, L, vocab_size + 1)
         cce = F.cross_entropy(model_output.permute(0, 2, 1), gt, reduction="none")
--- a/doctr/models/recognition/sar/tensorflow.py
+++ b/doctr/models/recognition/sar/tensorflow.py
@@ -13,7 +13,7 @@ from doctr.datasets import VOCABS
 from doctr.utils.repr import NestedObject
 
 from ...classification import resnet31
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor
 
 __all__ = ["SAR", "sar_resnet31"]
@@ -24,7 +24,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/sar_resnet31-5a58806c.weights.h5&src=0",
     },
 }
 
@@ -177,23 +177,17 @@ class SARDecoder(layers.Layer, NestedObject):
             elif t == 1:
                 # step to init a 'blank' sequence of length vocab_size + 1 filled with zeros
                 # (N, vocab_size + 1) --> (N, embedding_units)
-                prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1])
+                prev_symbol = tf.zeros([features.shape[0], self.vocab_size + 1], dtype=features.dtype)
                 prev_symbol = self.embed(prev_symbol, **kwargs)
             else:
-                if gt is not None:
+                if gt is not None and kwargs.get("training", False):
                     # (N, embedding_units) -2 because of <bos> and <eos> (same)
                     prev_symbol = self.embed(gt_embedding[:, t - 2], **kwargs)
                 else:
                     # -1 to start at timestep where prev_symbol was initialized
                     index = tf.argmax(logits_list[t - 1], axis=-1)
                     # update prev_symbol with ones at the index of the previous logit vector
-
-                    index = tf.ones_like(index)
-                    prev_symbol = tf.scatter_nd(
-                        tf.expand_dims(index, axis=1),
-                        prev_symbol,
-                        tf.constant([features.shape[0], features.shape[-1]], dtype=tf.int64),
-                    )
+                    prev_symbol = self.embed(self.embed_tgt(index, **kwargs), **kwargs)
 
             # (N, C), (N, C) take the last hidden state and cell state from current timestep
             _, states = self.lstm_cells(prev_symbol, states, **kwargs)
@@ -398,9 +392,13 @@ def _sar(
 
     # Build the model
     model = SAR(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
-
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )
 
     return model
 
--- a/doctr/models/recognition/vitstr/pytorch.py
+++ b/doctr/models/recognition/vitstr/pytorch.py
@@ -137,7 +137,7 @@ class ViTSTR(_ViTSTR, nn.Module):
         # Input length : number of steps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token (sos disappear in shift!)
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1
         # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
         # The "masked" first gt char is <sos>.
         cce = F.cross_entropy(model_output.permute(0, 2, 1), gt[:, 1:], reduction="none")