python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/__init__.py +1 -0
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +10 -8
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +9 -8
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +5 -6
- doctr/datasets/ic13.py +6 -6
- doctr/datasets/iiit5k.py +10 -6
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -7
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +4 -5
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +7 -6
- doctr/datasets/svt.py +6 -7
- doctr/datasets/synthtext.py +19 -7
- doctr/datasets/utils.py +41 -35
- doctr/datasets/vocabs.py +1107 -49
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +11 -7
- doctr/io/elements.py +96 -82
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +15 -23
- doctr/models/builder.py +30 -48
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +11 -15
- doctr/models/classification/magc_resnet/tensorflow.py +11 -14
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +20 -18
- doctr/models/classification/mobilenet/tensorflow.py +19 -23
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +7 -9
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +47 -34
- doctr/models/classification/resnet/tensorflow.py +45 -35
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +20 -18
- doctr/models/classification/textnet/tensorflow.py +19 -17
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +21 -8
- doctr/models/classification/vgg/tensorflow.py +20 -14
- doctr/models/classification/vip/__init__.py +4 -0
- doctr/models/classification/vip/layers/__init__.py +4 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +18 -15
- doctr/models/classification/vit/tensorflow.py +15 -12
- doctr/models/classification/zoo.py +23 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +10 -21
- doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
- doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +8 -17
- doctr/models/detection/fast/pytorch.py +37 -35
- doctr/models/detection/fast/tensorflow.py +24 -28
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +8 -18
- doctr/models/detection/linknet/pytorch.py +34 -28
- doctr/models/detection/linknet/tensorflow.py +24 -25
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +6 -10
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +19 -20
- doctr/models/kie_predictor/tensorflow.py +14 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +55 -10
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +13 -14
- doctr/models/predictor/tensorflow.py +9 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +30 -29
- doctr/models/recognition/crnn/tensorflow.py +21 -24
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +32 -25
- doctr/models/recognition/master/tensorflow.py +22 -25
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +47 -29
- doctr/models/recognition/parseq/tensorflow.py +29 -27
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +111 -52
- doctr/models/recognition/predictor/pytorch.py +9 -9
- doctr/models/recognition/predictor/tensorflow.py +8 -9
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +30 -22
- doctr/models/recognition/sar/tensorflow.py +22 -24
- doctr/models/recognition/utils.py +57 -53
- doctr/models/recognition/viptr/__init__.py +4 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +28 -21
- doctr/models/recognition/vitstr/tensorflow.py +22 -23
- doctr/models/recognition/zoo.py +27 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +41 -34
- doctr/models/utils/tensorflow.py +31 -23
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +9 -13
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +17 -48
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
- python_doctr-0.12.0.dist-info/RECORD +180 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/models/recognition/vitstr/pytorch.py
CHANGED

@@ -1,10 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 import torch
 from torch import nn
@@ -19,7 +20,7 @@ from .base import _ViTSTR, _ViTSTRPostProcessor
 
 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vitstr_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -42,7 +43,6 @@ class ViTSTR(_ViTSTR, nn.Module):
     Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.
 
     Args:
-    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         embedding_units: number of embedding units
@@ -59,9 +59,9 @@ class ViTSTR(_ViTSTR, nn.Module):
         vocab: str,
         embedding_units: int,
         max_length: int = 32,  # different from paper
-        input_shape:
+        input_shape: tuple[int, int, int] = (3, 32, 128),  # different from paper
         exportable: bool = False,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.vocab = vocab
@@ -74,13 +74,22 @@ class ViTSTR(_ViTSTR, nn.Module):
 
         self.postprocessor = ViTSTRPostProcessor(vocab=self.vocab)
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def forward(
         self,
         x: torch.Tensor,
-        target:
+        target: list[str] | None = None,
         return_model_output: bool = False,
         return_preds: bool = False,
-    ) ->
+    ) -> dict[str, Any]:
         features = self.feat_extractor(x)["features"]  # (batch_size, patches_seqlen, d_model)
 
         if target is not None:
@@ -98,7 +107,7 @@ class ViTSTR(_ViTSTR, nn.Module):
         logits = self.head(features).view(B, N, len(self.vocab) + 1)  # (batch_size, max_length, vocab + 1)
         decoded_features = _bf16_to_float32(logits[:, 1:])  # remove cls_token
 
-        out:
+        out: dict[str, Any] = {}
         if self.exportable:
             out["logits"] = decoded_features
             return out
@@ -107,8 +116,13 @@ class ViTSTR(_ViTSTR, nn.Module):
             out["out_map"] = decoded_features
 
         if target is None or return_preds:
+            # Disable for torch.compile compatibility
+            @torch.compiler.disable  # type: ignore[attr-defined]
+            def _postprocess(decoded_features: torch.Tensor) -> list[tuple[str, float]]:
+                return self.postprocessor(decoded_features)
+
             # Post-process boxes
-            out["preds"] =
+            out["preds"] = _postprocess(decoded_features)
 
         if target is not None:
             out["loss"] = self.compute_loss(decoded_features, gt, seq_len)
@@ -125,19 +139,17 @@ class ViTSTR(_ViTSTR, nn.Module):
         Sequences are masked after the EOS character.
 
         Args:
-        ----
            model_output: predicted logits of the model
            gt: the encoded tensor with gt labels
            seq_len: lengths of each gt word inside the batch
 
        Returns:
-        -------
            The loss of the model on the batch
        """
        # Input length : number of steps
        input_len = model_output.shape[1]
        # Add one for additional <eos> token (sos disappear in shift!)
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1  # type: ignore[assignment]
        # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
        # The "masked" first gt char is <sos>.
        cce = F.cross_entropy(model_output.permute(0, 2, 1), gt[:, 1:], reduction="none")
@@ -153,14 +165,13 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
     """Post processor for ViTSTR architecture
 
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
     def __call__(
         self,
         logits: torch.Tensor,
-    ) ->
+    ) -> list[tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = logits.argmax(-1)
         preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]
@@ -183,7 +194,7 @@ def _vitstr(
     pretrained: bool,
     backbone_fn: Callable[[bool], nn.Module],
     layer: str,
-    ignore_keys:
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> ViTSTR:
     # Patch the config
@@ -212,7 +223,7 @@ def _vitstr(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
 
     return model
 
@@ -228,12 +239,10 @@ def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         kwargs: keyword arguments of the ViTSTR architecture
 
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(
@@ -259,12 +268,10 @@ def vitstr_base(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         kwargs: keyword arguments of the ViTSTR architecture
 
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(
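
The hunks above add a from_pretrained method to the PyTorch ViTSTR model. A minimal usage sketch, assuming a locally fine-tuned checkpoint (the file name is illustrative, not a release asset):

    from doctr.models import vitstr_small

    model = vitstr_small(pretrained=False)
    # path_or_url accepts either a local checkpoint path or a URL, per the docstring above
    model.from_pretrained("./vitstr_small-finetuned.pt")
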
doctr/models/recognition/vitstr/tensorflow.py
CHANGED

@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from copy import deepcopy
-from typing import Any
+from typing import Any
 
 import tensorflow as tf
 from tensorflow.keras import Model, layers
@@ -17,7 +17,7 @@ from .base import _ViTSTR, _ViTSTRPostProcessor
 
 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
 
-default_cfgs:
+default_cfgs: dict[str, dict[str, Any]] = {
     "vitstr_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -40,7 +40,6 @@ class ViTSTR(_ViTSTR, Model):
     Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.
 
     Args:
-    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         embedding_units: number of embedding units
@@ -51,7 +50,7 @@ class ViTSTR(_ViTSTR, Model):
         cfg: dictionary containing information about the model
     """
 
-    _children_names:
+    _children_names: list[str] = ["feat_extractor", "postprocessor"]
 
     def __init__(
         self,
@@ -60,9 +59,9 @@ class ViTSTR(_ViTSTR, Model):
         embedding_units: int,
         max_length: int = 32,
         dropout_prob: float = 0.0,
-        input_shape:
+        input_shape: tuple[int, int, int] = (32, 128, 3),  # different from paper
         exportable: bool = False,
-        cfg:
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.vocab = vocab
@@ -75,23 +74,30 @@ class ViTSTR(_ViTSTR, Model):
 
         self.postprocessor = ViTSTRPostProcessor(vocab=self.vocab)
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     @staticmethod
     def compute_loss(
         model_output: tf.Tensor,
         gt: tf.Tensor,
-        seq_len:
+        seq_len: list[int],
     ) -> tf.Tensor:
         """Compute categorical cross-entropy loss for the model.
         Sequences are masked after the EOS character.
 
         Args:
-        ----
             model_output: predicted logits of the model
             gt: the encoded tensor with gt labels
             seq_len: lengths of each gt word inside the batch
 
         Returns:
-        -------
             The loss of the model on the batch
         """
         # Input length : number of steps
@@ -114,11 +120,11 @@ class ViTSTR(_ViTSTR, Model):
     def call(
         self,
         x: tf.Tensor,
-        target:
+        target: list[str] | None = None,
         return_model_output: bool = False,
         return_preds: bool = False,
         **kwargs: Any,
-    ) ->
+    ) -> dict[str, Any]:
         features = self.feat_extractor(x, **kwargs)  # (batch_size, patches_seqlen, d_model)
 
         if target is not None:
@@ -136,7 +142,7 @@ class ViTSTR(_ViTSTR, Model):
         )  # (batch_size, max_length, vocab + 1)
         decoded_features = _bf16_to_float32(logits[:, 1:])  # remove cls_token
 
-        out:
+        out: dict[str, tf.Tensor] = {}
         if self.exportable:
             out["logits"] = decoded_features
             return out
@@ -158,14 +164,13 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
     """Post processor for ViTSTR architecture
 
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """
 
     def __call__(
         self,
         logits: tf.Tensor,
-    ) ->
+    ) -> list[tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = tf.math.argmax(logits, axis=2)
         preds_prob = tf.math.reduce_max(tf.nn.softmax(logits, axis=-1), axis=-1)
@@ -191,7 +196,7 @@ def _vitstr(
     arch: str,
     pretrained: bool,
     backbone_fn,
-    input_shape:
+    input_shape: tuple[int, int, int] | None = None,
     **kwargs: Any,
 ) -> ViTSTR:
     # Patch the config
@@ -221,9 +226,7 @@ def _vitstr(
     # Load pretrained parameters
     if pretrained:
         # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
-
-            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
-        )
+        model.from_pretrained(default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
 
     return model
 
@@ -239,12 +242,10 @@ def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         **kwargs: keyword arguments of the ViTSTR architecture
 
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(
@@ -268,12 +269,10 @@ def vitstr_base(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
 
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         **kwargs: keyword arguments of the ViTSTR architecture
 
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(
doctr/models/recognition/zoo.py
CHANGED

@@ -1,11 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any
 
-from doctr.file_utils import is_tf_available
+from doctr.file_utils import is_tf_available, is_torch_available
 from doctr.models.preprocessor import PreProcessor
 
 from .. import recognition
@@ -14,7 +14,7 @@ from .predictor import RecognitionPredictor
 __all__ = ["recognition_predictor"]
 
 
-ARCHS:
+ARCHS: list[str] = [
     "crnn_vgg16_bn",
     "crnn_mobilenet_v3_small",
     "crnn_mobilenet_v3_large",
@@ -25,6 +25,9 @@ ARCHS: List[str] = [
     "parseq",
 ]
 
+if is_torch_available():
+    ARCHS.extend(["viptr_tiny"])
+
 
 def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
     if isinstance(arch, str):
@@ -35,9 +38,16 @@ def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredict
             pretrained=pretrained, pretrained_backbone=kwargs.get("pretrained_backbone", True)
         )
     else:
-
-
-
+        allowed_archs = [recognition.CRNN, recognition.SAR, recognition.MASTER, recognition.ViTSTR, recognition.PARSeq]
+        if is_torch_available():
+            # Add VIPTR which is only available in torch at the moment
+            allowed_archs.append(recognition.VIPTR)
+            # Adding the type for torch compiled models to the allowed architectures
+            from doctr.models.utils import _CompiledModule
+
+            allowed_archs.append(_CompiledModule)
+
+        if not isinstance(arch, tuple(allowed_archs)):
            raise ValueError(f"unknown architecture: {type(arch)}")
        _model = arch
 
@@ -52,7 +62,13 @@ def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredict
     return predictor
 
 
-def recognition_predictor(
+def recognition_predictor(
+    arch: Any = "crnn_vgg16_bn",
+    pretrained: bool = False,
+    symmetric_pad: bool = False,
+    batch_size: int = 128,
+    **kwargs: Any,
+) -> RecognitionPredictor:
     """Text recognition architecture.
 
     Example::
@@ -63,13 +79,13 @@ def recognition_predictor(arch: Any = "crnn_vgg16_bn", pretrained: bool = False,
     >>> out = model([input_page])
 
     Args:
-    ----
         arch: name of the architecture or model itself to use (e.g. 'crnn_vgg16_bn')
         pretrained: If True, returns a model pre-trained on our text recognition dataset
+        symmetric_pad: if True, pad the image symmetrically instead of padding at the bottom-right
+        batch_size: number of samples the model processes in parallel
         **kwargs: optional parameters to be passed to the architecture
 
     Returns:
-    -------
         Recognition predictor
     """
-    return _predictor(arch, pretrained, **kwargs)
+    return _predictor(arch=arch, pretrained=pretrained, symmetric_pad=symmetric_pad, batch_size=batch_size, **kwargs)
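
A minimal sketch of how the expanded recognition_predictor signature above could be called; symmetric_pad and batch_size values here are illustrative, not the defaults from the release:

    from doctr.models import recognition_predictor

    predictor = recognition_predictor(
        arch="crnn_vgg16_bn",
        pretrained=True,
        symmetric_pad=True,  # pad crops symmetrically rather than bottom-right
        batch_size=64,       # number of crops processed in parallel
    )
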
doctr/models/utils/__init__.py
CHANGED

@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
 
-if
-    from .
-elif
-    from .
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
+    from .tensorflow import *  # type: ignore[assignment]
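
With this dispatch in place, the framework-specific helpers resolve through a single import path; a minimal sketch, assuming a PyTorch install:

    # Resolves to doctr/models/utils/pytorch.py when torch is available
    from doctr.models.utils import load_pretrained_params, export_model_to_onnx
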
doctr/models/utils/pytorch.py
CHANGED

@@ -1,12 +1,13 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import logging
-from typing import Any
+from typing import Any
 
 import torch
+import validators
 from torch import nn
 
 from doctr.utils.data import download_from_url
@@ -18,8 +19,12 @@ __all__ = [
     "export_model_to_onnx",
     "_copy_tensor",
     "_bf16_to_float32",
+    "_CompiledModule",
 ]
 
+# torch compiled model type
+_CompiledModule = torch._dynamo.eval_frame.OptimizedModule
+
 
 def _copy_tensor(x: torch.Tensor) -> torch.Tensor:
     return x.clone().detach()
@@ -32,42 +37,50 @@ def _bf16_to_float32(x: torch.Tensor) -> torch.Tensor:
 
 def load_pretrained_params(
     model: nn.Module,
-
-    hash_prefix:
-    ignore_keys:
+    path_or_url: str | None = None,
+    hash_prefix: str | None = None,
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> None:
     """Load a set of parameters onto a model
 
     >>> from doctr.models import load_pretrained_params
-    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.
+    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.pt")
 
     Args:
-    ----
         model: the PyTorch model to be loaded
-
+        path_or_url: the path or URL to the model parameters (checkpoint)
         hash_prefix: first characters of SHA256 expected hash
         ignore_keys: list of weights to be ignored from the state_dict
         **kwargs: additional arguments to be passed to `doctr.utils.data.download_from_url`
     """
-    if
-        logging.warning("
-
-
+    if path_or_url is None:
+        logging.warning("No model URL or Path provided, using default initialization.")
+        return
+
+    archive_path = (
+        download_from_url(path_or_url, hash_prefix=hash_prefix, cache_subdir="models", **kwargs)
+        if validators.url(path_or_url)
+        else path_or_url
+    )
 
-
-
+    # Read state_dict
+    state_dict = torch.load(archive_path, map_location="cpu")
 
-
-
-
+    # Remove weights from the state_dict
+    if ignore_keys is not None and len(ignore_keys) > 0:
+        for key in ignore_keys:
+            if key in state_dict:
                 state_dict.pop(key)
-
-
-
-
-
-
+        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+        if any(k not in ignore_keys for k in missing_keys + unexpected_keys):
+            raise ValueError(
+                "Unable to load state_dict, due to non-matching keys.\n"
+                + f"Unexpected keys: {unexpected_keys}\nMissing keys: {missing_keys}"
+            )
+    else:
+        # Load weights
+        model.load_state_dict(state_dict)
 
 
 def conv_sequence_pt(
@@ -76,7 +89,7 @@ def conv_sequence_pt(
     relu: bool = False,
     bn: bool = False,
     **kwargs: Any,
-) ->
+) -> list[nn.Module]:
     """Builds a convolutional-based layer sequence
 
     >>> from torch.nn import Sequential
@@ -84,7 +97,6 @@ def conv_sequence_pt(
     >>> module = Sequential(conv_sequence(3, 32, True, True, kernel_size=3))
 
     Args:
-    ----
         in_channels: number of input channels
         out_channels: number of output channels
         relu: whether ReLU should be used
@@ -92,13 +104,12 @@ def conv_sequence_pt(
         **kwargs: additional arguments to be passed to the convolutional layer
 
     Returns:
-    -------
         list of layers
     """
     # No bias before Batch norm
     kwargs["bias"] = kwargs.get("bias", not bn)
     # Add activation directly to the conv if there is no BN
-    conv_seq:
+    conv_seq: list[nn.Module] = [nn.Conv2d(in_channels, out_channels, **kwargs)]
 
     if bn:
         conv_seq.append(nn.BatchNorm2d(out_channels))
@@ -110,8 +121,8 @@ def conv_sequence_pt(
 
 
 def set_device_and_dtype(
-    model: Any, batches:
-) ->
+    model: Any, batches: list[torch.Tensor], device: str | torch.device, dtype: torch.dtype
+) -> tuple[Any, list[torch.Tensor]]:
     """Set the device and dtype of a model and its batches
 
     >>> import torch
@@ -122,14 +133,12 @@ def set_device_and_dtype(
     >>> model, batches = set_device_and_dtype(model, batches, device="cuda", dtype=torch.float16)
 
     Args:
-    ----
         model: the model to be set
         batches: the batches to be set
         device: the device to be used
         dtype: the dtype to be used
 
     Returns:
-    -------
         the model and batches set
     """
     return model.to(device=device, dtype=dtype), [batch.to(device=device, dtype=dtype) for batch in batches]
@@ -145,19 +154,17 @@ def export_model_to_onnx(model: nn.Module, model_name: str, dummy_input: torch.T
     >>> export_model_to_onnx(model, "my_model", dummy_input=torch.randn(1, 3, 32, 32))
 
     Args:
-    ----
         model: the PyTorch model to be exported
         model_name: the name for the exported model
         dummy_input: the dummy input to the model
         kwargs: additional arguments to be passed to torch.onnx.export
 
     Returns:
-    -------
         the path to the exported model
     """
     torch.onnx.export(
         model,
-        dummy_input,
+        dummy_input,
         f"{model_name}.onnx",
         input_names=["input"],
         output_names=["logits"],