PyPI - python-doctr - Versions diffs - 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

python-doctr 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

doctr/contrib/__init__.py +1 -0
doctr/contrib/artefacts.py +7 -9
doctr/contrib/base.py +8 -17
doctr/datasets/cord.py +8 -7
doctr/datasets/datasets/__init__.py +4 -4
doctr/datasets/datasets/base.py +16 -16
doctr/datasets/datasets/pytorch.py +12 -12
doctr/datasets/datasets/tensorflow.py +10 -10
doctr/datasets/detection.py +6 -9
doctr/datasets/doc_artefacts.py +3 -4
doctr/datasets/funsd.py +7 -6
doctr/datasets/generator/__init__.py +4 -4
doctr/datasets/generator/base.py +16 -17
doctr/datasets/generator/pytorch.py +1 -3
doctr/datasets/generator/tensorflow.py +1 -3
doctr/datasets/ic03.py +4 -5
doctr/datasets/ic13.py +4 -5
doctr/datasets/iiit5k.py +6 -5
doctr/datasets/iiithws.py +4 -5
doctr/datasets/imgur5k.py +6 -5
doctr/datasets/loader.py +4 -7
doctr/datasets/mjsynth.py +6 -5
doctr/datasets/ocr.py +3 -4
doctr/datasets/orientation.py +3 -4
doctr/datasets/recognition.py +3 -4
doctr/datasets/sroie.py +6 -5
doctr/datasets/svhn.py +6 -5
doctr/datasets/svt.py +4 -5
doctr/datasets/synthtext.py +4 -5
doctr/datasets/utils.py +34 -29
doctr/datasets/vocabs.py +17 -7
doctr/datasets/wildreceipt.py +14 -10
doctr/file_utils.py +2 -7
doctr/io/elements.py +59 -79
doctr/io/html.py +1 -3
doctr/io/image/__init__.py +3 -3
doctr/io/image/base.py +2 -5
doctr/io/image/pytorch.py +3 -12
doctr/io/image/tensorflow.py +2 -11
doctr/io/pdf.py +5 -7
doctr/io/reader.py +5 -11
doctr/models/_utils.py +14 -22
doctr/models/builder.py +30 -48
doctr/models/classification/magc_resnet/__init__.py +3 -3
doctr/models/classification/magc_resnet/pytorch.py +10 -13
doctr/models/classification/magc_resnet/tensorflow.py +8 -11
doctr/models/classification/mobilenet/__init__.py +3 -3
doctr/models/classification/mobilenet/pytorch.py +5 -17
doctr/models/classification/mobilenet/tensorflow.py +8 -21
doctr/models/classification/predictor/__init__.py +4 -4
doctr/models/classification/predictor/pytorch.py +6 -8
doctr/models/classification/predictor/tensorflow.py +6 -8
doctr/models/classification/resnet/__init__.py +4 -4
doctr/models/classification/resnet/pytorch.py +21 -31
doctr/models/classification/resnet/tensorflow.py +20 -31
doctr/models/classification/textnet/__init__.py +3 -3
doctr/models/classification/textnet/pytorch.py +10 -17
doctr/models/classification/textnet/tensorflow.py +8 -15
doctr/models/classification/vgg/__init__.py +3 -3
doctr/models/classification/vgg/pytorch.py +5 -7
doctr/models/classification/vgg/tensorflow.py +9 -12
doctr/models/classification/vit/__init__.py +3 -3
doctr/models/classification/vit/pytorch.py +8 -14
doctr/models/classification/vit/tensorflow.py +6 -12
doctr/models/classification/zoo.py +19 -14
doctr/models/core.py +3 -3
doctr/models/detection/_utils/__init__.py +4 -4
doctr/models/detection/_utils/base.py +4 -7
doctr/models/detection/_utils/pytorch.py +1 -5
doctr/models/detection/_utils/tensorflow.py +1 -5
doctr/models/detection/core.py +2 -8
doctr/models/detection/differentiable_binarization/__init__.py +4 -4
doctr/models/detection/differentiable_binarization/base.py +7 -17
doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
doctr/models/detection/differentiable_binarization/tensorflow.py +15 -25
doctr/models/detection/fast/__init__.py +4 -4
doctr/models/detection/fast/base.py +6 -14
doctr/models/detection/fast/pytorch.py +24 -31
doctr/models/detection/fast/tensorflow.py +14 -26
doctr/models/detection/linknet/__init__.py +4 -4
doctr/models/detection/linknet/base.py +6 -15
doctr/models/detection/linknet/pytorch.py +24 -27
doctr/models/detection/linknet/tensorflow.py +14 -23
doctr/models/detection/predictor/__init__.py +5 -5
doctr/models/detection/predictor/pytorch.py +6 -7
doctr/models/detection/predictor/tensorflow.py +5 -6
doctr/models/detection/zoo.py +27 -7
doctr/models/factory/hub.py +3 -7
doctr/models/kie_predictor/__init__.py +5 -5
doctr/models/kie_predictor/base.py +4 -5
doctr/models/kie_predictor/pytorch.py +18 -19
doctr/models/kie_predictor/tensorflow.py +13 -14
doctr/models/modules/layers/__init__.py +3 -3
doctr/models/modules/layers/pytorch.py +6 -9
doctr/models/modules/layers/tensorflow.py +5 -7
doctr/models/modules/transformer/__init__.py +3 -3
doctr/models/modules/transformer/pytorch.py +12 -13
doctr/models/modules/transformer/tensorflow.py +9 -10
doctr/models/modules/vision_transformer/__init__.py +3 -3
doctr/models/modules/vision_transformer/pytorch.py +2 -3
doctr/models/modules/vision_transformer/tensorflow.py +3 -3
doctr/models/predictor/__init__.py +5 -5
doctr/models/predictor/base.py +28 -29
doctr/models/predictor/pytorch.py +12 -13
doctr/models/predictor/tensorflow.py +8 -9
doctr/models/preprocessor/__init__.py +4 -4
doctr/models/preprocessor/pytorch.py +13 -17
doctr/models/preprocessor/tensorflow.py +10 -14
doctr/models/recognition/core.py +3 -7
doctr/models/recognition/crnn/__init__.py +4 -4
doctr/models/recognition/crnn/pytorch.py +20 -28
doctr/models/recognition/crnn/tensorflow.py +11 -23
doctr/models/recognition/master/__init__.py +3 -3
doctr/models/recognition/master/base.py +3 -7
doctr/models/recognition/master/pytorch.py +22 -24
doctr/models/recognition/master/tensorflow.py +12 -22
doctr/models/recognition/parseq/__init__.py +3 -3
doctr/models/recognition/parseq/base.py +3 -7
doctr/models/recognition/parseq/pytorch.py +26 -26
doctr/models/recognition/parseq/tensorflow.py +16 -22
doctr/models/recognition/predictor/__init__.py +5 -5
doctr/models/recognition/predictor/_utils.py +7 -10
doctr/models/recognition/predictor/pytorch.py +6 -6
doctr/models/recognition/predictor/tensorflow.py +5 -6
doctr/models/recognition/sar/__init__.py +4 -4
doctr/models/recognition/sar/pytorch.py +20 -21
doctr/models/recognition/sar/tensorflow.py +12 -21
doctr/models/recognition/utils.py +5 -10
doctr/models/recognition/vitstr/__init__.py +4 -4
doctr/models/recognition/vitstr/base.py +3 -7
doctr/models/recognition/vitstr/pytorch.py +18 -20
doctr/models/recognition/vitstr/tensorflow.py +12 -20
doctr/models/recognition/zoo.py +22 -11
doctr/models/utils/__init__.py +4 -4
doctr/models/utils/pytorch.py +14 -17
doctr/models/utils/tensorflow.py +17 -16
doctr/models/zoo.py +1 -5
doctr/transforms/functional/__init__.py +3 -3
doctr/transforms/functional/base.py +4 -11
doctr/transforms/functional/pytorch.py +20 -28
doctr/transforms/functional/tensorflow.py +10 -22
doctr/transforms/modules/__init__.py +4 -4
doctr/transforms/modules/base.py +48 -55
doctr/transforms/modules/pytorch.py +58 -22
doctr/transforms/modules/tensorflow.py +18 -32
doctr/utils/common_types.py +8 -9
doctr/utils/data.py +8 -12
doctr/utils/fonts.py +2 -7
doctr/utils/geometry.py +16 -47
doctr/utils/metrics.py +17 -37
doctr/utils/multithreading.py +4 -6
doctr/utils/reconstitution.py +9 -13
doctr/utils/repr.py +2 -3
doctr/utils/visualization.py +16 -29
doctr/version.py +1 -1
{python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +54 -52
python_doctr-0.11.0.dist-info/RECORD +173 -0
{python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
python_doctr-0.10.0.dist-info/RECORD +0 -173
{python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
{python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
{python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0

doctr/models/recognition/predictor/tensorflow.py CHANGED Viewed

@@ -1,9 +1,9 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import Any, List, Tuple, Union
+from typing import Any
 import numpy as np
 import tensorflow as tf
@@ -21,13 +21,12 @@ class RecognitionPredictor(NestedObject):
     """Implements an object able to identify character sequences in images
     Args:
-    ----
         pre_processor: transform inputs for easier batched model inference
         model: core detection architecture
         split_wide_crops: wether to use crop splitting for high aspect ratio crops
     """
-    _children_names: List[str] = ["pre_processor", "model"]
+    _children_names: list[str] = ["pre_processor", "model"]
     def __init__(
         self,
@@ -45,9 +44,9 @@ class RecognitionPredictor(NestedObject):
     def __call__(
         self,
-        crops: List[Union[np.ndarray, tf.Tensor]],
+        crops: list[np.ndarray | tf.Tensor],
         **kwargs: Any,
-    ) -> List[Tuple[str, float]]:
+    ) -> list[tuple[str, float]]:
         if len(crops) == 0:
             return []
         # Dimension check

doctr/models/recognition/sar/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
-if is_tf_available():
-    from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *  # type: ignore[assignment]
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
+    from .tensorflow import *  # type: ignore[assignment]

doctr/models/recognition/sar/pytorch.py CHANGED Viewed

@@ -1,10 +1,11 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any
 import torch
 from torch import nn
@@ -19,7 +20,7 @@ from ..core import RecognitionModel, RecognitionPostProcessor
 __all__ = ["SAR", "sar_resnet31"]
-default_cfgs: Dict[str, Dict[str, Any]] = {
+default_cfgs: dict[str, dict[str, Any]] = {
     "sar_resnet31": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -80,7 +81,6 @@ class SARDecoder(nn.Module):
     """Implements decoder module of the SAR model
     Args:
-    ----
         rnn_units: number of hidden units in recurrent cells
         max_length: maximum length of a sequence
         vocab_size: number of classes in the model alphabet
@@ -114,12 +114,12 @@ class SARDecoder(nn.Module):
         self,
         features: torch.Tensor,  # (N, C, H, W)
         holistic: torch.Tensor,  # (N, C)
-        gt: Optional[torch.Tensor] = None,  # (N, L)
+        gt: torch.Tensor | None = None,  # (N, L)
     ) -> torch.Tensor:
         if gt is not None:
             gt_embedding = self.embed_tgt(gt)
-        logits_list: List[torch.Tensor] = []
+        logits_list: list[torch.Tensor] = []
         for t in range(self.max_length + 1):  # 32
             if t == 0:
@@ -166,7 +166,6 @@ class SAR(nn.Module, RecognitionModel):
     Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
     Args:
-    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         rnn_units: number of hidden units in both encoder and decoder LSTM
@@ -187,9 +186,9 @@ class SAR(nn.Module, RecognitionModel):
         attention_units: int = 512,
         max_length: int = 30,
         dropout_prob: float = 0.0,
-        input_shape: Tuple[int, int, int] = (3, 32, 128),
+        input_shape: tuple[int, int, int] = (3, 32, 128),
         exportable: bool = False,
-        cfg: Optional[Dict[str, Any]] = None,
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.vocab = vocab
@@ -232,10 +231,10 @@ class SAR(nn.Module, RecognitionModel):
     def forward(
         self,
         x: torch.Tensor,
-        target: Optional[List[str]] = None,
+        target: list[str] | None = None,
         return_model_output: bool = False,
         return_preds: bool = False,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         features = self.feat_extractor(x)["features"]
         # NOTE: use max instead of functional max_pool2d which leads to ONNX incompatibility (kernel_size)
         # Vertical max pooling (N, C, H, W) --> (N, C, W)
@@ -254,7 +253,7 @@ class SAR(nn.Module, RecognitionModel):
         decoded_features = _bf16_to_float32(self.decoder(features, encoded, gt=None if target is None else gt))
-        out: Dict[str, Any] = {}
+        out: dict[str, Any] = {}
         if self.exportable:
             out["logits"] = decoded_features
             return out
@@ -263,8 +262,13 @@ class SAR(nn.Module, RecognitionModel):
             out["out_map"] = decoded_features
         if target is None or return_preds:
+            # Disable for torch.compile compatibility
+            @torch.compiler.disable  # type: ignore[attr-defined]
+            def _postprocess(decoded_features: torch.Tensor) -> list[tuple[str, float]]:
+                return self.postprocessor(decoded_features)
             # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
+            out["preds"] = _postprocess(decoded_features)
         if target is not None:
             out["loss"] = self.compute_loss(decoded_features, gt, seq_len)
@@ -281,19 +285,17 @@ class SAR(nn.Module, RecognitionModel):
         Sequences are masked after the EOS character.
         Args:
-        ----
             model_output: predicted logits of the model
             gt: the encoded tensor with gt labels
             seq_len: lengths of each gt word inside the batch
         Returns:
-        -------
             The loss of the model on the batch
         """
         # Input length : number of timesteps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1  # type: ignore[assignment]
         # Compute loss
         # (N, L, vocab_size + 1)
         cce = F.cross_entropy(model_output.permute(0, 2, 1), gt, reduction="none")
@@ -308,14 +310,13 @@ class SARPostProcessor(RecognitionPostProcessor):
     """Post processor for SAR architectures
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """
     def __call__(
         self,
         logits: torch.Tensor,
-    ) -> List[Tuple[str, float]]:
+    ) -> list[tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = logits.argmax(-1)
         # N x L
@@ -338,7 +339,7 @@ def _sar(
     backbone_fn: Callable[[bool], nn.Module],
     layer: str,
     pretrained_backbone: bool = True,
-    ignore_keys: Optional[List[str]] = None,
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> SAR:
     pretrained_backbone = pretrained_backbone and not pretrained
@@ -379,12 +380,10 @@ def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
     >>> out = model(input_tensor)
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         **kwargs: keyword arguments of the SAR architecture
     Returns:
-    -------
         text recognition architecture
     """
     return _sar(

doctr/models/recognition/sar/tensorflow.py CHANGED Viewed

@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 from copy import deepcopy
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 import tensorflow as tf
 from tensorflow.keras import Model, Sequential, layers
@@ -18,7 +18,7 @@ from ..core import RecognitionModel, RecognitionPostProcessor
 __all__ = ["SAR", "sar_resnet31"]
-default_cfgs: Dict[str, Dict[str, Any]] = {
+default_cfgs: dict[str, dict[str, Any]] = {
     "sar_resnet31": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -33,7 +33,6 @@ class SAREncoder(layers.Layer, NestedObject):
     """Implements encoder module of the SAR model
     Args:
-    ----
         rnn_units: number of hidden rnn units
         dropout_prob: dropout probability
     """
@@ -58,7 +57,6 @@ class AttentionModule(layers.Layer, NestedObject):
     """Implements attention module of the SAR model
     Args:
-    ----
         attention_units: number of hidden attention units
     """
@@ -120,7 +118,6 @@ class SARDecoder(layers.Layer, NestedObject):
     """Implements decoder module of the SAR model
     Args:
-    ----
         rnn_units: number of hidden units in recurrent cells
         max_length: maximum length of a sequence
         vocab_size: number of classes in the model alphabet
@@ -159,13 +156,13 @@ class SARDecoder(layers.Layer, NestedObject):
         self,
         features: tf.Tensor,
         holistic: tf.Tensor,
-        gt: Optional[tf.Tensor] = None,
+        gt: tf.Tensor | None = None,
         **kwargs: Any,
     ) -> tf.Tensor:
         if gt is not None:
             gt_embedding = self.embed_tgt(gt, **kwargs)
-        logits_list: List[tf.Tensor] = []
+        logits_list: list[tf.Tensor] = []
         for t in range(self.max_length + 1):  # 32
             if t == 0:
@@ -210,7 +207,6 @@ class SAR(Model, RecognitionModel):
     Irregular Text Recognition" <https://arxiv.org/pdf/1811.00751.pdf>`_.
     Args:
-    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         rnn_units: number of hidden units in both encoder and decoder LSTM
@@ -223,7 +219,7 @@ class SAR(Model, RecognitionModel):
         cfg: dictionary containing information about the model
     """
-    _children_names: List[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"]
+    _children_names: list[str] = ["feat_extractor", "encoder", "decoder", "postprocessor"]
     def __init__(
         self,
@@ -236,7 +232,7 @@ class SAR(Model, RecognitionModel):
         num_decoder_cells: int = 2,
         dropout_prob: float = 0.0,
         exportable: bool = False,
-        cfg: Optional[Dict[str, Any]] = None,
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.vocab = vocab
@@ -269,13 +265,11 @@ class SAR(Model, RecognitionModel):
         Sequences are masked after the EOS character.
         Args:
-        ----
             gt: the encoded tensor with gt labels
             model_output: predicted logits of the model
             seq_len: lengths of each gt word inside the batch
         Returns:
-        -------
             The loss of the model on the batch
         """
         # Input length : number of timesteps
@@ -296,11 +290,11 @@ class SAR(Model, RecognitionModel):
     def call(
         self,
         x: tf.Tensor,
-        target: Optional[List[str]] = None,
+        target: list[str] | None = None,
         return_model_output: bool = False,
         return_preds: bool = False,
         **kwargs: Any,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         features = self.feat_extractor(x, **kwargs)
         # vertical max pooling --> (N, C, W)
         pooled_features = tf.reduce_max(features, axis=1)
@@ -318,7 +312,7 @@ class SAR(Model, RecognitionModel):
             self.decoder(features, encoded, gt=None if target is None else gt, **kwargs)
         )
-        out: Dict[str, tf.Tensor] = {}
+        out: dict[str, tf.Tensor] = {}
         if self.exportable:
             out["logits"] = decoded_features
             return out
@@ -340,14 +334,13 @@ class SARPostProcessor(RecognitionPostProcessor):
     """Post processor for SAR architectures
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """
     def __call__(
         self,
         logits: tf.Tensor,
-    ) -> List[Tuple[str, float]]:
+    ) -> list[tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = tf.math.argmax(logits, axis=2)
         # N x L
@@ -371,7 +364,7 @@ def _sar(
     pretrained: bool,
     backbone_fn,
     pretrained_backbone: bool = True,
-    input_shape: Optional[Tuple[int, int, int]] = None,
+    input_shape: tuple[int, int, int] | None = None,
     **kwargs: Any,
 ) -> SAR:
     pretrained_backbone = pretrained_backbone and not pretrained
@@ -414,12 +407,10 @@ def sar_resnet31(pretrained: bool = False, **kwargs: Any) -> SAR:
     >>> out = model(input_tensor)
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         **kwargs: keyword arguments of the SAR architecture
     Returns:
-    -------
         text recognition architecture
     """
     return _sar("sar_resnet31", pretrained, resnet31, **kwargs)

doctr/models/recognition/utils.py CHANGED Viewed

@@ -1,9 +1,8 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import List
 from rapidfuzz.distance import Levenshtein
@@ -14,18 +13,16 @@ def merge_strings(a: str, b: str, dil_factor: float) -> str:
     """Merges 2 character sequences in the best way to maximize the alignment of their overlapping characters.
     Args:
-    ----
         a: first char seq, suffix should be similar to b's prefix.
         b: second char seq, prefix should be similar to a's suffix.
         dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
             only used when the mother sequence is splitted on a character repetition
     Returns:
-    -------
         A merged character sequence.
     Example::
-        >>> from doctr.model.recognition.utils import merge_sequences
+        >>> from doctr.models.recognition.utils import merge_sequences
         >>> merge_sequences('abcd', 'cdefgh', 1.4)
         'abcdefgh'
         >>> merge_sequences('abcdi', 'cdefgh', 1.4)
@@ -61,26 +58,24 @@ def merge_strings(a: str, b: str, dil_factor: float) -> str:
     return a[:-1] + b[index - 1 :]
-def merge_multi_strings(seq_list: List[str], dil_factor: float) -> str:
+def merge_multi_strings(seq_list: list[str], dil_factor: float) -> str:
     """Recursively merges consecutive string sequences with overlapping characters.
     Args:
-    ----
         seq_list: list of sequences to merge. Sequences need to be ordered from left to right.
         dil_factor: dilation factor of the boxes to overlap, should be > 1. This parameter is
             only used when the mother sequence is splitted on a character repetition
     Returns:
-    -------
         A merged character sequence
     Example::
-        >>> from doctr.model.recognition.utils import merge_multi_sequences
+        >>> from doctr.models.recognition.utils import merge_multi_sequences
         >>> merge_multi_sequences(['abc', 'bcdef', 'difghi', 'aijkl'], 1.4)
         'abcdefghijkl'
     """
-    def _recursive_merge(a: str, seq_list: List[str], dil_factor: float) -> str:
+    def _recursive_merge(a: str, seq_list: list[str], dil_factor: float) -> str:
         # Recursive version of compute_overlap
         if len(seq_list) == 1:
             return merge_strings(a, seq_list[0], dil_factor)

doctr/models/recognition/vitstr/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
-if is_tf_available():
-    from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *  # type: ignore[assignment]
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
+    from .tensorflow import *  # type: ignore[assignment]

doctr/models/recognition/vitstr/base.py CHANGED Viewed

@@ -1,9 +1,8 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import List, Tuple
 import numpy as np
@@ -17,17 +16,15 @@ class _ViTSTR:
     def build_target(
         self,
-        gts: List[str],
-    ) -> Tuple[np.ndarray, List[int]]:
+        gts: list[str],
+    ) -> tuple[np.ndarray, list[int]]:
         """Encode a list of gts sequences into a np array and gives the corresponding*
         sequence lengths.
         Args:
-        ----
             gts: list of ground-truth labels
         Returns:
-        -------
             A tuple of 2 tensors: Encoded labels and sequence lengths (for each entry of the batch)
         """
         encoded = encode_sequences(
@@ -45,7 +42,6 @@ class _ViTSTRPostProcessor(RecognitionPostProcessor):
     """Abstract class to postprocess the raw output of the model
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """

doctr/models/recognition/vitstr/pytorch.py CHANGED Viewed

@@ -1,10 +1,11 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any
 import torch
 from torch import nn
@@ -19,7 +20,7 @@ from .base import _ViTSTR, _ViTSTRPostProcessor
 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
-default_cfgs: Dict[str, Dict[str, Any]] = {
+default_cfgs: dict[str, dict[str, Any]] = {
     "vitstr_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
@@ -42,7 +43,6 @@ class ViTSTR(_ViTSTR, nn.Module):
     Efficient Scene Text Recognition" <https://arxiv.org/pdf/2105.08582.pdf>`_.
     Args:
-    ----
         feature_extractor: the backbone serving as feature extractor
         vocab: vocabulary used for encoding
         embedding_units: number of embedding units
@@ -59,9 +59,9 @@ class ViTSTR(_ViTSTR, nn.Module):
         vocab: str,
         embedding_units: int,
         max_length: int = 32,  # different from paper
-        input_shape: Tuple[int, int, int] = (3, 32, 128),  # different from paper
+        input_shape: tuple[int, int, int] = (3, 32, 128),  # different from paper
         exportable: bool = False,
-        cfg: Optional[Dict[str, Any]] = None,
+        cfg: dict[str, Any] | None = None,
     ) -> None:
         super().__init__()
         self.vocab = vocab
@@ -77,10 +77,10 @@ class ViTSTR(_ViTSTR, nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        target: Optional[List[str]] = None,
+        target: list[str] | None = None,
         return_model_output: bool = False,
         return_preds: bool = False,
-    ) -> Dict[str, Any]:
+    ) -> dict[str, Any]:
         features = self.feat_extractor(x)["features"]  # (batch_size, patches_seqlen, d_model)
         if target is not None:
@@ -98,7 +98,7 @@ class ViTSTR(_ViTSTR, nn.Module):
         logits = self.head(features).view(B, N, len(self.vocab) + 1)  # (batch_size, max_length, vocab + 1)
         decoded_features = _bf16_to_float32(logits[:, 1:])  # remove cls_token
-        out: Dict[str, Any] = {}
+        out: dict[str, Any] = {}
         if self.exportable:
             out["logits"] = decoded_features
             return out
@@ -107,8 +107,13 @@ class ViTSTR(_ViTSTR, nn.Module):
             out["out_map"] = decoded_features
         if target is None or return_preds:
+            # Disable for torch.compile compatibility
+            @torch.compiler.disable  # type: ignore[attr-defined]
+            def _postprocess(decoded_features: torch.Tensor) -> list[tuple[str, float]]:
+                return self.postprocessor(decoded_features)
             # Post-process boxes
-            out["preds"] = self.postprocessor(decoded_features)
+            out["preds"] = _postprocess(decoded_features)
         if target is not None:
             out["loss"] = self.compute_loss(decoded_features, gt, seq_len)
@@ -125,19 +130,17 @@ class ViTSTR(_ViTSTR, nn.Module):
         Sequences are masked after the EOS character.
         Args:
-        ----
             model_output: predicted logits of the model
             gt: the encoded tensor with gt labels
             seq_len: lengths of each gt word inside the batch
         Returns:
-        -------
             The loss of the model on the batch
         """
         # Input length : number of steps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token (sos disappear in shift!)
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1  # type: ignore[assignment]
         # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
         # The "masked" first gt char is <sos>.
         cce = F.cross_entropy(model_output.permute(0, 2, 1), gt[:, 1:], reduction="none")
@@ -153,14 +156,13 @@ class ViTSTRPostProcessor(_ViTSTRPostProcessor):
     """Post processor for ViTSTR architecture
     Args:
-    ----
         vocab: string containing the ordered sequence of supported characters
     """
     def __call__(
         self,
         logits: torch.Tensor,
-    ) -> List[Tuple[str, float]]:
+    ) -> list[tuple[str, float]]:
         # compute pred with argmax for attention models
         out_idxs = logits.argmax(-1)
         preds_prob = torch.softmax(logits, -1).max(dim=-1)[0]
@@ -183,7 +185,7 @@ def _vitstr(
     pretrained: bool,
     backbone_fn: Callable[[bool], nn.Module],
     layer: str,
-    ignore_keys: Optional[List[str]] = None,
+    ignore_keys: list[str] | None = None,
     **kwargs: Any,
 ) -> ViTSTR:
     # Patch the config
@@ -228,12 +230,10 @@ def vitstr_small(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         kwargs: keyword arguments of the ViTSTR architecture
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(
@@ -259,12 +259,10 @@ def vitstr_base(pretrained: bool = False, **kwargs: Any) -> ViTSTR:
     >>> out = model(input_tensor)
     Args:
-    ----
         pretrained (bool): If True, returns a model pre-trained on our text recognition dataset
         kwargs: keyword arguments of the ViTSTR architecture
     Returns:
-    -------
         text recognition architecture
     """
     return _vitstr(

python-doctr 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

python-doctr 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl