python-doctr 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. doctr/datasets/__init__.py +1 -0
  2. doctr/datasets/coco_text.py +139 -0
  3. doctr/datasets/cord.py +2 -1
  4. doctr/datasets/funsd.py +2 -2
  5. doctr/datasets/ic03.py +1 -1
  6. doctr/datasets/ic13.py +2 -1
  7. doctr/datasets/iiit5k.py +4 -1
  8. doctr/datasets/imgur5k.py +9 -2
  9. doctr/datasets/loader.py +1 -1
  10. doctr/datasets/ocr.py +1 -1
  11. doctr/datasets/recognition.py +1 -1
  12. doctr/datasets/svhn.py +1 -1
  13. doctr/datasets/svt.py +2 -2
  14. doctr/datasets/synthtext.py +15 -2
  15. doctr/datasets/utils.py +7 -6
  16. doctr/datasets/vocabs.py +1102 -54
  17. doctr/file_utils.py +9 -0
  18. doctr/io/elements.py +37 -3
  19. doctr/models/_utils.py +1 -1
  20. doctr/models/classification/__init__.py +1 -0
  21. doctr/models/classification/magc_resnet/pytorch.py +1 -2
  22. doctr/models/classification/magc_resnet/tensorflow.py +3 -3
  23. doctr/models/classification/mobilenet/pytorch.py +15 -1
  24. doctr/models/classification/mobilenet/tensorflow.py +11 -2
  25. doctr/models/classification/predictor/pytorch.py +1 -1
  26. doctr/models/classification/resnet/pytorch.py +26 -3
  27. doctr/models/classification/resnet/tensorflow.py +25 -4
  28. doctr/models/classification/textnet/pytorch.py +10 -1
  29. doctr/models/classification/textnet/tensorflow.py +11 -2
  30. doctr/models/classification/vgg/pytorch.py +16 -1
  31. doctr/models/classification/vgg/tensorflow.py +11 -2
  32. doctr/models/classification/vip/__init__.py +4 -0
  33. doctr/models/classification/vip/layers/__init__.py +4 -0
  34. doctr/models/classification/vip/layers/pytorch.py +615 -0
  35. doctr/models/classification/vip/pytorch.py +505 -0
  36. doctr/models/classification/vit/pytorch.py +10 -1
  37. doctr/models/classification/vit/tensorflow.py +9 -0
  38. doctr/models/classification/zoo.py +4 -0
  39. doctr/models/detection/differentiable_binarization/base.py +3 -4
  40. doctr/models/detection/differentiable_binarization/pytorch.py +10 -1
  41. doctr/models/detection/differentiable_binarization/tensorflow.py +11 -4
  42. doctr/models/detection/fast/base.py +2 -3
  43. doctr/models/detection/fast/pytorch.py +13 -4
  44. doctr/models/detection/fast/tensorflow.py +10 -2
  45. doctr/models/detection/linknet/base.py +2 -3
  46. doctr/models/detection/linknet/pytorch.py +10 -1
  47. doctr/models/detection/linknet/tensorflow.py +10 -2
  48. doctr/models/factory/hub.py +3 -3
  49. doctr/models/kie_predictor/pytorch.py +1 -1
  50. doctr/models/kie_predictor/tensorflow.py +1 -1
  51. doctr/models/modules/layers/pytorch.py +49 -1
  52. doctr/models/predictor/pytorch.py +1 -1
  53. doctr/models/predictor/tensorflow.py +1 -1
  54. doctr/models/recognition/__init__.py +1 -0
  55. doctr/models/recognition/crnn/pytorch.py +10 -1
  56. doctr/models/recognition/crnn/tensorflow.py +10 -1
  57. doctr/models/recognition/master/pytorch.py +10 -1
  58. doctr/models/recognition/master/tensorflow.py +10 -3
  59. doctr/models/recognition/parseq/pytorch.py +23 -5
  60. doctr/models/recognition/parseq/tensorflow.py +13 -5
  61. doctr/models/recognition/predictor/_utils.py +107 -45
  62. doctr/models/recognition/predictor/pytorch.py +3 -3
  63. doctr/models/recognition/predictor/tensorflow.py +3 -3
  64. doctr/models/recognition/sar/pytorch.py +10 -1
  65. doctr/models/recognition/sar/tensorflow.py +10 -3
  66. doctr/models/recognition/utils.py +56 -47
  67. doctr/models/recognition/viptr/__init__.py +4 -0
  68. doctr/models/recognition/viptr/pytorch.py +277 -0
  69. doctr/models/recognition/vitstr/pytorch.py +10 -1
  70. doctr/models/recognition/vitstr/tensorflow.py +10 -3
  71. doctr/models/recognition/zoo.py +5 -0
  72. doctr/models/utils/pytorch.py +28 -18
  73. doctr/models/utils/tensorflow.py +15 -8
  74. doctr/utils/data.py +1 -1
  75. doctr/utils/geometry.py +1 -1
  76. doctr/version.py +1 -1
  77. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +19 -3
  78. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/RECORD +82 -75
  79. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
  80. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
  81. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
  82. {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/models/detection/fast/tensorflow.py
@@ -153,6 +153,15 @@ class FAST(_FAST, Model, NestedObject):
         # Pooling layer as erosion reversal as described in the paper
         self.pooling = layers.MaxPooling2D(pool_size=pooling_size // 2 + 1, strides=1, padding="same")
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def compute_loss(
         self,
         out_map: tf.Tensor,
@@ -332,8 +341,7 @@ def _fast(
     # Load pretrained parameters
     if pretrained:
        # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
-        load_pretrained_params(
-            model,
+        model.from_pretrained(
            _cfg["url"],
            skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]),
        )
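
Every architecture in this release gains the same `from_pretrained` hook, and the private builders delegate to it instead of calling `load_pretrained_params` on the instance. A minimal usage sketch, assuming the 0.12 API shown above (the checkpoint path is a placeholder):

    from doctr.models import detection

    # Build the graph without fetching the hub weights
    model = detection.fast_base(pretrained=False)
    # Load a locally fine-tuned checkpoint through the new uniform entry point
    model.from_pretrained("path/to/my_checkpoint")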
doctr/models/detection/linknet/base.py
@@ -56,9 +56,8 @@ class LinkNetPostProcessor(DetectionPostProcessor):
             area = (rect[1][0] + 1) * (1 + rect[1][1])
             length = 2 * (rect[1][0] + rect[1][1]) + 2
         else:
-            poly = Polygon(points)
-            area = poly.area
-            length = poly.length
+            area = cv2.contourArea(points)
+            length = cv2.arcLength(points, closed=True)
         distance = area * self.unclip_ratio / length  # compute distance to expand polygon
         offset = pyclipper.PyclipperOffset()
         offset.AddPath(points, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
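
The post-processor now measures contours with OpenCV instead of building a shapely `Polygon` per candidate, removing an object construction from the post-processing hot path. A quick sanity sketch of the equivalence on a hand-checkable rectangle (not code from the package):

    import cv2
    import numpy as np

    # 100x40 axis-aligned rectangle: area 4000, perimeter 280
    points = np.array([[0, 0], [100, 0], [100, 40], [0, 40]], dtype=np.float32)
    assert cv2.contourArea(points) == 4000.0
    assert cv2.arcLength(points, closed=True) == 280.0

For the simple, non-self-intersecting contours the detector produces, the two backends agree on area and perimeter.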
doctr/models/detection/linknet/pytorch.py
@@ -160,6 +160,15 @@ class LinkNet(nn.Module, _LinkNet):
                 m.weight.data.fill_(1.0)
                 m.bias.data.zero_()
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def forward(
         self,
         x: torch.Tensor,
@@ -282,7 +291,7 @@ def _linknet(
         _ignore_keys = (
             ignore_keys if kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]) else None
         )
-        load_pretrained_params(model, default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
 
     return model
 
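On the PyTorch side the hook threads `ignore_keys` through, which is what lets fine-tuning drop the pretrained classification head when the class set changes. A hedged sketch of that entry point, assuming the builder keeps accepting `class_names` as in 0.11:

    from doctr.models import detection

    # Mismatching class names => the pretrained head keys are ignored on load
    model = detection.linknet_resnet18(pretrained=True, class_names=["words", "stamps"])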
doctr/models/detection/linknet/tensorflow.py
@@ -163,6 +163,15 @@ class LinkNet(_LinkNet, Model):
             assume_straight_pages=assume_straight_pages, bin_thresh=bin_thresh, box_thresh=box_thresh
         )
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def compute_loss(
         self,
         out_map: tf.Tensor,
@@ -282,8 +291,7 @@ def _linknet(
     # Load pretrained parameters
     if pretrained:
        # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
-        load_pretrained_params(
-            model,
+        model.from_pretrained(
            _cfg["url"],
            skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]),
        )
doctr/models/factory/hub.py
@@ -217,10 +217,10 @@ def from_hub(repo_id: str, **kwargs: Any):
 
     # Load checkpoint
     if is_torch_available():
-        state_dict = torch.load(hf_hub_download(repo_id, filename="pytorch_model.bin", **kwargs), map_location="cpu")
-        model.load_state_dict(state_dict)
+        weights = hf_hub_download(repo_id, filename="pytorch_model.bin", **kwargs)
     else:  # tf
         weights = hf_hub_download(repo_id, filename="tf_model.weights.h5", **kwargs)
-        model.load_weights(weights)
+
+    model.from_pretrained(weights)
 
     return model
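
Both branches now funnel into `model.from_pretrained`, so hub loading no longer needs framework-specific weight handling. A sketch with a placeholder repo id:

    from doctr.models.factory.hub import from_hub

    # Resolves pytorch_model.bin or tf_model.weights.h5 depending on the
    # active backend, then defers to model.from_pretrained
    model = from_hub("mindee/my-finetuned-model")  # placeholder repo id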
doctr/models/kie_predictor/pytorch.py
@@ -173,7 +173,7 @@ class KIEPredictor(nn.Module, _KIEPredictor):
             boxes_per_page,
             objectness_scores_per_page,
             text_preds_per_page,
-            origin_page_shapes,  # type: ignore[arg-type]
+            origin_page_shapes,
             crop_orientations_per_page,
             orientations,
             languages_dict,
doctr/models/kie_predictor/tensorflow.py
@@ -171,7 +171,7 @@ class KIEPredictor(NestedObject, _KIEPredictor):
             boxes_per_page,
             objectness_scores_per_page,
             text_preds_per_page,
-            origin_page_shapes,  # type: ignore[arg-type]
+            origin_page_shapes,
             crop_orientations_per_page,
             orientations,
             languages_dict,
doctr/models/modules/layers/pytorch.py
@@ -8,7 +8,55 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-__all__ = ["FASTConvLayer"]
+__all__ = ["FASTConvLayer", "DropPath", "AdaptiveAvgPool2d"]
+
+
+class DropPath(nn.Module):
+    """
+    DropPath (Drop Connect) layer. This is a stochastic version of the identity layer.
+    """
+
+    # Borrowed from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.drop_prob == 0.0 or not self.training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with different dimensions
+        random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+        if keep_prob > 0.0 and self.scale_by_keep:
+            random_tensor.div_(keep_prob)
+        return x * random_tensor
+
+
+class AdaptiveAvgPool2d(nn.Module):
+    """
+    Custom AdaptiveAvgPool2d implementation which is ONNX and `torch.compile` compatible.
+    """
+
+    def __init__(self, output_size):
+        super().__init__()
+        self.output_size = output_size
+
+    def forward(self, x: torch.Tensor):
+        H_out, W_out = self.output_size
+        N, C, H, W = x.shape
+
+        out = torch.empty((N, C, H_out, W_out), device=x.device, dtype=x.dtype)
+        for oh in range(H_out):
+            start_h = (oh * H) // H_out
+            end_h = ((oh + 1) * H + H_out - 1) // H_out  # ceil((oh+1)*H / H_out)
+            for ow in range(W_out):
+                start_w = (ow * W) // W_out
+                end_w = ((ow + 1) * W + W_out - 1) // W_out  # ceil((ow+1)*W / W_out)
+                # average over the window
+                out[:, :, oh, ow] = x[:, :, start_h:end_h, start_w:end_w].mean(dim=(-2, -1))
+        return out
 
 
 class FASTConvLayer(nn.Module):
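
The loop-based pooling trades speed for traceability: each output cell is a plain slice-and-mean with no data-dependent control flow, which is what keeps ONNX export and `torch.compile` happy. Its floor/ceil window bounds match the ones `nn.AdaptiveAvgPool2d` uses, so a parity check along these lines should pass:

    import torch
    from torch import nn
    from doctr.models.modules.layers.pytorch import AdaptiveAvgPool2d

    x = torch.randn(2, 8, 13, 17)  # deliberately non-divisible spatial dims
    ref = nn.AdaptiveAvgPool2d((4, 4))(x)
    out = AdaptiveAvgPool2d((4, 4))(x)
    assert torch.allclose(ref, out, atol=1e-6)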
doctr/models/predictor/pytorch.py
@@ -150,7 +150,7 @@ class OCRPredictor(nn.Module, _OCRPredictor):
             boxes,
             objectness_scores,
             text_preds,
-            origin_page_shapes,  # type: ignore[arg-type]
+            origin_page_shapes,
             crop_orientations,
             orientations,
             languages_dict,
doctr/models/predictor/tensorflow.py
@@ -147,7 +147,7 @@ class OCRPredictor(NestedObject, _OCRPredictor):
             boxes,
             objectness_scores,
             text_preds,
-            origin_page_shapes,  # type: ignore[arg-type]
+            origin_page_shapes,
             crop_orientations,
             orientations,
             languages_dict,
doctr/models/recognition/__init__.py
@@ -3,4 +3,5 @@ from .master import *
 from .sar import *
 from .vitstr import *
 from .parseq import *
+from .viptr import *
 from .zoo import *
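
This wires the new VIPTR recognizer (see `doctr/models/recognition/viptr/pytorch.py` in the file list; PyTorch only) into the public namespace, and `doctr/models/recognition/zoo.py` registers it alongside the existing architectures. Assuming it follows the usual builder pattern, usage would look like this; the constructor name `viptr_tiny` is inferred from the release, not confirmed by this diff:

    from doctr.models import recognition

    # Hypothetical builder name for the new VIPTR recognizer
    model = recognition.viptr_tiny(pretrained=True)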
doctr/models/recognition/crnn/pytorch.py
@@ -155,6 +155,15 @@ class CRNN(RecognitionModel, nn.Module):
                 m.weight.data.fill_(1.0)
                 m.bias.data.zero_()
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def compute_loss(
         self,
         model_output: torch.Tensor,
@@ -254,7 +263,7 @@ def _crnn(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-        load_pretrained_params(model, _cfg["url"], ignore_keys=_ignore_keys)
+        model.from_pretrained(_cfg["url"], ignore_keys=_ignore_keys)
 
     return model
 
doctr/models/recognition/crnn/tensorflow.py
@@ -154,6 +154,15 @@ class CRNN(RecognitionModel, Model):
         self.beam_width = beam_width
         self.top_paths = top_paths
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def compute_loss(
         self,
         model_output: tf.Tensor,
@@ -243,7 +252,7 @@ def _crnn(
     # Load pretrained parameters
     if pretrained:
        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
-        load_pretrained_params(model, _cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
+        model.from_pretrained(_cfg["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
 
     return model
 
doctr/models/recognition/master/pytorch.py
@@ -151,6 +151,15 @@ class MASTER(_MASTER, nn.Module):
         ce_loss = cce.sum(1) / seq_len.to(dtype=model_output.dtype)
         return ce_loss.mean()
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def forward(
         self,
         x: torch.Tensor,
@@ -301,7 +310,7 @@ def _master(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-        load_pretrained_params(model, default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
 
     return model
 
doctr/models/recognition/master/tensorflow.py
@@ -87,6 +87,15 @@ class MASTER(_MASTER, Model):
         self.linear = layers.Dense(self.vocab_size + 3, kernel_initializer=tf.initializers.he_uniform())
         self.postprocessor = MASTERPostProcessor(vocab=self.vocab)
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     @tf.function
     def make_source_and_target_mask(self, source: tf.Tensor, target: tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
         # [1, 1, 1, ..., 0, 0, 0] -> 0 is masked
@@ -287,9 +296,7 @@ def _master(arch: str, pretrained: bool, backbone_fn, pretrained_backbone: bool
     # Load pretrained parameters
     if pretrained:
        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
-        load_pretrained_params(
-            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
-        )
+        model.from_pretrained(default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
 
     return model
 
doctr/models/recognition/parseq/pytorch.py
@@ -76,8 +76,6 @@ class PARSeqDecoder(nn.Module):
         self.cross_attention = MultiHeadAttention(num_heads, d_model, dropout=dropout)
         self.position_feed_forward = PositionwiseFeedForward(d_model, ffd * ffd_ratio, dropout, nn.GELU())
 
-        self.attention_norm = nn.LayerNorm(d_model, eps=1e-5)
-        self.cross_attention_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.query_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.content_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.feed_forward_norm = nn.LayerNorm(d_model, eps=1e-5)
@@ -173,6 +171,26 @@ class PARSeq(_PARSeq, nn.Module):
             nn.init.constant_(m.weight, 1)
             nn.init.constant_(m.bias, 0)
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        # NOTE: This is required to make the model backward compatible with already trained models docTR version <0.11.1
+        # ref.: https://github.com/mindee/doctr/issues/1911
+        if kwargs.get("ignore_keys") is None:
+            kwargs["ignore_keys"] = []
+
+        kwargs["ignore_keys"].extend([
+            "decoder.attention_norm.weight",
+            "decoder.attention_norm.bias",
+            "decoder.cross_attention_norm.weight",
+            "decoder.cross_attention_norm.bias",
+        ])
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def generate_permutations(self, seqlen: torch.Tensor) -> torch.Tensor:
         # Generates permutations of the target sequence.
         # Borrowed from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
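
Because the two unused decoder norms were deleted, checkpoints trained before this release still contain `decoder.attention_norm.*` and `decoder.cross_attention_norm.*` tensors; the override above appends them to `ignore_keys` so old weights keep loading instead of raising on unexpected keys. A hedged sketch of the path this unblocks (the checkpoint name is a placeholder):

    from doctr.models import recognition

    model = recognition.parseq(pretrained=False)
    # A checkpoint produced by an older docTR release loads cleanly:
    # the stale norm tensors are skipped automatically
    model.from_pretrained("parseq_trained_with_docTR_0.11.pt")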
@@ -210,7 +228,7 @@ class PARSeq(_PARSeq, nn.Module):
 
         sos_idx = torch.zeros(len(final_perms), 1, device=seqlen.device)
         eos_idx = torch.full((len(final_perms), 1), max_num_chars + 1, device=seqlen.device)
-        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()  # type: ignore[list-item]
+        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
         if len(combined) > 1:
             combined[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=seqlen.device)
         return combined
@@ -349,7 +367,7 @@ class PARSeq(_PARSeq, nn.Module):
             # remove the [EOS] tokens for the succeeding perms
             if i == 1:
                 gt_out = torch.where(gt_out == self.vocab_size, self.vocab_size + 2, gt_out)
-                n = (gt_out != self.vocab_size + 2).sum().item()
+                n = (gt_out != self.vocab_size + 2).sum().item()  # type: ignore[attr-defined]
 
         loss /= loss_numel
 
@@ -448,7 +466,7 @@ def _parseq(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-        load_pretrained_params(model, default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)
 
     return model
 
doctr/models/recognition/parseq/tensorflow.py
@@ -76,8 +76,6 @@ class PARSeqDecoder(layers.Layer):
             d_model, ffd * ffd_ratio, dropout, layers.Activation(tf.nn.gelu)
         )
 
-        self.attention_norm = layers.LayerNormalization(epsilon=1e-5)
-        self.cross_attention_norm = layers.LayerNormalization(epsilon=1e-5)
         self.query_norm = layers.LayerNormalization(epsilon=1e-5)
         self.content_norm = layers.LayerNormalization(epsilon=1e-5)
         self.feed_forward_norm = layers.LayerNormalization(epsilon=1e-5)
@@ -165,6 +163,18 @@ class PARSeq(_PARSeq, Model):
 
         self.postprocessor = PARSeqPostProcessor(vocab=self.vocab)
 
+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        # NOTE: This is required to make the model backward compatible with already trained models docTR version <0.11.1
+        # ref.: https://github.com/mindee/doctr/issues/1911
+        kwargs["skip_mismatch"] = True
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def generate_permutations(self, seqlen: tf.Tensor) -> tf.Tensor:
         # Generates permutations of the target sequence.
         # Translated from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -474,9 +484,7 @@ def _parseq(
     # Load pretrained parameters
     if pretrained:
        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
-        load_pretrained_params(
-            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
-        )
+        model.from_pretrained(default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"])
 
     return model
 
doctr/models/recognition/predictor/_utils.py
@@ -4,6 +4,8 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 
+import math
+
 import numpy as np
 
 from ..utils import merge_multi_strings
@@ -15,69 +17,129 @@ def split_crops(
     crops: list[np.ndarray],
     max_ratio: float,
     target_ratio: int,
-    dilation: float,
+    split_overlap_ratio: float,
     channels_last: bool = True,
-) -> tuple[list[np.ndarray], list[int | tuple[int, int]], bool]:
-    """Chunk crops horizontally to match a given aspect ratio
+) -> tuple[list[np.ndarray], list[int | tuple[int, int, float]], bool]:
+    """
+    Split crops horizontally if they exceed a given aspect ratio.
 
     Args:
-        crops: list of numpy array of shape (H, W, 3) if channels_last or (3, H, W) otherwise
-        max_ratio: the maximum aspect ratio that won't trigger the chunk
-        target_ratio: when crops are chunked, they will be chunked to match this aspect ratio
-        dilation: the width dilation of final chunks (to provide some overlaps)
-        channels_last: whether the numpy array has dimensions in channels last order
+        crops: List of image crops (H, W, C) if channels_last else (C, H, W).
+        max_ratio: Aspect ratio threshold above which crops are split.
+        target_ratio: Target aspect ratio after splitting (e.g., 4 for 128x32).
+        split_overlap_ratio: Desired overlap between splits (as a fraction of split width).
+        channels_last: Whether the crops are in channels-last format.
 
     Returns:
-        a tuple with the new crops, their mapping, and a boolean specifying whether any remap is required
+        A tuple containing:
+        - The new list of crops (possibly with splits),
+        - A mapping indicating how to reassemble predictions,
+        - A boolean indicating whether remapping is required.
     """
-    _remap_required = False
-    crop_map: list[int | tuple[int, int]] = []
+    if split_overlap_ratio <= 0.0 or split_overlap_ratio >= 1.0:
+        raise ValueError(f"Valid range for split_overlap_ratio is (0.0, 1.0), but is: {split_overlap_ratio}")
+
+    remap_required = False
     new_crops: list[np.ndarray] = []
+    crop_map: list[int | tuple[int, int, float]] = []
+
     for crop in crops:
         h, w = crop.shape[:2] if channels_last else crop.shape[-2:]
         aspect_ratio = w / h
+
         if aspect_ratio > max_ratio:
-            # Determine the number of crops, reference aspect ratio = 4 = 128 / 32
-            num_subcrops = int(aspect_ratio // target_ratio)
-            # Find the new widths, additional dilation factor to overlap crops
-            width = dilation * w / num_subcrops
-            centers = [(w / num_subcrops) * (1 / 2 + idx) for idx in range(num_subcrops)]
-            # Get the crops
-            if channels_last:
-                _crops = [
-                    crop[:, max(0, int(round(center - width / 2))) : min(w - 1, int(round(center + width / 2))), :]
-                    for center in centers
-                ]
+            split_width = max(1, math.ceil(h * target_ratio))
+            overlap_width = max(0, math.floor(split_width * split_overlap_ratio))
+
+            splits, last_overlap = _split_horizontally(crop, split_width, overlap_width, channels_last)
+
+            # Remove any empty splits
+            splits = [s for s in splits if all(dim > 0 for dim in s.shape)]
+            if splits:
+                crop_map.append((len(new_crops), len(new_crops) + len(splits), last_overlap))
+                new_crops.extend(splits)
+                remap_required = True
             else:
-                _crops = [
-                    crop[:, :, max(0, int(round(center - width / 2))) : min(w - 1, int(round(center + width / 2)))]
-                    for center in centers
-                ]
-            # Avoid sending zero-sized crops
-            _crops = [crop for crop in _crops if all(s > 0 for s in crop.shape)]
-            # Record the slice of crops
-            crop_map.append((len(new_crops), len(new_crops) + len(_crops)))
-            new_crops.extend(_crops)
-            # At least one crop will require merging
-            _remap_required = True
+                # Fallback: treat it as a single crop
+                crop_map.append(len(new_crops))
+                new_crops.append(crop)
         else:
             crop_map.append(len(new_crops))
             new_crops.append(crop)
 
-    return new_crops, crop_map, _remap_required
+    return new_crops, crop_map, remap_required
+
+
+def _split_horizontally(
+    image: np.ndarray, split_width: int, overlap_width: int, channels_last: bool
+) -> tuple[list[np.ndarray], float]:
+    """
+    Horizontally split a single image with overlapping regions.
+
+    Args:
+        image: The image to split (H, W, C) if channels_last else (C, H, W).
+        split_width: Width of each split.
+        overlap_width: Width of the overlapping region.
+        channels_last: Whether the image is in channels-last format.
+
+    Returns:
+        - A list of horizontal image slices.
+        - The actual overlap ratio of the last split.
+    """
+    image_width = image.shape[1] if channels_last else image.shape[-1]
+    if image_width <= split_width:
+        return [image], 0.0
+
+    # Compute start columns for each split
+    step = split_width - overlap_width
+    starts = list(range(0, image_width - split_width + 1, step))
+
+    # Ensure the last patch reaches the end of the image
+    if starts[-1] + split_width < image_width:
+        starts.append(image_width - split_width)
+
+    splits = []
+    for start_col in starts:
+        end_col = start_col + split_width
+        if channels_last:
+            split = image[:, start_col:end_col, :]
+        else:
+            split = image[:, :, start_col:end_col]
+        splits.append(split)
+
+    # Calculate the last overlap ratio, if only one split no overlap
+    last_overlap = 0
+    if len(starts) > 1:
+        last_overlap = (starts[-2] + split_width) - starts[-1]
+    last_overlap_ratio = last_overlap / split_width if split_width else 0.0
+
+    return splits, last_overlap_ratio
 
 
 def remap_preds(
-    preds: list[tuple[str, float]], crop_map: list[int | tuple[int, int]], dilation: float
+    preds: list[tuple[str, float]],
+    crop_map: list[int | tuple[int, int, float]],
+    overlap_ratio: float,
 ) -> list[tuple[str, float]]:
-    remapped_out = []
-    for _idx in crop_map:
-        # Crop hasn't been split
-        if isinstance(_idx, int):
-            remapped_out.append(preds[_idx])
+    """
+    Reconstruct predictions from possibly split crops.
+
+    Args:
+        preds: List of (text, confidence) tuples from each crop.
+        crop_map: Map returned by `split_crops`.
+        overlap_ratio: Overlap ratio used during splitting.
+
+    Returns:
+        List of merged (text, confidence) tuples corresponding to original crops.
+    """
+    remapped = []
+    for item in crop_map:
+        if isinstance(item, int):
+            remapped.append(preds[item])
         else:
-            # unzip
-            vals, probs = zip(*preds[_idx[0] : _idx[1]])
-            # Merge the string values
-            remapped_out.append((merge_multi_strings(vals, dilation), min(probs)))  # type: ignore[arg-type]
-    return remapped_out
+            start_idx, end_idx, last_overlap = item
+            text_parts, confidences = zip(*preds[start_idx:end_idx])
+            merged_text = merge_multi_strings(list(text_parts), overlap_ratio, last_overlap)
+            merged_conf = sum(confidences) / len(confidences)  # average confidence
+            remapped.append((merged_text, merged_conf))
    return remapped
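
The rewrite replaces the dilation-around-centers scheme with fixed-width windows and a fixed stride, and the crop map now carries the exact overlap of the final window so `merge_multi_strings` can trim it precisely. A worked example under the signature above:

    import numpy as np
    from doctr.models.recognition.predictor._utils import split_crops

    # One 32x512 crop: aspect ratio 16 exceeds max_ratio 8, so it is split
    crops = [np.zeros((32, 512, 3), dtype=np.uint8)]
    new_crops, crop_map, remapped = split_crops(
        crops, max_ratio=8, target_ratio=6, split_overlap_ratio=0.5, channels_last=True
    )
    # split_width = 32 * 6 = 192, overlap = 96, stride = 96
    # -> windows start at [0, 96, 192, 288], plus a final one at 320 to reach the edge
    assert remapped and len(new_crops) == 5
    assert crop_map[0][:2] == (0, 5)  # predictions 0..4 merge back into one string

Note that the confidence of a merged crop is now the average over its splits rather than the minimum.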
doctr/models/recognition/predictor/pytorch.py
@@ -38,7 +38,7 @@ class RecognitionPredictor(nn.Module):
         self.model = model.eval()
         self.split_wide_crops = split_wide_crops
         self.critical_ar = 8  # Critical aspect ratio
-        self.dil_factor = 1.4  # Dilation factor to overlap the crops
+        self.overlap_ratio = 0.5  # Ratio of overlap between neighboring crops
         self.target_ar = 6  # Target aspect ratio
 
     @torch.inference_mode()
@@ -60,7 +60,7 @@ class RecognitionPredictor(nn.Module):
                 crops,  # type: ignore[arg-type]
                 self.critical_ar,
                 self.target_ar,
-                self.dil_factor,
+                self.overlap_ratio,
                 isinstance(crops[0], np.ndarray),
             )
             if remapped:
@@ -81,6 +81,6 @@ class RecognitionPredictor(nn.Module):
 
         # Remap crops
         if self.split_wide_crops and remapped:
-            out = remap_preds(out, crop_map, self.dil_factor)
+            out = remap_preds(out, crop_map, self.overlap_ratio)
 
         return out
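
Code that tuned the old attribute must migrate, since `dil_factor` is gone. A sketch (the 0.4 value is arbitrary):

    from doctr.models import recognition_predictor

    predictor = recognition_predictor(pretrained=True)
    # Formerly: predictor.dil_factor = 1.4
    predictor.overlap_ratio = 0.4  # overlap as a fraction of each split's width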
doctr/models/recognition/predictor/tensorflow.py
@@ -39,7 +39,7 @@ class RecognitionPredictor(NestedObject):
         self.model = model
         self.split_wide_crops = split_wide_crops
         self.critical_ar = 8  # Critical aspect ratio
-        self.dil_factor = 1.4  # Dilation factor to overlap the crops
+        self.overlap_ratio = 0.5  # Ratio of overlap between neighboring crops
         self.target_ar = 6  # Target aspect ratio
 
     def __call__(
@@ -56,7 +56,7 @@ class RecognitionPredictor(NestedObject):
         # Split crops that are too wide
         remapped = False
         if self.split_wide_crops:
-            new_crops, crop_map, remapped = split_crops(crops, self.critical_ar, self.target_ar, self.dil_factor)
+            new_crops, crop_map, remapped = split_crops(crops, self.critical_ar, self.target_ar, self.overlap_ratio)
             if remapped:
                 crops = new_crops
 
@@ -74,6 +74,6 @@ class RecognitionPredictor(NestedObject):
 
         # Remap crops
         if self.split_wide_crops and remapped:
-            out = remap_preds(out, crop_map, self.dil_factor)
+            out = remap_preds(out, crop_map, self.overlap_ratio)
 
         return out