python-doctr 0.11.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- doctr/__init__.py +0 -1
- doctr/datasets/__init__.py +1 -5
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +2 -1
- doctr/datasets/datasets/__init__.py +1 -6
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +2 -2
- doctr/datasets/generator/__init__.py +1 -6
- doctr/datasets/ic03.py +1 -1
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +4 -1
- doctr/datasets/imgur5k.py +9 -2
- doctr/datasets/ocr.py +1 -1
- doctr/datasets/recognition.py +1 -1
- doctr/datasets/svhn.py +1 -1
- doctr/datasets/svt.py +2 -2
- doctr/datasets/synthtext.py +15 -2
- doctr/datasets/utils.py +7 -6
- doctr/datasets/vocabs.py +1100 -54
- doctr/file_utils.py +2 -92
- doctr/io/elements.py +37 -3
- doctr/io/image/__init__.py +1 -7
- doctr/io/image/pytorch.py +1 -1
- doctr/models/_utils.py +4 -4
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/__init__.py +1 -6
- doctr/models/classification/magc_resnet/pytorch.py +3 -4
- doctr/models/classification/mobilenet/__init__.py +1 -6
- doctr/models/classification/mobilenet/pytorch.py +15 -1
- doctr/models/classification/predictor/__init__.py +1 -6
- doctr/models/classification/predictor/pytorch.py +2 -2
- doctr/models/classification/resnet/__init__.py +1 -6
- doctr/models/classification/resnet/pytorch.py +26 -3
- doctr/models/classification/textnet/__init__.py +1 -6
- doctr/models/classification/textnet/pytorch.py +11 -2
- doctr/models/classification/vgg/__init__.py +1 -6
- doctr/models/classification/vgg/pytorch.py +16 -1
- doctr/models/classification/vip/__init__.py +1 -0
- doctr/models/classification/vip/layers/__init__.py +1 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/__init__.py +1 -6
- doctr/models/classification/vit/pytorch.py +12 -3
- doctr/models/classification/zoo.py +7 -8
- doctr/models/detection/_utils/__init__.py +1 -6
- doctr/models/detection/core.py +1 -1
- doctr/models/detection/differentiable_binarization/__init__.py +1 -6
- doctr/models/detection/differentiable_binarization/base.py +7 -16
- doctr/models/detection/differentiable_binarization/pytorch.py +13 -4
- doctr/models/detection/fast/__init__.py +1 -6
- doctr/models/detection/fast/base.py +6 -17
- doctr/models/detection/fast/pytorch.py +17 -8
- doctr/models/detection/linknet/__init__.py +1 -6
- doctr/models/detection/linknet/base.py +5 -15
- doctr/models/detection/linknet/pytorch.py +12 -3
- doctr/models/detection/predictor/__init__.py +1 -6
- doctr/models/detection/predictor/pytorch.py +1 -1
- doctr/models/detection/zoo.py +15 -32
- doctr/models/factory/hub.py +9 -22
- doctr/models/kie_predictor/__init__.py +1 -6
- doctr/models/kie_predictor/pytorch.py +3 -7
- doctr/models/modules/layers/__init__.py +1 -6
- doctr/models/modules/layers/pytorch.py +52 -4
- doctr/models/modules/transformer/__init__.py +1 -6
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/vision_transformer/__init__.py +1 -6
- doctr/models/predictor/__init__.py +1 -6
- doctr/models/predictor/base.py +3 -8
- doctr/models/predictor/pytorch.py +3 -6
- doctr/models/preprocessor/__init__.py +1 -6
- doctr/models/preprocessor/pytorch.py +27 -32
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/crnn/__init__.py +1 -6
- doctr/models/recognition/crnn/pytorch.py +16 -7
- doctr/models/recognition/master/__init__.py +1 -6
- doctr/models/recognition/master/pytorch.py +15 -6
- doctr/models/recognition/parseq/__init__.py +1 -6
- doctr/models/recognition/parseq/pytorch.py +26 -8
- doctr/models/recognition/predictor/__init__.py +1 -6
- doctr/models/recognition/predictor/_utils.py +100 -47
- doctr/models/recognition/predictor/pytorch.py +4 -5
- doctr/models/recognition/sar/__init__.py +1 -6
- doctr/models/recognition/sar/pytorch.py +13 -4
- doctr/models/recognition/utils.py +56 -47
- doctr/models/recognition/viptr/__init__.py +1 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/__init__.py +1 -6
- doctr/models/recognition/vitstr/pytorch.py +13 -4
- doctr/models/recognition/zoo.py +13 -8
- doctr/models/utils/__init__.py +1 -6
- doctr/models/utils/pytorch.py +29 -19
- doctr/transforms/functional/__init__.py +1 -6
- doctr/transforms/functional/pytorch.py +4 -4
- doctr/transforms/modules/__init__.py +1 -7
- doctr/transforms/modules/base.py +26 -92
- doctr/transforms/modules/pytorch.py +28 -26
- doctr/utils/data.py +1 -1
- doctr/utils/geometry.py +7 -11
- doctr/utils/visualization.py +1 -1
- doctr/version.py +1 -1
- {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/METADATA +22 -63
- python_doctr-1.0.0.dist-info/RECORD +149 -0
- {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/WHEEL +1 -1
- doctr/datasets/datasets/tensorflow.py +0 -59
- doctr/datasets/generator/tensorflow.py +0 -58
- doctr/datasets/loader.py +0 -94
- doctr/io/image/tensorflow.py +0 -101
- doctr/models/classification/magc_resnet/tensorflow.py +0 -196
- doctr/models/classification/mobilenet/tensorflow.py +0 -433
- doctr/models/classification/predictor/tensorflow.py +0 -60
- doctr/models/classification/resnet/tensorflow.py +0 -397
- doctr/models/classification/textnet/tensorflow.py +0 -266
- doctr/models/classification/vgg/tensorflow.py +0 -116
- doctr/models/classification/vit/tensorflow.py +0 -192
- doctr/models/detection/_utils/tensorflow.py +0 -34
- doctr/models/detection/differentiable_binarization/tensorflow.py +0 -414
- doctr/models/detection/fast/tensorflow.py +0 -419
- doctr/models/detection/linknet/tensorflow.py +0 -369
- doctr/models/detection/predictor/tensorflow.py +0 -70
- doctr/models/kie_predictor/tensorflow.py +0 -187
- doctr/models/modules/layers/tensorflow.py +0 -171
- doctr/models/modules/transformer/tensorflow.py +0 -235
- doctr/models/modules/vision_transformer/tensorflow.py +0 -100
- doctr/models/predictor/tensorflow.py +0 -155
- doctr/models/preprocessor/tensorflow.py +0 -122
- doctr/models/recognition/crnn/tensorflow.py +0 -308
- doctr/models/recognition/master/tensorflow.py +0 -313
- doctr/models/recognition/parseq/tensorflow.py +0 -508
- doctr/models/recognition/predictor/tensorflow.py +0 -79
- doctr/models/recognition/sar/tensorflow.py +0 -416
- doctr/models/recognition/vitstr/tensorflow.py +0 -278
- doctr/models/utils/tensorflow.py +0 -182
- doctr/transforms/functional/tensorflow.py +0 -254
- doctr/transforms/modules/tensorflow.py +0 -562
- python_doctr-0.11.0.dist-info/RECORD +0 -173
- {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.11.0.dist-info → python_doctr-1.0.0.dist-info}/zip-safe +0 -0
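
Every backend-specific `*/tensorflow.py` module listed above (datasets, classification, detection, recognition, transforms, utils) is removed in 1.0.0, leaving PyTorch as the only backend shipped in the wheel. For orientation, a minimal end-to-end sketch using the high-level entry points, which this diff does not change (the sample file path is illustrative):

```python
import numpy as np

from doctr.io import DocumentFile
from doctr.models import ocr_predictor

# Load a document as a list of HWC numpy arrays (one per page)
pages: list[np.ndarray] = DocumentFile.from_images("sample.jpg")  # illustrative path

# Build the default detection + recognition pipeline (PyTorch-only in 1.0.0)
predictor = ocr_predictor(pretrained=True)

# Run OCR and export the structured result
result = predictor(pages)
print(result.export())
```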
doctr/models/predictor/base.py
CHANGED
@@ -116,18 +116,14 @@ class _OCRPredictor:
     def _generate_crops(
         pages: list[np.ndarray],
         loc_preds: list[np.ndarray],
-        channels_last: bool,
         assume_straight_pages: bool = False,
         assume_horizontal: bool = False,
     ) -> list[list[np.ndarray]]:
         if assume_straight_pages:
-            crops = [
-                extract_crops(page, _boxes[:, :4], channels_last=channels_last)
-                for page, _boxes in zip(pages, loc_preds)
-            ]
+            crops = [extract_crops(page, _boxes[:, :4]) for page, _boxes in zip(pages, loc_preds)]
         else:
             crops = [
-                extract_rcrops(page, _boxes[:, :4],
+                extract_rcrops(page, _boxes[:, :4], assume_horizontal=assume_horizontal)
                 for page, _boxes in zip(pages, loc_preds)
             ]
         return crops
@@ -136,11 +132,10 @@ class _OCRPredictor:
     def _prepare_crops(
         pages: list[np.ndarray],
         loc_preds: list[np.ndarray],
-        channels_last: bool,
         assume_straight_pages: bool = False,
         assume_horizontal: bool = False,
     ) -> tuple[list[list[np.ndarray]], list[np.ndarray]]:
-        crops = _OCRPredictor._generate_crops(pages, loc_preds,
+        crops = _OCRPredictor._generate_crops(pages, loc_preds, assume_straight_pages, assume_horizontal)

         # Avoid sending zero-sized crops
         is_kept = [[all(s > 0 for s in crop.shape) for crop in page_crops] for page_crops in crops]

doctr/models/predictor/pytorch.py
CHANGED
@@ -68,14 +68,14 @@ class OCRPredictor(nn.Module, _OCRPredictor):
     @torch.inference_mode()
     def forward(
         self,
-        pages: list[np.ndarray
+        pages: list[np.ndarray],
         **kwargs: Any,
     ) -> Document:
         # Dimension check
         if any(page.ndim != 3 for page in pages):
             raise ValueError("incorrect input shape: all pages are expected to be multi-channel 2D images.")

-        origin_page_shapes = [page.shape[:2]
+        origin_page_shapes = [page.shape[:2] for page in pages]

         # Localize text elements
         loc_preds, out_maps = self.det_predictor(pages, return_maps=True, **kwargs)
@@ -109,8 +109,6 @@ class OCRPredictor(nn.Module, _OCRPredictor):
         loc_preds = [list(loc_pred.values())[0] for loc_pred in loc_preds]
         # Detach objectness scores from loc_preds
         loc_preds, objectness_scores = detach_scores(loc_preds)
-        # Check whether crop mode should be switched to channels first
-        channels_last = len(pages) == 0 or isinstance(pages[0], np.ndarray)

         # Apply hooks to loc_preds if any
         for hook in self.hooks:
@@ -120,7 +118,6 @@ class OCRPredictor(nn.Module, _OCRPredictor):
         crops, loc_preds = self._prepare_crops(
             pages,
             loc_preds,
-            channels_last=channels_last,
             assume_straight_pages=self.assume_straight_pages,
             assume_horizontal=self._page_orientation_disabled,
         )
@@ -150,7 +147,7 @@ class OCRPredictor(nn.Module, _OCRPredictor):
             boxes,
             objectness_scores,
             text_preds,
-            origin_page_shapes,
+            origin_page_shapes,
             crop_orientations,
             orientations,
             languages_dict,
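
`OCRPredictor.forward` is now typed as `pages: list[np.ndarray]` and the `channels_last` bookkeeping is removed, so pages are expected as HWC numpy arrays. A hedged helper sketch (a hypothetical function, not part of docTR) for adapting a CHW torch tensor before calling the predictor:

```python
import numpy as np
import torch


def to_hwc_uint8(page: torch.Tensor | np.ndarray) -> np.ndarray:
    """Hypothetical helper: convert a CHW tensor (float in [0, 1]) to the HWC uint8 layout."""
    if isinstance(page, torch.Tensor):
        page = page.permute(1, 2, 0).cpu().numpy()  # CHW -> HWC
    if page.dtype != np.uint8:
        page = (np.clip(page, 0.0, 1.0) * 255).astype(np.uint8)
    return page
```

The removed `channels_last` check existed to support channels-first tensor pages; converting up front keeps such callers on the supported numpy path.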

doctr/models/preprocessor/pytorch.py
CHANGED
@@ -60,65 +60,60 @@ class PreProcessor(nn.Module):

         return batches

-    def sample_transforms(self, x: np.ndarray
+    def sample_transforms(self, x: np.ndarray) -> torch.Tensor:
         if x.ndim != 3:
             raise AssertionError("expected list of 3D Tensors")
-        if
-
-
-            x = torch.from_numpy(x.copy()).permute(2, 0, 1)
-        elif x.dtype not in (torch.uint8, torch.float16, torch.float32):
-            raise TypeError("unsupported data type for torch.Tensor")
+        if x.dtype not in (np.uint8, np.float32, np.float16):
+            raise TypeError("unsupported data type for numpy.ndarray")
+        tensor = torch.from_numpy(x.copy()).permute(2, 0, 1)
         # Resizing
-
+        tensor = self.resize(tensor)
         # Data type
-        if
-
+        if tensor.dtype == torch.uint8:
+            tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
         else:
-
+            tensor = tensor.to(dtype=torch.float32)

-        return
+        return tensor

-    def __call__(self, x:
+    def __call__(self, x: np.ndarray | list[np.ndarray]) -> list[torch.Tensor]:
         """Prepare document data for model forwarding

         Args:
-            x: list of images (np.array) or
+            x: list of images (np.array) or a single image (np.array) of shape (H, W, C)

         Returns:
-            list of page batches
+            list of page batches (*, C, H, W) ready for model inference
         """
         # Input type check
-        if isinstance(x,
+        if isinstance(x, np.ndarray):
             if x.ndim != 4:
                 raise AssertionError("expected 4D Tensor")
-            if
-
-
-
-            elif x.dtype not in (torch.uint8, torch.float16, torch.float32):
-                raise TypeError("unsupported data type for torch.Tensor")
+            if x.dtype not in (np.uint8, np.float32, np.float16):
+                raise TypeError("unsupported data type for numpy.ndarray")
+            tensor = torch.from_numpy(x.copy()).permute(0, 3, 1, 2)
+
             # Resizing
-            if
-
-
+            if tensor.shape[-2] != self.resize.size[0] or tensor.shape[-1] != self.resize.size[1]:
+                tensor = F.resize(
+                    tensor, self.resize.size, interpolation=self.resize.interpolation, antialias=self.resize.antialias
                 )
             # Data type
-            if
-
+            if tensor.dtype == torch.uint8:
+                tensor = tensor.to(dtype=torch.float32).div(255).clip(0, 1)
             else:
-
-            batches = [
+                tensor = tensor.to(dtype=torch.float32)
+            batches = [tensor]

-        elif isinstance(x, list) and all(isinstance(sample,
+        elif isinstance(x, list) and all(isinstance(sample, np.ndarray) for sample in x):
             # Sample transform (to tensor, resize)
             samples = list(multithread_exec(self.sample_transforms, x))
             # Batching
-            batches = self.batch_inputs(samples)
+            batches = self.batch_inputs(samples)
         else:
             raise TypeError(f"invalid input type: {type(x)}")

         # Batch transforms (normalize)
         batches = list(multithread_exec(self.normalize, batches))

-        return batches
+        return batches
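
The rewritten `sample_transforms` accepts only numpy input and always converts HWC numpy to a CHW float32 tensor in [0, 1]. A standalone sketch of that conversion, mirroring the added lines above (without the resize step, which depends on the preprocessor's configured `self.resize`):

```python
import numpy as np
import torch


def numpy_page_to_tensor(x: np.ndarray) -> torch.Tensor:
    """HWC numpy image -> CHW float32 tensor in [0, 1], as in the new sample_transforms."""
    if x.ndim != 3:
        raise AssertionError("expected a 3D array (H, W, C)")
    if x.dtype not in (np.uint8, np.float32, np.float16):
        raise TypeError("unsupported data type for numpy.ndarray")
    tensor = torch.from_numpy(x.copy()).permute(2, 0, 1)
    if tensor.dtype == torch.uint8:
        return tensor.to(dtype=torch.float32).div(255).clip(0, 1)
    return tensor.to(dtype=torch.float32)


# Example: a dummy uint8 page
page = np.random.randint(0, 256, (32, 128, 3), dtype=np.uint8)
tensor = numpy_page_to_tensor(page)
print(tensor.shape, tensor.dtype)  # torch.Size([3, 32, 128]) torch.float32
```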

doctr/models/recognition/crnn/pytorch.py
CHANGED
@@ -15,7 +15,7 @@ from torch.nn import functional as F
 from doctr.datasets import VOCABS, decode_sequence

 from ...classification import mobilenet_v3_large_r, mobilenet_v3_small_r, vgg16_bn_r
-from ...utils
+from ...utils import load_pretrained_params
 from ..core import RecognitionModel, RecognitionPostProcessor

 __all__ = ["CRNN", "crnn_vgg16_bn", "crnn_mobilenet_v3_small", "crnn_mobilenet_v3_large"]
@@ -25,8 +25,8 @@ default_cfgs: dict[str, dict[str, Any]] = {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (3, 32, 128),
-        "vocab": VOCABS["
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "vocab": VOCABS["french"],
+        "url": "https://doctr-static.mindee.com/models?id=v0.12.0/crnn_vgg16_bn-0417f351.pt&src=0",
     },
     "crnn_mobilenet_v3_small": {
         "mean": (0.694, 0.695, 0.693),
@@ -82,7 +82,7 @@ class CTCPostProcessor(RecognitionPostProcessor):

     def __call__(self, logits: torch.Tensor) -> list[tuple[str, float]]:
         """Performs decoding of raw output with CTC and decoding of CTC predictions
-        with label_to_idx mapping
+        with label_to_idx mapping dictionary

         Args:
             logits: raw output of the model, shape (N, C + 1, seq_len)
@@ -155,6 +155,15 @@ class CRNN(RecognitionModel, nn.Module):
                 m.weight.data.fill_(1.0)
                 m.bias.data.zero_()

+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def compute_loss(
         self,
         model_output: torch.Tensor,
@@ -214,7 +223,7 @@ class CRNN(RecognitionModel, nn.Module):

         if target is None or return_preds:
             # Disable for torch.compile compatibility
-            @torch.compiler.disable
+            @torch.compiler.disable
             def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
                 return self.postprocessor(logits)

@@ -248,13 +257,13 @@ def _crnn(
         _cfg["input_shape"] = kwargs["input_shape"]

     # Build the model
-    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)
+    model = CRNN(feat_extractor, cfg=_cfg, **kwargs)  # type: ignore[arg-type]
     # Load pretrained parameters
     if pretrained:
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-
+        model.from_pretrained(_cfg["url"], ignore_keys=_ignore_keys)

     return model

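
The CRNN diff (and the MASTER and PARSeq diffs below) adds a `from_pretrained(path_or_url, **kwargs)` method that wraps `load_pretrained_params`, and the `_crnn` builder now calls `model.from_pretrained(_cfg["url"], ...)` instead of the free function. A hedged usage sketch (the checkpoint path is illustrative):

```python
from doctr.models.recognition import crnn_vgg16_bn

# Build the architecture without weights, then load a checkpoint from a local path or URL
model = crnn_vgg16_bn(pretrained=False)
model.from_pretrained("path/to/crnn_vgg16_bn_checkpoint.pt")  # illustrative path

# The usual one-liner still works and now routes through the same method internally
model = crnn_vgg16_bn(pretrained=True)
```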

doctr/models/recognition/master/pytorch.py
CHANGED
@@ -16,7 +16,7 @@ from doctr.datasets import VOCABS
 from doctr.models.classification import magc_resnet31
 from doctr.models.modules.transformer import Decoder, PositionalEncoding

-from ...utils
+from ...utils import _bf16_to_float32, load_pretrained_params
 from .base import _MASTER, _MASTERPostProcessor

 __all__ = ["MASTER", "master"]
@@ -107,7 +107,7 @@ class MASTER(_MASTER, nn.Module):
         # NOTE: nn.TransformerDecoder takes the inverse from this implementation
         # [True, True, True, ..., False, False, False] -> False is masked
         # (N, 1, 1, max_length)
-        target_pad_mask = (target != self.vocab_size + 2).unsqueeze(1).unsqueeze(1)
+        target_pad_mask = (target != self.vocab_size + 2).unsqueeze(1).unsqueeze(1)
         target_length = target.size(1)
         # sub mask filled diagonal with True = see and False = masked (max_length, max_length)
         # NOTE: onnxruntime tril/triu works only with float currently (onnxruntime 1.11.1 - opset 14)
@@ -140,7 +140,7 @@ class MASTER(_MASTER, nn.Module):
         # Input length : number of timesteps
         input_len = model_output.shape[1]
         # Add one for additional <eos> token (sos disappear in shift!)
-        seq_len = seq_len + 1
+        seq_len = seq_len + 1
         # Compute loss: don't forget to shift gt! Otherwise the model learns to output the gt[t-1]!
         # The "masked" first gt char is <sos>. Delete last logit of the model output.
         cce = F.cross_entropy(model_output[:, :-1, :].permute(0, 2, 1), gt[:, 1:], reduction="none")
@@ -151,6 +151,15 @@ class MASTER(_MASTER, nn.Module):
         ce_loss = cce.sum(1) / seq_len.to(dtype=model_output.dtype)
         return ce_loss.mean()

+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def forward(
         self,
         x: torch.Tensor,
@@ -167,7 +176,7 @@ class MASTER(_MASTER, nn.Module):
             return_preds: if True, decode logits

         Returns:
-            A
+            A dictionary containing eventually loss, logits and predictions.
         """
         # Encode
         features = self.feat_extractor(x)["features"]
@@ -210,7 +219,7 @@ class MASTER(_MASTER, nn.Module):

         if return_preds:
             # Disable for torch.compile compatibility
-            @torch.compiler.disable
+            @torch.compiler.disable
             def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
                 return self.postprocessor(logits)

@@ -301,7 +310,7 @@ def _master(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)

     return model


doctr/models/recognition/parseq/pytorch.py
CHANGED
@@ -19,7 +19,7 @@ from doctr.datasets import VOCABS
 from doctr.models.modules.transformer import MultiHeadAttention, PositionwiseFeedForward

 from ...classification import vit_s
-from ...utils
+from ...utils import _bf16_to_float32, load_pretrained_params
 from .base import _PARSeq, _PARSeqPostProcessor

 __all__ = ["PARSeq", "parseq"]
@@ -76,8 +76,6 @@ class PARSeqDecoder(nn.Module):
         self.cross_attention = MultiHeadAttention(num_heads, d_model, dropout=dropout)
         self.position_feed_forward = PositionwiseFeedForward(d_model, ffd * ffd_ratio, dropout, nn.GELU())

-        self.attention_norm = nn.LayerNorm(d_model, eps=1e-5)
-        self.cross_attention_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.query_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.content_norm = nn.LayerNorm(d_model, eps=1e-5)
         self.feed_forward_norm = nn.LayerNorm(d_model, eps=1e-5)
@@ -173,6 +171,26 @@ class PARSeq(_PARSeq, nn.Module):
                 nn.init.constant_(m.weight, 1)
                 nn.init.constant_(m.bias, 0)

+    def from_pretrained(self, path_or_url: str, **kwargs: Any) -> None:
+        """Load pretrained parameters onto the model
+
+        Args:
+            path_or_url: the path or URL to the model parameters (checkpoint)
+            **kwargs: additional arguments to be passed to `doctr.models.utils.load_pretrained_params`
+        """
+        # NOTE: This is required to make the model backward compatible with already trained models docTR version <0.11.1
+        # ref.: https://github.com/mindee/doctr/issues/1911
+        if kwargs.get("ignore_keys") is None:
+            kwargs["ignore_keys"] = []
+
+        kwargs["ignore_keys"].extend([
+            "decoder.attention_norm.weight",
+            "decoder.attention_norm.bias",
+            "decoder.cross_attention_norm.weight",
+            "decoder.cross_attention_norm.bias",
+        ])
+        load_pretrained_params(self, path_or_url, **kwargs)
+
     def generate_permutations(self, seqlen: torch.Tensor) -> torch.Tensor:
         # Generates permutations of the target sequence.
         # Borrowed from https://github.com/baudm/parseq/blob/main/strhub/models/parseq/system.py
@@ -210,7 +228,7 @@ class PARSeq(_PARSeq, nn.Module):

         sos_idx = torch.zeros(len(final_perms), 1, device=seqlen.device)
         eos_idx = torch.full((len(final_perms), 1), max_num_chars + 1, device=seqlen.device)
-        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
+        combined = torch.cat([sos_idx, final_perms + 1, eos_idx], dim=1).int()
         if len(combined) > 1:
             combined[1, 1:] = max_num_chars + 1 - torch.arange(max_num_chars + 1, device=seqlen.device)
         return combined
@@ -281,7 +299,7 @@ class PARSeq(_PARSeq, nn.Module):

             # Stop decoding if all sequences have reached the EOS token
             # NOTE: `break` isn't correctly translated to Onnx so we don't break here if we want to export
-            if not self.exportable and max_len is None and (ys == self.vocab_size).any(dim=-1).all():
+            if not self.exportable and max_len is None and (ys == self.vocab_size).any(dim=-1).all():
                 break

         logits = torch.cat(pos_logits, dim=1)  # (N, max_length, vocab_size + 1)
@@ -296,7 +314,7 @@ class PARSeq(_PARSeq, nn.Module):

             # Create padding mask for refined target input maskes all behind EOS token as False
             # (N, 1, 1, max_length)
-            target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)
+            target_pad_mask = ~((ys == self.vocab_size).int().cumsum(-1) > 0).unsqueeze(1).unsqueeze(1)
             mask = (target_pad_mask.bool() & query_mask[:, : ys.shape[1]].bool()).int()
             logits = self.head(self.decode(ys, features, mask, target_query=pos_queries))

@@ -373,7 +391,7 @@ class PARSeq(_PARSeq, nn.Module):

         if target is None or return_preds:
             # Disable for torch.compile compatibility
-            @torch.compiler.disable
+            @torch.compiler.disable
             def _postprocess(logits: torch.Tensor) -> list[tuple[str, float]]:
                 return self.postprocessor(logits)

@@ -448,7 +466,7 @@ def _parseq(
         # The number of classes is not the same as the number of classes in the pretrained model =>
         # remove the last layer weights
         _ignore_keys = ignore_keys if _cfg["vocab"] != default_cfgs[arch]["vocab"] else None
-
+        model.from_pretrained(default_cfgs[arch]["url"], ignore_keys=_ignore_keys)

     return model


doctr/models/recognition/predictor/_utils.py
CHANGED
@@ -4,6 +4,8 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.


+import math
+
 import numpy as np

 from ..utils import merge_multi_strings
@@ -15,69 +17,120 @@ def split_crops(
     crops: list[np.ndarray],
     max_ratio: float,
     target_ratio: int,
-
-
-
-
+    split_overlap_ratio: float,
+) -> tuple[list[np.ndarray], list[int | tuple[int, int, float]], bool]:
+    """
+    Split crops horizontally if they exceed a given aspect ratio.

     Args:
-        crops:
-        max_ratio:
-        target_ratio:
-
-        channels_last: whether the numpy array has dimensions in channels last order
+        crops: List of image crops (H, W, C).
+        max_ratio: Aspect ratio threshold above which crops are split.
+        target_ratio: Target aspect ratio after splitting (e.g., 4 for 128x32).
+        split_overlap_ratio: Desired overlap between splits (as a fraction of split width).

     Returns:
-
+        A tuple containing:
+        - The new list of crops (possibly with splits),
+        - A mapping indicating how to reassemble predictions,
+        - A boolean indicating whether remapping is required.
     """
-
-
+    if split_overlap_ratio <= 0.0 or split_overlap_ratio >= 1.0:
+        raise ValueError(f"Valid range for split_overlap_ratio is (0.0, 1.0), but is: {split_overlap_ratio}")
+
+    remap_required = False
     new_crops: list[np.ndarray] = []
+    crop_map: list[int | tuple[int, int, float]] = []
+
     for crop in crops:
-        h, w = crop.shape[:2]
+        h, w = crop.shape[:2]
         aspect_ratio = w / h
+
         if aspect_ratio > max_ratio:
-
-
-
-
-
-            #
-            if
-
-
-
-
+            split_width = max(1, math.ceil(h * target_ratio))
+            overlap_width = max(0, math.floor(split_width * split_overlap_ratio))
+
+            splits, last_overlap = _split_horizontally(crop, split_width, overlap_width)
+
+            # Remove any empty splits
+            splits = [s for s in splits if all(dim > 0 for dim in s.shape)]
+            if splits:
+                crop_map.append((len(new_crops), len(new_crops) + len(splits), last_overlap))
+                new_crops.extend(splits)
+                remap_required = True
             else:
-
-
-
-                ]
-            # Avoid sending zero-sized crops
-            _crops = [crop for crop in _crops if all(s > 0 for s in crop.shape)]
-            # Record the slice of crops
-            crop_map.append((len(new_crops), len(new_crops) + len(_crops)))
-            new_crops.extend(_crops)
-            # At least one crop will require merging
-            _remap_required = True
+                # Fallback: treat it as a single crop
+                crop_map.append(len(new_crops))
+                new_crops.append(crop)
         else:
             crop_map.append(len(new_crops))
             new_crops.append(crop)

-    return new_crops, crop_map,
+    return new_crops, crop_map, remap_required
+
+
+def _split_horizontally(image: np.ndarray, split_width: int, overlap_width: int) -> tuple[list[np.ndarray], float]:
+    """
+    Horizontally split a single image with overlapping regions.
+
+    Args:
+        image: The image to split (H, W, C).
+        split_width: Width of each split.
+        overlap_width: Width of the overlapping region.
+
+    Returns:
+        - A list of horizontal image slices.
+        - The actual overlap ratio of the last split.
+    """
+    image_width = image.shape[1]
+    if image_width <= split_width:
+        return [image], 0.0
+
+    # Compute start columns for each split
+    step = split_width - overlap_width
+    starts = list(range(0, image_width - split_width + 1, step))
+
+    # Ensure the last patch reaches the end of the image
+    if starts[-1] + split_width < image_width:
+        starts.append(image_width - split_width)
+
+    splits = []
+    for start_col in starts:
+        end_col = start_col + split_width
+        splits.append(image[:, start_col:end_col, :])
+
+    # Calculate the last overlap ratio, if only one split no overlap
+    last_overlap = 0
+    if len(starts) > 1:
+        last_overlap = (starts[-2] + split_width) - starts[-1]
+    last_overlap_ratio = last_overlap / split_width if split_width else 0.0
+
+    return splits, last_overlap_ratio


 def remap_preds(
-    preds: list[tuple[str, float]],
+    preds: list[tuple[str, float]],
+    crop_map: list[int | tuple[int, int, float]],
+    overlap_ratio: float,
 ) -> list[tuple[str, float]]:
-
-
-
-
-
+    """
+    Reconstruct predictions from possibly split crops.
+
+    Args:
+        preds: List of (text, confidence) tuples from each crop.
+        crop_map: Map returned by `split_crops`.
+        overlap_ratio: Overlap ratio used during splitting.
+
+    Returns:
+        List of merged (text, confidence) tuples corresponding to original crops.
+    """
+    remapped = []
+    for item in crop_map:
+        if isinstance(item, int):
+            remapped.append(preds[item])
         else:
-
-
-
-
-
+            start_idx, end_idx, last_overlap = item
+            text_parts, confidences = zip(*preds[start_idx:end_idx])
+            merged_text = merge_multi_strings(list(text_parts), overlap_ratio, last_overlap)
+            merged_conf = sum(confidences) / len(confidences)  # average confidence
+            remapped.append((merged_text, merged_conf))
+    return remapped
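
The reworked `split_crops` replaces the old dilation-based sub-crops with fixed-width, overlapping splits computed by `_split_horizontally`, and `remap_preds` merges the per-split predictions back with `merge_multi_strings`. A small standalone sketch of the split geometry, using the same arithmetic as the added `_split_horizontally` on a made-up crop size:

```python
import math

# A hypothetical wide crop: 32 px high, 400 px wide, target aspect ratio 4 -> 128 px splits
height, width, target_ratio, split_overlap_ratio = 32, 400, 4, 0.5

split_width = max(1, math.ceil(height * target_ratio))                  # 128
overlap_width = max(0, math.floor(split_width * split_overlap_ratio))   # 64
step = split_width - overlap_width                                      # 64

starts = list(range(0, width - split_width + 1, step))                  # [0, 64, 128, 192, 256]
if starts[-1] + split_width < width:
    starts.append(width - split_width)                                  # append 272 to cover the full width

# Each (start, start + split_width) column range becomes one crop fed to the recognizer;
# the last split overlaps its predecessor by (starts[-2] + split_width) - starts[-1] = 112 px,
# and remap_preds uses that ratio (112 / 128) when merging the overlapping text predictions.
print([(s, s + split_width) for s in starts])
```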