python-doctr 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (61)
  1. doctr/datasets/cord.py +10 -1
  2. doctr/datasets/funsd.py +11 -1
  3. doctr/datasets/ic03.py +11 -1
  4. doctr/datasets/ic13.py +10 -1
  5. doctr/datasets/iiit5k.py +26 -16
  6. doctr/datasets/imgur5k.py +10 -1
  7. doctr/datasets/sroie.py +11 -1
  8. doctr/datasets/svhn.py +11 -1
  9. doctr/datasets/svt.py +11 -1
  10. doctr/datasets/synthtext.py +11 -1
  11. doctr/datasets/utils.py +7 -2
  12. doctr/datasets/vocabs.py +6 -2
  13. doctr/datasets/wildreceipt.py +12 -1
  14. doctr/file_utils.py +19 -0
  15. doctr/io/elements.py +12 -4
  16. doctr/models/builder.py +2 -2
  17. doctr/models/classification/magc_resnet/tensorflow.py +13 -6
  18. doctr/models/classification/mobilenet/pytorch.py +2 -0
  19. doctr/models/classification/mobilenet/tensorflow.py +14 -8
  20. doctr/models/classification/predictor/pytorch.py +11 -7
  21. doctr/models/classification/predictor/tensorflow.py +10 -6
  22. doctr/models/classification/resnet/tensorflow.py +21 -8
  23. doctr/models/classification/textnet/tensorflow.py +11 -5
  24. doctr/models/classification/vgg/tensorflow.py +9 -3
  25. doctr/models/classification/vit/tensorflow.py +10 -4
  26. doctr/models/classification/zoo.py +22 -10
  27. doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
  28. doctr/models/detection/fast/tensorflow.py +14 -11
  29. doctr/models/detection/linknet/tensorflow.py +23 -11
  30. doctr/models/detection/predictor/tensorflow.py +2 -2
  31. doctr/models/factory/hub.py +5 -6
  32. doctr/models/kie_predictor/base.py +4 -0
  33. doctr/models/kie_predictor/pytorch.py +4 -0
  34. doctr/models/kie_predictor/tensorflow.py +8 -1
  35. doctr/models/modules/transformer/tensorflow.py +0 -2
  36. doctr/models/modules/vision_transformer/pytorch.py +1 -1
  37. doctr/models/modules/vision_transformer/tensorflow.py +1 -1
  38. doctr/models/predictor/base.py +24 -12
  39. doctr/models/predictor/pytorch.py +4 -0
  40. doctr/models/predictor/tensorflow.py +8 -1
  41. doctr/models/preprocessor/tensorflow.py +1 -1
  42. doctr/models/recognition/crnn/tensorflow.py +8 -6
  43. doctr/models/recognition/master/tensorflow.py +9 -4
  44. doctr/models/recognition/parseq/tensorflow.py +10 -8
  45. doctr/models/recognition/sar/tensorflow.py +7 -3
  46. doctr/models/recognition/vitstr/tensorflow.py +9 -4
  47. doctr/models/utils/pytorch.py +1 -1
  48. doctr/models/utils/tensorflow.py +15 -15
  49. doctr/transforms/functional/pytorch.py +1 -1
  50. doctr/transforms/modules/pytorch.py +7 -6
  51. doctr/transforms/modules/tensorflow.py +15 -12
  52. doctr/utils/geometry.py +106 -19
  53. doctr/utils/metrics.py +1 -1
  54. doctr/utils/reconstitution.py +151 -65
  55. doctr/version.py +1 -1
  56. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/METADATA +11 -11
  57. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/RECORD +61 -61
  58. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
  59. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
  60. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
  61. {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/models/classification/magc_resnet/tensorflow.py
@@ -9,12 +9,12 @@ from functools import partial
 from typing import Any, Dict, List, Optional, Tuple
 
 import tensorflow as tf
-from tensorflow.keras import layers
+from tensorflow.keras import activations, layers
 from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
 
-from ...utils import load_pretrained_params
+from ...utils import _build_model, load_pretrained_params
 from ..resnet.tensorflow import ResNet
 
 __all__ = ["magc_resnet31"]
@@ -26,7 +26,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.6.0/magc_resnet31-addbb705.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/magc_resnet31-16aa7d71.weights.h5&src=0",
     },
 }
 
@@ -57,6 +57,7 @@ class MAGC(layers.Layer):
         self.headers = headers  # h
         self.inplanes = inplanes  # C
         self.attn_scale = attn_scale
+        self.ratio = ratio
         self.planes = int(inplanes * ratio)
 
         self.single_header_inplanes = int(inplanes / headers)  # C / h
@@ -97,7 +98,7 @@ class MAGC(layers.Layer):
         if self.attn_scale and self.headers > 1:
             context_mask = context_mask / math.sqrt(self.single_header_inplanes)
         # B*h, 1, H*W, 1
-        context_mask = tf.keras.activations.softmax(context_mask, axis=2)
+        context_mask = activations.softmax(context_mask, axis=2)
 
         # Compute context
         # B*h, 1, C/h, 1
@@ -114,7 +115,7 @@ class MAGC(layers.Layer):
         # Context modeling: B, H, W, C -> B, 1, 1, C
         context = self.context_modeling(inputs)
         # Transform: B, 1, 1, C -> B, 1, 1, C
-        transformed = self.transform(context)
+        transformed = self.transform(context, **kwargs)
         return inputs + transformed
 
 
@@ -151,9 +152,15 @@ def _magc_resnet(
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
    # Load pretrained parameters
    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
    return model
 
doctr/models/classification/mobilenet/pytorch.py
@@ -9,12 +9,14 @@ from copy import deepcopy
 from typing import Any, Dict, List, Optional
 
 from torchvision.models import mobilenetv3
+from torchvision.models.mobilenetv3 import MobileNetV3
 
 from doctr.datasets import VOCABS
 
 from ...utils import load_pretrained_params
 
 __all__ = [
+    "MobileNetV3",
     "mobilenet_v3_small",
     "mobilenet_v3_small_r",
     "mobilenet_v3_large",
doctr/models/classification/mobilenet/tensorflow.py
@@ -13,7 +13,7 @@ from tensorflow.keras import layers
 from tensorflow.keras.models import Sequential
 
 from ....datasets import VOCABS
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = [
     "MobileNetV3",
@@ -32,42 +32,42 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_large-47d25d7e.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large-d857506e.weights.h5&src=0",
     },
     "mobilenet_v3_large_r": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_large_r-a108e192.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large_r-eef2e3c6.weights.h5&src=0",
     },
     "mobilenet_v3_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_small-8a32c32c.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small-3fcebad7.weights.h5&src=0",
     },
     "mobilenet_v3_small_r": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_small_r-3d61452e.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_r-dd50218d.weights.h5&src=0",
     },
     "mobilenet_v3_small_crop_orientation": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (128, 128, 3),
         "classes": [0, -90, 180, 90],
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/classif_mobilenet_v3_small-1ea8db03.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_crop_orientation-ef019b6b.weights.h5&src=0",
     },
     "mobilenet_v3_small_page_orientation": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (512, 512, 3),
         "classes": [0, -90, 180, 90],
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_page_orientation-aec9553e.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_page_orientation-0071d55d.weights.h5&src=0",
     },
 }
 
@@ -295,9 +295,15 @@ def _mobilenet_v3(arch: str, pretrained: bool, rect_strides: bool = False, **kwa
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
doctr/models/classification/predictor/pytorch.py
@@ -3,7 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -27,12 +27,12 @@ class OrientationPredictor(nn.Module):
 
     def __init__(
         self,
-        pre_processor: PreProcessor,
-        model: nn.Module,
+        pre_processor: Optional[PreProcessor],
+        model: Optional[nn.Module],
     ) -> None:
         super().__init__()
-        self.pre_processor = pre_processor
-        self.model = model.eval()
+        self.pre_processor = pre_processor if isinstance(pre_processor, PreProcessor) else None
+        self.model = model.eval() if isinstance(model, nn.Module) else None
 
     @torch.inference_mode()
     def forward(
@@ -43,12 +43,16 @@ class OrientationPredictor(nn.Module):
         if any(input.ndim != 3 for input in inputs):
             raise ValueError("incorrect input shape: all inputs are expected to be multi-channel 2D images.")
 
+        if self.model is None or self.pre_processor is None:
+            # predictor is disabled
+            return [[0] * len(inputs), [0] * len(inputs), [1.0] * len(inputs)]
+
         processed_batches = self.pre_processor(inputs)
         _params = next(self.model.parameters())
         self.model, processed_batches = set_device_and_dtype(
             self.model, processed_batches, _params.device, _params.dtype
         )
-        predicted_batches = [self.model(batch) for batch in processed_batches]
+        predicted_batches = [self.model(batch) for batch in processed_batches]  # type: ignore[misc]
         # confidence
         probs = [
             torch.max(torch.softmax(batch, dim=1), dim=1).values.cpu().detach().numpy() for batch in predicted_batches
@@ -57,7 +61,7 @@ class OrientationPredictor(nn.Module):
         predicted_batches = [out_batch.argmax(dim=1).cpu().detach().numpy() for out_batch in predicted_batches]
 
         class_idxs = [int(pred) for batch in predicted_batches for pred in batch]
-        classes = [int(self.model.cfg["classes"][idx]) for idx in class_idxs]
+        classes = [int(self.model.cfg["classes"][idx]) for idx in class_idxs]  # type: ignore[union-attr]
         confs = [round(float(p), 2) for prob in probs for p in prob]
 
         return [class_idxs, classes, confs]
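With both the model and the pre-processor set to None, the predictor short-circuits and reports a neutral orientation for every input: class index 0, angle 0, confidence 1.0. A small sketch of that behaviour, assuming uint8 HWC crops as elsewhere in doctr:

    import numpy as np

    predictor = OrientationPredictor(None, None)  # disabled predictor
    crops = [np.zeros((64, 64, 3), dtype=np.uint8)] * 2
    class_idxs, angles, confs = predictor(crops)
    # -> [0, 0], [0, 0], [1.0, 1.0]: every crop is treated as upright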
doctr/models/classification/predictor/tensorflow.py
@@ -3,11 +3,11 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import List, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import tensorflow as tf
-from tensorflow import keras
+from tensorflow.keras import Model
 
 from doctr.models.preprocessor import PreProcessor
 from doctr.utils.repr import NestedObject
@@ -29,11 +29,11 @@ class OrientationPredictor(NestedObject):
 
     def __init__(
         self,
-        pre_processor: PreProcessor,
-        model: keras.Model,
+        pre_processor: Optional[PreProcessor],
+        model: Optional[Model],
     ) -> None:
-        self.pre_processor = pre_processor
-        self.model = model
+        self.pre_processor = pre_processor if isinstance(pre_processor, PreProcessor) else None
+        self.model = model if isinstance(model, Model) else None
 
     def __call__(
         self,
@@ -43,6 +43,10 @@ class OrientationPredictor(NestedObject):
         if any(input.ndim != 3 for input in inputs):
             raise ValueError("incorrect input shape: all inputs are expected to be multi-channel 2D images.")
 
+        if self.model is None or self.pre_processor is None:
+            # predictor is disabled
+            return [[0] * len(inputs), [0] * len(inputs), [1.0] * len(inputs)]
+
         processed_batches = self.pre_processor(inputs)
         predicted_batches = [self.model(batch, training=False) for batch in processed_batches]
 
doctr/models/classification/resnet/tensorflow.py
@@ -13,7 +13,7 @@ from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
 
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["ResNet", "resnet18", "resnet31", "resnet34", "resnet50", "resnet34_wide"]
 
@@ -24,35 +24,35 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/resnet18-d4634669.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet18-f42d3854.weights.h5&src=0",
     },
     "resnet31": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet31-5a47a60b.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet31-ab75f78c.weights.h5&src=0",
     },
     "resnet34": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet34-5dcc97ca.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34-03967df9.weights.h5&src=0",
     },
     "resnet50": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet50-e75e4cdf.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet50-82358f34.weights.h5&src=0",
     },
     "resnet34_wide": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.5.0/resnet34_wide-c1271816.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/resnet34_wide-b18fdf79.weights.h5&src=0",
     },
 }
 
@@ -210,9 +210,15 @@ def _resnet(
     model = ResNet(
         num_blocks, output_channels, stage_downsample, stage_conv, stage_pooling, origin_stem, cfg=_cfg, **kwargs
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
@@ -354,10 +360,17 @@ def resnet50(pretrained: bool = False, **kwargs: Any) -> ResNet:
     )
 
     model.cfg = _cfg
+    _build_model(model)
 
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs["resnet50"]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model,
+            default_cfgs["resnet50"]["url"],
+            skip_mismatch=kwargs["num_classes"] != len(default_cfgs["resnet50"]["classes"]),
+        )
 
     return model
 
doctr/models/classification/textnet/tensorflow.py
@@ -12,7 +12,7 @@ from tensorflow.keras import Sequential, layers
 from doctr.datasets import VOCABS
 
 from ...modules.layers.tensorflow import FASTConvLayer
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["textnet_tiny", "textnet_small", "textnet_base"]
 
@@ -22,21 +22,21 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/textnet_tiny-fe9cc245.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/textnet_tiny-a29eeb4a.weights.h5&src=0",
     },
     "textnet_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/textnet_small-29c39c82.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/textnet_small-1c2df0e3.weights.h5&src=0",
     },
     "textnet_base": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/textnet_base-168aa82c.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/textnet_base-8b4b89bc.weights.h5&src=0",
     },
 }
 
@@ -111,9 +111,15 @@ def _textnet(
 
     # Build the model
     model = TextNet(cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
doctr/models/classification/vgg/tensorflow.py
@@ -11,7 +11,7 @@ from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
 
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = ["VGG", "vgg16_bn_r"]
 
@@ -22,7 +22,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (1.0, 1.0, 1.0),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.4.1/vgg16_bn_r-c5836cea.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vgg16_bn_r-b4d69212.weights.h5&src=0",
     },
 }
 
@@ -81,9 +81,15 @@ def _vgg(
 
     # Build the model
     model = VGG(num_blocks, planes, rect_pools, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
doctr/models/classification/vit/tensorflow.py
@@ -14,7 +14,7 @@ from doctr.models.modules.transformer import EncoderBlock
 from doctr.models.modules.vision_transformer.tensorflow import PatchEmbedding
 from doctr.utils.repr import NestedObject
 
-from ...utils import load_pretrained_params
+from ...utils import _build_model, load_pretrained_params
 
 __all__ = ["vit_s", "vit_b"]
 
@@ -25,14 +25,14 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (3, 32, 32),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.6.0/vit_s-6300fcc9.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vit_s-69bc459e.weights.h5&src=0",
     },
     "vit_b": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.6.0/vit_b-57158446.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vit_b-c64705bd.weights.h5&src=0",
     },
 }
 
@@ -121,9 +121,15 @@ def _vit(
 
     # Build the model
     model = VisionTransformer(cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
doctr/models/classification/zoo.py
@@ -34,15 +34,27 @@ ARCHS: List[str] = [
 ORIENTATION_ARCHS: List[str] = ["mobilenet_v3_small_crop_orientation", "mobilenet_v3_small_page_orientation"]
 
 
-def _orientation_predictor(arch: str, pretrained: bool, **kwargs: Any) -> OrientationPredictor:
-    if arch not in ORIENTATION_ARCHS:
-        raise ValueError(f"unknown architecture '{arch}'")
+def _orientation_predictor(
+    arch: Any, pretrained: bool, model_type: str, disabled: bool = False, **kwargs: Any
+) -> OrientationPredictor:
+    if disabled:
+        # Case where the orientation predictor is disabled
+        return OrientationPredictor(None, None)
+
+    if isinstance(arch, str):
+        if arch not in ORIENTATION_ARCHS:
+            raise ValueError(f"unknown architecture '{arch}'")
+
+        # Load directly classifier from backbone
+        _model = classification.__dict__[arch](pretrained=pretrained)
+    else:
+        if not isinstance(arch, classification.MobileNetV3):
+            raise ValueError(f"unknown architecture: {type(arch)}")
+        _model = arch
 
-    # Load directly classifier from backbone
-    _model = classification.__dict__[arch](pretrained=pretrained)
     kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
     kwargs["std"] = kwargs.get("std", _model.cfg["std"])
-    kwargs["batch_size"] = kwargs.get("batch_size", 128 if "crop" in arch else 4)
+    kwargs["batch_size"] = kwargs.get("batch_size", 128 if model_type == "crop" else 4)
     input_shape = _model.cfg["input_shape"][:-1] if is_tf_available() else _model.cfg["input_shape"][1:]
     predictor = OrientationPredictor(
         PreProcessor(input_shape, preserve_aspect_ratio=True, symmetric_pad=True, **kwargs), _model
@@ -51,7 +63,7 @@ def _orientation_predictor(arch: str, pretrained: bool, **kwargs: Any) -> Orient
 
 
 def crop_orientation_predictor(
-    arch: str = "mobilenet_v3_small_crop_orientation", pretrained: bool = False, **kwargs: Any
+    arch: Any = "mobilenet_v3_small_crop_orientation", pretrained: bool = False, **kwargs: Any
 ) -> OrientationPredictor:
     """Crop orientation classification architecture.
 
@@ -71,11 +83,11 @@ def crop_orientation_predictor(
     -------
     OrientationPredictor
     """
-    return _orientation_predictor(arch, pretrained, **kwargs)
+    return _orientation_predictor(arch, pretrained, model_type="crop", **kwargs)
 
 
 def page_orientation_predictor(
-    arch: str = "mobilenet_v3_small_page_orientation", pretrained: bool = False, **kwargs: Any
+    arch: Any = "mobilenet_v3_small_page_orientation", pretrained: bool = False, **kwargs: Any
 ) -> OrientationPredictor:
     """Page orientation classification architecture.
 
@@ -95,4 +107,4 @@ def page_orientation_predictor(
     -------
     OrientationPredictor
     """
-    return _orientation_predictor(arch, pretrained, **kwargs)
+    return _orientation_predictor(arch, pretrained, model_type="page", **kwargs)
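Taken together, `crop_orientation_predictor` and `page_orientation_predictor` now accept an architecture name, a ready-made MobileNetV3 classifier, or `disabled=True` (forwarded through `**kwargs` to `_orientation_predictor`). A sketch of the two new call styles, assuming the usual re-exports from `doctr.models.classification`:

    from doctr.models import classification
    from doctr.models.classification import crop_orientation_predictor

    # Disable the predictor: yields OrientationPredictor(None, None),
    # i.e. the neutral outputs shown earlier.
    no_op = crop_orientation_predictor(disabled=True)

    # Pass a custom (e.g. fine-tuned) classifier instead of an arch name.
    custom = classification.mobilenet_v3_small_crop_orientation(pretrained=True)
    predictor = crop_orientation_predictor(arch=custom)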
doctr/models/detection/differentiable_binarization/tensorflow.py
@@ -10,12 +10,17 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import tensorflow as tf
-from tensorflow import keras
-from tensorflow.keras import layers
+from tensorflow.keras import Model, Sequential, layers, losses
 from tensorflow.keras.applications import ResNet50
 
 from doctr.file_utils import CLASS_NAME
-from doctr.models.utils import IntermediateLayerGetter, _bf16_to_float32, conv_sequence, load_pretrained_params
+from doctr.models.utils import (
+    IntermediateLayerGetter,
+    _bf16_to_float32,
+    _build_model,
+    conv_sequence,
+    load_pretrained_params,
+)
 from doctr.utils.repr import NestedObject
 
 from ...classification import mobilenet_v3_large
@@ -29,13 +34,13 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "mean": (0.798, 0.785, 0.772),
         "std": (0.264, 0.2749, 0.287),
         "input_shape": (1024, 1024, 3),
-        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet50-84171458.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_resnet50-649fa22b.weights.h5&src=0",
     },
     "db_mobilenet_v3_large": {
         "mean": (0.798, 0.785, 0.772),
         "std": (0.264, 0.2749, 0.287),
         "input_shape": (1024, 1024, 3),
-        "url": "https://doctr-static.mindee.com/models?id=v0.7.0/db_mobilenet_v3_large-da524564.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/db_mobilenet_v3_large-ee2e1dbe.weights.h5&src=0",
     },
 }
 
@@ -81,7 +86,7 @@ class FeaturePyramidNetwork(layers.Layer, NestedObject):
         if dilation_factor > 1:
             _layers.append(layers.UpSampling2D(size=(dilation_factor, dilation_factor), interpolation="nearest"))
 
-        module = keras.Sequential(_layers)
+        module = Sequential(_layers)
 
         return module
 
@@ -104,7 +109,7 @@ class FeaturePyramidNetwork(layers.Layer, NestedObject):
         return layers.concatenate(results)
 
 
-class DBNet(_DBNet, keras.Model, NestedObject):
+class DBNet(_DBNet, Model, NestedObject):
     """DBNet as described in `"Real-time Scene Text Detection with Differentiable Binarization"
     <https://arxiv.org/pdf/1911.08947.pdf>`_.
 
@@ -147,14 +152,14 @@ class DBNet(_DBNet, keras.Model, NestedObject):
         _inputs = [layers.Input(shape=in_shape[1:]) for in_shape in self.feat_extractor.output_shape]
         output_shape = tuple(self.fpn(_inputs).shape)
 
-        self.probability_head = keras.Sequential([
+        self.probability_head = Sequential([
             *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]),
             layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"),
             layers.BatchNormalization(),
             layers.Activation("relu"),
             layers.Conv2DTranspose(num_classes, 2, strides=2, kernel_initializer="he_normal"),
         ])
-        self.threshold_head = keras.Sequential([
+        self.threshold_head = Sequential([
             *conv_sequence(64, "relu", True, kernel_size=3, input_shape=output_shape[1:]),
             layers.Conv2DTranspose(64, 2, strides=2, use_bias=False, kernel_initializer="he_normal"),
             layers.BatchNormalization(),
@@ -206,7 +211,7 @@ class DBNet(_DBNet, keras.Model, NestedObject):
 
         # Focal loss
         focal_scale = 10.0
-        bce_loss = tf.keras.losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True)
+        bce_loss = losses.binary_crossentropy(seg_target[..., None], out_map[..., None], from_logits=True)
 
         # Convert logits to prob, compute gamma factor
         p_t = (seg_target * prob_map) + ((1 - seg_target) * (1 - prob_map))
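The hunk above touches only the `bce_loss` line; per the context comment, the gamma factor is computed just below, outside the diff. For reference, a textbook focal-loss completion consistent with these names (an illustrative sketch, not a quote of the unchanged code):

    # Down-weight easy examples by (1 - p_t) ** gamma, then rescale by focal_scale.
    gamma = 2.0
    focal_loss = focal_scale * tf.reduce_mean((1.0 - p_t) ** gamma * bce_loss)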
@@ -305,9 +310,16 @@ def _db_resnet(
 
     # Build the model
     model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, _cfg["url"])
+        # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model,
+            _cfg["url"],
+            skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]),
+        )
 
     return model
 
@@ -326,6 +338,10 @@ def _db_mobilenet(
     # Patch the config
     _cfg = deepcopy(default_cfgs[arch])
     _cfg["input_shape"] = input_shape or _cfg["input_shape"]
+    if not kwargs.get("class_names", None):
+        kwargs["class_names"] = default_cfgs[arch].get("class_names", [CLASS_NAME])
+    else:
+        kwargs["class_names"] = sorted(kwargs["class_names"])
 
     # Feature extractor
     feat_extractor = IntermediateLayerGetter(
@@ -339,9 +355,15 @@ def _db_mobilenet(
 
     # Build the model
     model = DBNet(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, _cfg["url"])
+        # The given class_names differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model,
+            _cfg["url"],
+            skip_mismatch=kwargs["class_names"] != default_cfgs[arch].get("class_names", [CLASS_NAME]),
+        )
 
     return model
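For the detection models the `skip_mismatch` test keys off `class_names` rather than `num_classes`, and user-supplied names are sorted first so that ordering is deterministic. A short usage sketch (the class names are illustrative):

    from doctr.models import detection

    # Default single class: the full checkpoint is restored.
    model = detection.db_mobilenet_v3_large(pretrained=True)

    # Custom classes: the head no longer matches, so its weights are skipped,
    # and sorting makes ["words", "artefacts"] equivalent to ["artefacts", "words"].
    model = detection.db_mobilenet_v3_large(pretrained=True, class_names=["artefacts", "words"])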