python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/__init__.py +1 -1
- doctr/contrib/__init__.py +0 -0
- doctr/contrib/artefacts.py +131 -0
- doctr/contrib/base.py +105 -0
- doctr/datasets/cord.py +10 -1
- doctr/datasets/datasets/pytorch.py +2 -2
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/generator/base.py +6 -5
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +11 -2
- doctr/datasets/loader.py +1 -6
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +9 -3
- doctr/datasets/vocabs.py +15 -4
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +45 -12
- doctr/io/elements.py +52 -10
- doctr/io/html.py +2 -2
- doctr/io/image/pytorch.py +6 -8
- doctr/io/image/tensorflow.py +1 -1
- doctr/io/pdf.py +5 -2
- doctr/io/reader.py +6 -0
- doctr/models/__init__.py +0 -1
- doctr/models/_utils.py +57 -20
- doctr/models/builder.py +73 -15
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +47 -9
- doctr/models/classification/mobilenet/tensorflow.py +51 -14
- doctr/models/classification/predictor/pytorch.py +28 -17
- doctr/models/classification/predictor/tensorflow.py +26 -16
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/pytorch.py +3 -3
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +55 -19
- doctr/models/detection/_utils/__init__.py +1 -0
- doctr/models/detection/_utils/base.py +66 -0
- doctr/models/detection/differentiable_binarization/base.py +4 -3
- doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/base.py +6 -5
- doctr/models/detection/fast/pytorch.py +4 -4
- doctr/models/detection/fast/tensorflow.py +15 -12
- doctr/models/detection/linknet/base.py +4 -3
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/pytorch.py +15 -1
- doctr/models/detection/predictor/tensorflow.py +17 -3
- doctr/models/detection/zoo.py +7 -2
- doctr/models/factory/hub.py +8 -18
- doctr/models/kie_predictor/base.py +13 -3
- doctr/models/kie_predictor/pytorch.py +45 -20
- doctr/models/kie_predictor/tensorflow.py +44 -17
- doctr/models/modules/layers/pytorch.py +2 -3
- doctr/models/modules/layers/tensorflow.py +6 -8
- doctr/models/modules/transformer/pytorch.py +2 -2
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +97 -58
- doctr/models/predictor/pytorch.py +35 -20
- doctr/models/predictor/tensorflow.py +35 -18
- doctr/models/preprocessor/pytorch.py +4 -4
- doctr/models/preprocessor/tensorflow.py +3 -2
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/pytorch.py +2 -2
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/pytorch.py +4 -3
- doctr/models/recognition/parseq/tensorflow.py +14 -11
- doctr/models/recognition/sar/pytorch.py +7 -6
- doctr/models/recognition/sar/tensorflow.py +10 -12
- doctr/models/recognition/vitstr/pytorch.py +1 -1
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/recognition/zoo.py +1 -1
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/models/zoo.py +2 -2
- doctr/py.typed +0 -0
- doctr/transforms/functional/base.py +1 -1
- doctr/transforms/functional/pytorch.py +5 -5
- doctr/transforms/modules/base.py +37 -15
- doctr/transforms/modules/pytorch.py +73 -14
- doctr/transforms/modules/tensorflow.py +78 -19
- doctr/utils/fonts.py +7 -5
- doctr/utils/geometry.py +141 -31
- doctr/utils/metrics.py +34 -175
- doctr/utils/reconstitution.py +212 -0
- doctr/utils/visualization.py +5 -118
- doctr/version.py +1 -1
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
- python_doctr-0.10.0.dist-info/RECORD +173 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- doctr/models/artefacts/__init__.py +0 -2
- doctr/models/artefacts/barcode.py +0 -74
- doctr/models/artefacts/face.py +0 -63
- doctr/models/obj_detection/__init__.py +0 -1
- doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
- python_doctr-0.8.1.dist-info/RECORD +0 -173
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/models/_utils.py
CHANGED
@@ -11,6 +11,8 @@ import cv2
 import numpy as np
 from langdetect import LangDetectException, detect_langs
 
+from doctr.utils.geometry import rotate_image
+
 __all__ = ["estimate_orientation", "get_language", "invert_data_structure"]
 
 
@@ -29,56 +31,91 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
     return max(w / h, h / w)
 
 
-def estimate_orientation(
+def estimate_orientation(
+    img: np.ndarray,
+    general_page_orientation: Optional[Tuple[int, float]] = None,
+    n_ct: int = 70,
+    ratio_threshold_for_lines: float = 3,
+    min_confidence: float = 0.2,
+    lower_area: int = 100,
+) -> int:
     """Estimate the angle of the general document orientation based on the
     lines of the document and the assumption that they should be horizontal.
 
     Args:
     ----
         img: the img or bitmap to analyze (H, W, C)
+        general_page_orientation: the general orientation of the page (angle [0, 90, 180, 270 (-90)], confidence)
+            estimated by a model
         n_ct: the number of contours used for the orientation estimation
         ratio_threshold_for_lines: this is the ratio w/h used to discriminates lines
+        min_confidence: the minimum confidence to consider the general_page_orientation
+        lower_area: the minimum area of a contour to be considered
 
     Returns:
     -------
-        the angle of the
+        the estimated angle of the page (clockwise, negative for left side rotation, positive for right side rotation)
     """
     assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"
-
-
-    if
-        thresh = img.astype(np.uint8)
-    if max_value <= 255 and min_value >= 0 and img.shape[-1] == 3:
+    thresh = None
+    # Convert image to grayscale if necessary
+    if img.shape[-1] == 3:
         gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
         gray_img = cv2.medianBlur(gray_img, 5)
-        thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
-
-    # try to merge words in lines
-    (h, w) = img.shape[:2]
-    k_x = max(1, (floor(w / 100)))
-    k_y = max(1, (floor(h / 100)))
-    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
-    thresh = cv2.dilate(thresh, kernel, iterations=1)
+        thresh = cv2.threshold(gray_img, thresh=0, maxval=255, type=cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+    else:
+        thresh = img.astype(np.uint8)  # type: ignore[assignment]
+
+    page_orientation, orientation_confidence = general_page_orientation or (None, 0.0)
+    if page_orientation and orientation_confidence >= min_confidence:
+        # We rotate the image to the general orientation which improves the detection
+        # No expand needed bitmap is already padded
+        thresh = rotate_image(thresh, -page_orientation)  # type: ignore
+    else:  # That's only required if we do not work on the detection models bin map
+        # try to merge words in lines
+        (h, w) = img.shape[:2]
+        k_x = max(1, (floor(w / 100)))
+        k_y = max(1, (floor(h / 100)))
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
+        thresh = cv2.dilate(thresh, kernel, iterations=1)
 
     # extract contours
     contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
-    # Sort contours
-    contours = sorted(
+    # Filter & Sort contours
+    contours = sorted(
+        [contour for contour in contours if cv2.contourArea(contour) > lower_area],
+        key=get_max_width_length_ratio,
+        reverse=True,
+    )
 
     angles = []
     for contour in contours[:n_ct]:
-        _, (w, h), angle = cv2.minAreaRect(contour)
+        _, (w, h), angle = cv2.minAreaRect(contour)  # type: ignore[assignment]
         if w / h > ratio_threshold_for_lines:  # select only contours with ratio like lines
             angles.append(angle)
         elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, substract 90 degree
             angles.append(angle - 90)
 
     if len(angles) == 0:
-
+        estimated_angle = 0  # in case no angles is found
     else:
         median = -median_low(angles)
-
+        estimated_angle = -round(median) if abs(median) != 0 else 0
+
+    # combine with the general orientation and the estimated angle
+    if page_orientation and orientation_confidence >= min_confidence:
+        # special case where the estimated angle is mostly wrong:
+        # case 1: - and + swapped
+        # case 2: estimated angle is completely wrong
+        # so in this case we prefer the general page orientation
+        if abs(estimated_angle) == abs(page_orientation):
+            return page_orientation
+        estimated_angle = estimated_angle if page_orientation == 0 else page_orientation + estimated_angle
+        if estimated_angle > 180:
+            estimated_angle -= 360
+
+    return estimated_angle  # return the clockwise angle (negative - left side rotation, positive - right side rotation)
 
 
 def rectify_crops(
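
Taken together, the hunk above turns `estimate_orientation` from a purely contour-based heuristic into one that can fold in a page-orientation prior predicted by a classification model. A minimal usage sketch, based only on the signature shown above; the synthetic image and the `(angle, confidence)` prior values are illustrative assumptions:

    import numpy as np

    from doctr.models._utils import estimate_orientation

    page = np.zeros((1024, 768, 3), dtype=np.uint8)  # placeholder (H, W, C) image

    # Contour-based estimate only, as before
    angle = estimate_orientation(page)

    # With a model prior: once its confidence reaches min_confidence, the prior
    # drives the estimate and the combined clockwise angle is returned
    angle = estimate_orientation(page, general_page_orientation=(90, 0.8), min_confidence=0.2)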
doctr/models/builder.py
CHANGED
@@ -31,7 +31,7 @@ class DocumentBuilder(NestedObject):
     def __init__(
         self,
         resolve_lines: bool = True,
-        resolve_blocks: bool = True,
+        resolve_blocks: bool = False,
         paragraph_break: float = 0.035,
         export_as_straight_boxes: bool = False,
     ) -> None:
@@ -220,13 +220,22 @@ class DocumentBuilder(NestedObject):
 
         return blocks
 
-    def _build_blocks(
+    def _build_blocks(
+        self,
+        boxes: np.ndarray,
+        objectness_scores: np.ndarray,
+        word_preds: List[Tuple[str, float]],
+        crop_orientations: List[Dict[str, Any]],
+    ) -> List[Block]:
         """Gather independent words in structured blocks
 
         Args:
         ----
-            boxes: bounding boxes of all detected words of the page, of shape (N,
+            boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
+            objectness_scores: objectness scores of all detected words of the page, of shape N
             word_preds: list of all detected words of the page, of shape N
+            crop_orientations: list of dictoinaries containing
+                the general orientation (orientations + confidences) of the crops
 
         Returns:
         -------
@@ -257,10 +266,17 @@ class DocumentBuilder(NestedObject):
                 Line([
                     Word(
                         *word_preds[idx],
-                        tuple(
+                        tuple(tuple(pt) for pt in boxes[idx].tolist()),  # type: ignore[arg-type]
+                        float(objectness_scores[idx]),
+                        crop_orientations[idx],
                     )
                     if boxes.ndim == 3
-                    else Word(
+                    else Word(
+                        *word_preds[idx],
+                        ((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])),
+                        float(objectness_scores[idx]),
+                        crop_orientations[idx],
+                    )
                     for idx in line
                 ])
                 for line in lines
@@ -281,8 +297,10 @@ class DocumentBuilder(NestedObject):
         self,
         pages: List[np.ndarray],
         boxes: List[np.ndarray],
+        objectness_scores: List[np.ndarray],
         text_preds: List[List[Tuple[str, float]]],
         page_shapes: List[Tuple[int, int]],
+        crop_orientations: List[Dict[str, Any]],
         orientations: Optional[List[Dict[str, Any]]] = None,
         languages: Optional[List[Dict[str, Any]]] = None,
     ) -> Document:
@@ -291,10 +309,13 @@ class DocumentBuilder(NestedObject):
         Args:
         ----
             pages: list of N elements, where each element represents the page image
-            boxes: list of N elements, where each element represents the localization predictions, of shape (*,
-                or (*,
+            boxes: list of N elements, where each element represents the localization predictions, of shape (*, 4)
+                or (*, 4, 2) for all words for a given page
+            objectness_scores: list of N elements, where each element represents the objectness scores
             text_preds: list of N elements, where each element is the list of all word prediction (text + confidence)
             page_shapes: shape of each page, of size N
+            crop_orientations: list of N elements, where each element is
+                a dictionary containing the general orientation (orientations + confidences) of the crops
             orientations: optional, list of N elements,
                 where each element is a dictionary containing the orientation (orientation + confidence)
             languages: optional, list of N elements,
@@ -304,7 +325,9 @@ class DocumentBuilder(NestedObject):
         -------
             document object
         """
-        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
+        if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
+            page_shapes
+        ) != len(crop_orientations) != len(objectness_scores):
             raise ValueError("All arguments are expected to be lists of the same size")
 
         _orientations = (
@@ -322,15 +345,25 @@
                 page,
                 self._build_blocks(
                     page_boxes,
+                    loc_scores,
                     word_preds,
+                    word_crop_orientations,
                 ),
                 _idx,
                 shape,
                 orientation,
                 language,
             )
-            for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
-                pages,
+            for page, _idx, shape, page_boxes, loc_scores, word_preds, word_crop_orientations, orientation, language in zip(  # noqa: E501
+                pages,
+                range(len(boxes)),
+                page_shapes,
+                boxes,
+                objectness_scores,
+                text_preds,
+                crop_orientations,
+                _orientations,
+                _languages,
             )
         ]
 
@@ -353,8 +386,10 @@ class KIEDocumentBuilder(DocumentBuilder):
         self,
         pages: List[np.ndarray],
         boxes: List[Dict[str, np.ndarray]],
+        objectness_scores: List[Dict[str, np.ndarray]],
         text_preds: List[Dict[str, List[Tuple[str, float]]]],
         page_shapes: List[Tuple[int, int]],
+        crop_orientations: List[Dict[str, List[Dict[str, Any]]]],
         orientations: Optional[List[Dict[str, Any]]] = None,
         languages: Optional[List[Dict[str, Any]]] = None,
     ) -> KIEDocument:
@@ -365,8 +400,11 @@
             pages: list of N elements, where each element represents the page image
             boxes: list of N dictionaries, where each element represents the localization predictions for a class,
                 of shape (*, 5) or (*, 6) for all predictions
+            objectness_scores: list of N dictionaries, where each element represents the objectness scores for a class
             text_preds: list of N dictionaries, where each element is the list of all word prediction
             page_shapes: shape of each page, of size N
+            crop_orientations: list of N dictonaries, where each element is
+                a list containing the general crop orientations (orientations + confidences) of the crops
             orientations: optional, list of N elements,
                 where each element is a dictionary containing the orientation (orientation + confidence)
             languages: optional, list of N elements,
@@ -376,7 +414,9 @@
         -------
             document object
         """
-        if len(boxes) != len(text_preds) or len(boxes) != len(page_shapes):
+        if len(boxes) != len(text_preds) != len(crop_orientations) != len(objectness_scores) or len(boxes) != len(
+            page_shapes
+        ) != len(crop_orientations) != len(objectness_scores):
             raise ValueError("All arguments are expected to be lists of the same size")
         _orientations = (
             orientations if isinstance(orientations, list) else [None] * len(boxes)  # type: ignore[list-item]
@@ -401,7 +441,9 @@
                 {
                     k: self._build_blocks(
                         page_boxes[k],
+                        loc_scores[k],
                         word_preds[k],
+                        word_crop_orientations[k],
                     )
                     for k in page_boxes.keys()
                 },
@@ -410,8 +452,16 @@
                 orientation,
                 language,
             )
-            for page, _idx, shape, page_boxes, word_preds, orientation, language in zip(
-                pages,
+            for page, _idx, shape, page_boxes, loc_scores, word_preds, word_crop_orientations, orientation, language in zip(  # noqa: E501
+                pages,
+                range(len(boxes)),
+                page_shapes,
+                boxes,
+                objectness_scores,
+                text_preds,
+                crop_orientations,
+                _orientations,
+                _languages,
             )
         ]
 
@@ -420,14 +470,18 @@
     def _build_blocks(  # type: ignore[override]
         self,
         boxes: np.ndarray,
+        objectness_scores: np.ndarray,
         word_preds: List[Tuple[str, float]],
+        crop_orientations: List[Dict[str, Any]],
     ) -> List[Prediction]:
         """Gather independent words in structured blocks
 
         Args:
         ----
-            boxes: bounding boxes of all detected words of the page, of shape (N,
+            boxes: bounding boxes of all detected words of the page, of shape (N, 4) or (N, 4, 2)
+            objectness_scores: objectness scores of all detected words of the page
             word_preds: list of all detected words of the page, of shape N
+            crop_orientations: list of orientations for each word crop
 
         Returns:
         -------
@@ -446,13 +500,17 @@
             Prediction(
                 value=word_preds[idx][0],
                 confidence=word_preds[idx][1],
-                geometry=tuple(
+                geometry=tuple(tuple(pt) for pt in boxes[idx].tolist()),  # type: ignore[arg-type]
+                objectness_score=float(objectness_scores[idx]),
+                crop_orientation=crop_orientations[idx],
             )
             if boxes.ndim == 3
             else Prediction(
                 value=word_preds[idx][0],
                 confidence=word_preds[idx][1],
                 geometry=((boxes[idx, 0], boxes[idx, 1]), (boxes[idx, 2], boxes[idx, 3])),
+                objectness_score=float(objectness_scores[idx]),
+                crop_orientation=crop_orientations[idx],
            )
             for idx in idxs
         ]
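
`DocumentBuilder.__call__` and `KIEDocumentBuilder.__call__` now thread two extra per-page lists, `objectness_scores` and `crop_orientations`, through `_build_blocks` into every `Word`/`Prediction`, and `resolve_blocks` defaults to `False`. A hedged sketch of the new calling convention; the argument order comes from the diff, while the relative box coordinates and the `{"value", "confidence"}` layout of the orientation dicts are assumptions:

    import numpy as np

    from doctr.models.builder import DocumentBuilder

    builder = DocumentBuilder()  # resolve_blocks is now False by default

    pages = [np.zeros((1024, 768, 3), dtype=np.uint8)]
    boxes = [np.array([[0.1, 0.1, 0.4, 0.2]])]                 # (*, 4) relative coords (assumed)
    objectness_scores = [np.array([0.99])]                     # new: one score per box
    text_preds = [[("hello", 0.95)]]
    page_shapes = [(1024, 768)]
    crop_orientations = [[{"value": 0, "confidence": None}]]   # new: one dict per box (layout assumed)

    doc = builder(pages, boxes, objectness_scores, text_preds, page_shapes, crop_orientations)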
doctr/models/classification/magc_resnet/tensorflow.py
CHANGED
@@ -9,12 +9,12 @@ from functools import partial
 from typing import Any, Dict, List, Optional, Tuple
 
 import tensorflow as tf
-from tensorflow.keras import layers
+from tensorflow.keras import activations, layers
 from tensorflow.keras.models import Sequential
 
 from doctr.datasets import VOCABS
 
-from ...utils import load_pretrained_params
+from ...utils import _build_model, load_pretrained_params
 from ..resnet.tensorflow import ResNet
 
 __all__ = ["magc_resnet31"]
@@ -26,7 +26,7 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/magc_resnet31-16aa7d71.weights.h5&src=0",
     },
 }
 
@@ -57,6 +57,7 @@ class MAGC(layers.Layer):
         self.headers = headers  # h
         self.inplanes = inplanes  # C
         self.attn_scale = attn_scale
+        self.ratio = ratio
         self.planes = int(inplanes * ratio)
 
         self.single_header_inplanes = int(inplanes / headers)  # C / h
@@ -97,7 +98,7 @@ class MAGC(layers.Layer):
         if self.attn_scale and self.headers > 1:
             context_mask = context_mask / math.sqrt(self.single_header_inplanes)
         # B*h, 1, H*W, 1
-        context_mask =
+        context_mask = activations.softmax(context_mask, axis=2)
 
         # Compute context
         # B*h, 1, C/h, 1
@@ -114,7 +115,7 @@ class MAGC(layers.Layer):
         # Context modeling: B, H, W, C -> B, 1, 1, C
         context = self.context_modeling(inputs)
         # Transform: B, 1, 1, C -> B, 1, 1, C
-        transformed = self.transform(context)
+        transformed = self.transform(context, **kwargs)
         return inputs + transformed
 
 
@@ -151,9 +152,15 @@ def _magc_resnet(
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
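
Two patterns recur across the TensorFlow classification files in this release: models are built eagerly right after instantiation (`_build_model`), and pretrained weights are loaded with `skip_mismatch` whenever the requested `num_classes` differs from the checkpoint's head, so the classifier can be re-initialized for fine-tuning. A sketch of what that enables, assuming `num_classes` is forwarded through `**kwargs` as in the other doctr classification factories:

    from doctr.models import magc_resnet31

    # Same head as the checkpoint: all weights are loaded
    model = magc_resnet31(pretrained=True)

    # Custom head: the mismatching classifier layer is skipped on load,
    # leaving a freshly initialized head to fine-tune
    model = magc_resnet31(pretrained=True, num_classes=10)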
doctr/models/classification/mobilenet/pytorch.py
CHANGED
@@ -9,17 +9,20 @@ from copy import deepcopy
 from typing import Any, Dict, List, Optional
 
 from torchvision.models import mobilenetv3
+from torchvision.models.mobilenetv3 import MobileNetV3
 
 from doctr.datasets import VOCABS
 
 from ...utils import load_pretrained_params
 
 __all__ = [
+    "MobileNetV3",
     "mobilenet_v3_small",
     "mobilenet_v3_small_r",
     "mobilenet_v3_large",
     "mobilenet_v3_large_r",
-    "mobilenet_v3_small_orientation",
+    "mobilenet_v3_small_crop_orientation",
+    "mobilenet_v3_small_page_orientation",
 ]
 
 default_cfgs: Dict[str, Dict[str, Any]] = {
@@ -51,12 +54,19 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "classes": list(VOCABS["french"]),
         "url": "https://doctr-static.mindee.com/models?id=v0.4.1/mobilenet_v3_small_r-1a8a3530.pt&src=0",
     },
-    "mobilenet_v3_small_orientation": {
+    "mobilenet_v3_small_crop_orientation": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
-        "input_shape": (3,
-        "classes": [0, 90, 180,
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "input_shape": (3, 256, 256),
+        "classes": [0, -90, 180, 90],
+        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_crop_orientation-f0847a18.pt&src=0",
+    },
+    "mobilenet_v3_small_page_orientation": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (3, 512, 512),
+        "classes": [0, -90, 180, 90],
+        "url": "https://doctr-static.mindee.com/models?id=v0.8.1/mobilenet_v3_small_page_orientation-8e60325c.pt&src=0",
     },
 }
 
@@ -212,14 +222,42 @@ def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> mobilenetv3
     )
 
 
-def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> mobilenetv3.MobileNetV3:
+def mobilenet_v3_small_crop_orientation(pretrained: bool = False, **kwargs: Any) -> mobilenetv3.MobileNetV3:
+    """MobileNetV3-Small architecture as described in
+    `"Searching for MobileNetV3",
+    <https://arxiv.org/pdf/1905.02244.pdf>`_.
+
+    >>> import torch
+    >>> from doctr.models import mobilenet_v3_small_crop_orientation
+    >>> model = mobilenet_v3_small_crop_orientation(pretrained=False)
+    >>> input_tensor = torch.rand((1, 3, 512, 512), dtype=torch.float32)
+    >>> out = model(input_tensor)
+
+    Args:
+    ----
+        pretrained: boolean, True if model is pretrained
+        **kwargs: keyword arguments of the MobileNetV3 architecture
+
+    Returns:
+    -------
+        a torch.nn.Module
+    """
+    return _mobilenet_v3(
+        "mobilenet_v3_small_crop_orientation",
+        pretrained,
+        ignore_keys=["classifier.3.weight", "classifier.3.bias"],
+        **kwargs,
+    )
+
+
+def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any) -> mobilenetv3.MobileNetV3:
     """MobileNetV3-Small architecture as described in
     `"Searching for MobileNetV3",
     <https://arxiv.org/pdf/1905.02244.pdf>`_.
 
     >>> import torch
-    >>> from doctr.models import mobilenet_v3_small_orientation
-    >>> model = mobilenet_v3_small_orientation(pretrained=False)
+    >>> from doctr.models import mobilenet_v3_small_page_orientation
+    >>> model = mobilenet_v3_small_page_orientation(pretrained=False)
     >>> input_tensor = torch.rand((1, 3, 512, 512), dtype=torch.float32)
     >>> out = model(input_tensor)
 
@@ -233,7 +271,7 @@ def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> m
         a torch.nn.Module
     """
     return _mobilenet_v3(
-        "mobilenet_v3_small_orientation",
+        "mobilenet_v3_small_page_orientation",
         pretrained,
         ignore_keys=["classifier.3.weight", "classifier.3.bias"],
         **kwargs,
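
The single `mobilenet_v3_small_orientation` model from 0.8.1 is split into a crop-level and a page-level classifier, both predicting the classes `[0, -90, 180, 90]`. A migration sketch for the PyTorch backend, using only names from the diff:

    # 0.8.1 (removed)
    # from doctr.models import mobilenet_v3_small_orientation

    # 0.10.0: one model per granularity
    from doctr.models import (
        mobilenet_v3_small_crop_orientation,  # trained on (3, 256, 256) crops
        mobilenet_v3_small_page_orientation,  # trained on (3, 512, 512) pages
    )

    crop_clf = mobilenet_v3_small_crop_orientation(pretrained=False)
    page_clf = mobilenet_v3_small_page_orientation(pretrained=False)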
doctr/models/classification/mobilenet/tensorflow.py
CHANGED
@@ -13,7 +13,7 @@ from tensorflow.keras import layers
 from tensorflow.keras.models import Sequential
 
 from ....datasets import VOCABS
-from ...utils import conv_sequence, load_pretrained_params
+from ...utils import _build_model, conv_sequence, load_pretrained_params
 
 __all__ = [
     "MobileNetV3",
@@ -21,7 +21,8 @@ __all__ = [
     "mobilenet_v3_small_r",
     "mobilenet_v3_large",
     "mobilenet_v3_large_r",
-    "mobilenet_v3_small_orientation",
+    "mobilenet_v3_small_crop_orientation",
+    "mobilenet_v3_small_page_orientation",
 ]
 
 
@@ -31,35 +32,42 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large-d857506e.weights.h5&src=0",
     },
     "mobilenet_v3_large_r": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_large_r-eef2e3c6.weights.h5&src=0",
     },
     "mobilenet_v3_small": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small-3fcebad7.weights.h5&src=0",
     },
     "mobilenet_v3_small_r": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 32, 3),
         "classes": list(VOCABS["french"]),
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_r-dd50218d.weights.h5&src=0",
     },
-    "mobilenet_v3_small_orientation": {
+    "mobilenet_v3_small_crop_orientation": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (128, 128, 3),
-        "classes": [0, 90, 180,
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "classes": [0, -90, 180, 90],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_crop_orientation-ef019b6b.weights.h5&src=0",
+    },
+    "mobilenet_v3_small_page_orientation": {
+        "mean": (0.694, 0.695, 0.693),
+        "std": (0.299, 0.296, 0.301),
+        "input_shape": (512, 512, 3),
+        "classes": [0, -90, 180, 90],
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/mobilenet_v3_small_page_orientation-0071d55d.weights.h5&src=0",
     },
 }
 
@@ -287,9 +295,15 @@ def _mobilenet_v3(arch: str, pretrained: bool, rect_strides: bool = False, **kwa
         cfg=_cfg,
         **kwargs,
     )
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The number of classes is not the same as the number of classes in the pretrained model =>
+        # skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["num_classes"] != len(default_cfgs[arch]["classes"])
+        )
 
     return model
 
@@ -386,14 +400,37 @@ def mobilenet_v3_large_r(pretrained: bool = False, **kwargs: Any) -> MobileNetV3
     return _mobilenet_v3("mobilenet_v3_large_r", pretrained, True, **kwargs)
 
 
-def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
+def mobilenet_v3_small_crop_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
+    """MobileNetV3-Small architecture as described in
+    `"Searching for MobileNetV3",
+    <https://arxiv.org/pdf/1905.02244.pdf>`_.
+
+    >>> import tensorflow as tf
+    >>> from doctr.models import mobilenet_v3_small_crop_orientation
+    >>> model = mobilenet_v3_small_crop_orientation(pretrained=False)
+    >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
+    >>> out = model(input_tensor)
+
+    Args:
+    ----
+        pretrained: boolean, True if model is pretrained
+        **kwargs: keyword arguments of the MobileNetV3 architecture
+
+    Returns:
+    -------
+        a keras.Model
+    """
+    return _mobilenet_v3("mobilenet_v3_small_crop_orientation", pretrained, include_top=True, **kwargs)
+
+
+def mobilenet_v3_small_page_orientation(pretrained: bool = False, **kwargs: Any) -> MobileNetV3:
     """MobileNetV3-Small architecture as described in
     `"Searching for MobileNetV3",
     <https://arxiv.org/pdf/1905.02244.pdf>`_.
 
     >>> import tensorflow as tf
-    >>> from doctr.models import mobilenet_v3_small_orientation
-    >>> model = mobilenet_v3_small_orientation(pretrained=False)
+    >>> from doctr.models import mobilenet_v3_small_page_orientation
+    >>> model = mobilenet_v3_small_page_orientation(pretrained=False)
     >>> input_tensor = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
     >>> out = model(input_tensor)
 
@@ -406,4 +443,4 @@ def mobilenet_v3_small_orientation(pretrained: bool = False, **kwargs: Any) -> M
     -------
         a keras.Model
     """
-    return _mobilenet_v3("mobilenet_v3_small_orientation", pretrained, include_top=True, **kwargs)
+    return _mobilenet_v3("mobilenet_v3_small_page_orientation", pretrained, include_top=True, **kwargs)
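
The TensorFlow backend mirrors the same split with NHWC input shapes ((128, 128, 3) for crops, (512, 512, 3) for pages). A short sketch based on the doctest above; the expected output shape is an inference from the four orientation classes and `include_top=True`, not something stated in the diff:

    import tensorflow as tf

    from doctr.models import mobilenet_v3_small_page_orientation

    model = mobilenet_v3_small_page_orientation(pretrained=False)
    x = tf.random.uniform(shape=[1, 512, 512, 3], maxval=1, dtype=tf.float32)
    out = model(x)  # expected: shape (1, 4), one score per class in [0, -90, 180, 90]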