python-doctr 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/datasets/cord.py +10 -1
- doctr/datasets/funsd.py +11 -1
- doctr/datasets/ic03.py +11 -1
- doctr/datasets/ic13.py +10 -1
- doctr/datasets/iiit5k.py +26 -16
- doctr/datasets/imgur5k.py +10 -1
- doctr/datasets/sroie.py +11 -1
- doctr/datasets/svhn.py +11 -1
- doctr/datasets/svt.py +11 -1
- doctr/datasets/synthtext.py +11 -1
- doctr/datasets/utils.py +7 -2
- doctr/datasets/vocabs.py +6 -2
- doctr/datasets/wildreceipt.py +12 -1
- doctr/file_utils.py +19 -0
- doctr/io/elements.py +12 -4
- doctr/models/builder.py +2 -2
- doctr/models/classification/magc_resnet/tensorflow.py +13 -6
- doctr/models/classification/mobilenet/pytorch.py +2 -0
- doctr/models/classification/mobilenet/tensorflow.py +14 -8
- doctr/models/classification/predictor/pytorch.py +11 -7
- doctr/models/classification/predictor/tensorflow.py +10 -6
- doctr/models/classification/resnet/tensorflow.py +21 -8
- doctr/models/classification/textnet/tensorflow.py +11 -5
- doctr/models/classification/vgg/tensorflow.py +9 -3
- doctr/models/classification/vit/tensorflow.py +10 -4
- doctr/models/classification/zoo.py +22 -10
- doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
- doctr/models/detection/fast/tensorflow.py +14 -11
- doctr/models/detection/linknet/tensorflow.py +23 -11
- doctr/models/detection/predictor/tensorflow.py +2 -2
- doctr/models/factory/hub.py +5 -6
- doctr/models/kie_predictor/base.py +4 -0
- doctr/models/kie_predictor/pytorch.py +4 -0
- doctr/models/kie_predictor/tensorflow.py +8 -1
- doctr/models/modules/transformer/tensorflow.py +0 -2
- doctr/models/modules/vision_transformer/pytorch.py +1 -1
- doctr/models/modules/vision_transformer/tensorflow.py +1 -1
- doctr/models/predictor/base.py +24 -12
- doctr/models/predictor/pytorch.py +4 -0
- doctr/models/predictor/tensorflow.py +8 -1
- doctr/models/preprocessor/tensorflow.py +1 -1
- doctr/models/recognition/crnn/tensorflow.py +8 -6
- doctr/models/recognition/master/tensorflow.py +9 -4
- doctr/models/recognition/parseq/tensorflow.py +10 -8
- doctr/models/recognition/sar/tensorflow.py +7 -3
- doctr/models/recognition/vitstr/tensorflow.py +9 -4
- doctr/models/utils/pytorch.py +1 -1
- doctr/models/utils/tensorflow.py +15 -15
- doctr/transforms/functional/pytorch.py +1 -1
- doctr/transforms/modules/pytorch.py +7 -6
- doctr/transforms/modules/tensorflow.py +15 -12
- doctr/utils/geometry.py +106 -19
- doctr/utils/metrics.py +1 -1
- doctr/utils/reconstitution.py +151 -65
- doctr/version.py +1 -1
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/METADATA +11 -11
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/RECORD +61 -61
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/models/recognition/vitstr/tensorflow.py
CHANGED

@@ -12,7 +12,7 @@ from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS

 from ...classification import vit_b, vit_s
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _ViTSTR, _ViTSTRPostProcessor

 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]

@@ -23,14 +23,14 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_small-d28b8d92.weights.h5&src=0",
     },
     "vitstr_base": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_base-9ad6eb84.weights.h5&src=0",
     },
 }


@@ -216,9 +216,14 @@ def _vitstr(

     # Build the model
     model = ViTSTR(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
     if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )

     return model
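
As a rough usage sketch of what this factory change enables (the vocab string below is made up; `vitstr_small` and the `vocab` keyword are existing doctr names):

# Sketch only: fine-tuning a TF recognition model with a custom vocab.
from doctr.models import vitstr_small

# A vocab that differs from the pretrained French vocab now makes the factory call
# load_pretrained_params(..., skip_mismatch=True), so only the mismatching head is
# left randomly initialized instead of the whole weight load failing.
model = vitstr_small(pretrained=True, vocab="0123456789")  # illustrative vocab
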
doctr/models/utils/pytorch.py
CHANGED

@@ -157,7 +157,7 @@ def export_model_to_onnx(model: nn.Module, model_name: str, dummy_input: torch.T
     """
     torch.onnx.export(
         model,
-        dummy_input,
+        dummy_input,  # type: ignore[arg-type]
         f"{model_name}.onnx",
         input_names=["input"],
         output_names=["logits"],
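
For reference, a minimal sketch of how this helper is typically driven (the model choice, the `exportable` flag, and the dummy-input shape are assumptions, not taken from the diff):

import torch
from doctr.models import vitstr_small
from doctr.models.utils import export_model_to_onnx

model = vitstr_small(pretrained=True, exportable=True).eval()
dummy_input = torch.rand((1, 3, 32, 128), dtype=torch.float32)  # (N, C, H, W) for a 32x128 crop
model_path = export_model_to_onnx(model, model_name="vitstr_small", dummy_input=dummy_input)
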
doctr/models/utils/tensorflow.py
CHANGED

@@ -4,9 +4,7 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

 import logging
-import os
 from typing import Any, Callable, List, Optional, Tuple, Union
-from zipfile import ZipFile

 import tensorflow as tf
 import tf2onnx

@@ -19,6 +17,7 @@ logging.getLogger("tensorflow").setLevel(logging.DEBUG)

 __all__ = [
     "load_pretrained_params",
+    "_build_model",
     "conv_sequence",
     "IntermediateLayerGetter",
     "export_model_to_onnx",

@@ -36,41 +35,42 @@ def _bf16_to_float32(x: tf.Tensor) -> tf.Tensor:
     return tf.cast(x, tf.float32) if x.dtype == tf.bfloat16 else x


+def _build_model(model: Model):
+    """Build a model by calling it once with dummy input
+
+    Args:
+    ----
+        model: the model to be built
+    """
+    model(tf.zeros((1, *model.cfg["input_shape"])), training=False)
+
+
 def load_pretrained_params(
     model: Model,
     url: Optional[str] = None,
     hash_prefix: Optional[str] = None,
-    overwrite: bool = False,
-    internal_name: str = "weights",
+    skip_mismatch: bool = False,
     **kwargs: Any,
 ) -> None:
     """Load a set of parameters onto a model

     >>> from doctr.models import load_pretrained_params
-    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.zip")
+    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.weights.h5")

     Args:
     ----
         model: the keras model to be loaded
         url: URL of the zipped set of parameters
         hash_prefix: first characters of SHA256 expected hash
-        overwrite: should the zip extraction be enforced if the archive has already been extracted
-        internal_name: name of the ckpt files
+        skip_mismatch: skip loading layers with mismatched shapes
         **kwargs: additional arguments to be passed to `doctr.utils.data.download_from_url`
     """
     if url is None:
         logging.warning("Invalid model URL, using default initialization.")
     else:
         archive_path = download_from_url(url, hash_prefix=hash_prefix, cache_subdir="models", **kwargs)
-
-        # Unzip the archive
-        params_path = archive_path.parent.joinpath(archive_path.stem)
-        if not params_path.is_dir() or overwrite:
-            with ZipFile(archive_path, "r") as f:
-                f.extractall(path=params_path)
-
         # Load weights
-        model.load_weights(params_path / internal_name)
+        model.load_weights(archive_path, skip_mismatch=skip_mismatch)


 def conv_sequence(
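
A minimal sketch of the new loading flow (TensorFlow backend assumed; the checkpoint URL below is a placeholder, not a published doctr artifact):

from doctr.models import vitstr_small
from doctr.models.utils import _build_model, load_pretrained_params

model = vitstr_small(pretrained=False)
_build_model(model)  # Keras needs a built model before a .weights.h5 file can be loaded
load_pretrained_params(
    model,
    url="https://example.com/my-checkpoint-0123abcd.weights.h5",  # placeholder URL
    skip_mismatch=True,  # e.g. when the classification head has a different vocab size
)
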
doctr/transforms/functional/pytorch.py
CHANGED

@@ -89,7 +89,7 @@ def rotate_sample(
     rotated_geoms[..., 0] = rotated_geoms[..., 0] / rotated_img.shape[2]
     rotated_geoms[..., 1] = rotated_geoms[..., 1] / rotated_img.shape[1]

-    return rotated_img, np.clip(rotated_geoms, 0, 1)
+    return rotated_img, np.clip(np.around(rotated_geoms, decimals=15), 0, 1)


 def crop_detection(
doctr/transforms/modules/pytorch.py
CHANGED

@@ -74,16 +74,18 @@ class Resize(T.Resize):
             if self.symmetric_pad:
                 half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2))
                 _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1])
+            # Pad image
             img = pad(img, _pad)

         # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
         if target is not None:
+            if self.symmetric_pad:
+                offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
+
             if self.preserve_aspect_ratio:
                 # Get absolute coords
                 if target.shape[1:] == (4,):
                     if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                         target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1]
                         target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2]
                     else:

@@ -91,16 +93,15 @@ class Resize(T.Resize):
                         target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2]
                 elif target.shape[1:] == (4, 2):
                     if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                         target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1]
                         target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2]
                     else:
                         target[..., 0] *= raw_shape[-1] / img.shape[-1]
                         target[..., 1] *= raw_shape[-2] / img.shape[-2]
                 else:
-                    raise AssertionError
-            return img, np.clip(target, 0, 1)
+                    raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+            return img, np.clip(target, 0, 1)

         return img

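A sketch of how the updated transform behaves (sizes and box values are illustrative; boxes are expected in relative coordinates since the `np.max(target) <= 1` guard is gone):

import numpy as np
import torch
from doctr.transforms import Resize

transform = Resize((1024, 1024), preserve_aspect_ratio=True, symmetric_pad=True)
img = torch.rand(3, 512, 1024)  # (C, H, W), wider than tall so vertical padding is added
boxes = np.array([[0.1, 0.2, 0.5, 0.6]], dtype=np.float32)  # relative (xmin, ymin, xmax, ymax)

out_img, out_boxes = transform(img, boxes)  # boxes shifted by the symmetric-pad offset, clipped to [0, 1]
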
doctr/transforms/modules/tensorflow.py
CHANGED

@@ -107,29 +107,34 @@ class Resize(NestedObject):
         target: Optional[np.ndarray] = None,
     ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]:
         input_dtype = img.dtype
+        self.output_size = (
+            (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size
+        )

         img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias)
         # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio
         raw_shape = img.shape[:2]
+        if self.symmetric_pad:
+            half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0)
         if self.preserve_aspect_ratio:
             if isinstance(self.output_size, (tuple, list)):
                 # In that case we need to pad because we want to enforce both width and height
                 if not self.symmetric_pad:
-                    offset = (0, 0)
+                    half_pad = (0, 0)
                 elif self.output_size[0] == img.shape[0]:
-                    offset = (0, int((self.output_size[1] - img.shape[1]) / 2))
-                else:
-                    offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
-                img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
+                    half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2))
+                # Pad image
+                img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size)

         # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
         if target is not None:
+            if self.symmetric_pad:
+                offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1]
+
             if self.preserve_aspect_ratio:
                 # Get absolute coords
                 if target.shape[1:] == (4,):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1]
                         target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0]
                     else:

@@ -137,16 +142,15 @@ class Resize(NestedObject):
                         target[:, [1, 3]] *= raw_shape[0] / img.shape[0]
                 elif target.shape[1:] == (4, 2):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1]
                         target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0]
                     else:
                         target[..., 0] *= raw_shape[1] / img.shape[1]
                         target[..., 1] *= raw_shape[0] / img.shape[0]
                 else:
-                    raise AssertionError
-            return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1)
+                    raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+            return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1)

         return tf.cast(img, dtype=input_dtype)


@@ -395,7 +399,6 @@ class GaussianBlur(NestedObject):
     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"

-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
doctr/utils/geometry.py
CHANGED

@@ -20,6 +20,7 @@ __all__ = [
     "rotate_boxes",
     "compute_expanded_shape",
     "rotate_image",
+    "remove_image_padding",
     "estimate_page_angle",
     "convert_to_relative_coords",
     "rotate_abs_geoms",

@@ -351,6 +352,26 @@ def rotate_image(
     return rot_img


+def remove_image_padding(image: np.ndarray) -> np.ndarray:
+    """Remove black border padding from an image
+
+    Args:
+    ----
+        image: numpy tensor to remove padding from
+
+    Returns:
+    -------
+        Image with padding removed
+    """
+    # Find the bounding box of the non-black region
+    rows = np.any(image, axis=1)
+    cols = np.any(image, axis=0)
+    rmin, rmax = np.where(rows)[0][[0, -1]]
+    cmin, cmax = np.where(cols)[0][[0, -1]]
+
+    return image[rmin : rmax + 1, cmin : cmax + 1]
+
+
 def estimate_page_angle(polys: np.ndarray) -> float:
     """Takes a batch of rotated previously ORIENTED polys (N, 4, 2) (rectified by the classifier) and return the
     estimated angle ccw in degrees

@@ -431,7 +452,7 @@ def extract_crops(img: np.ndarray, boxes: np.ndarray, channels_last: bool = True


 def extract_rcrops(
-    img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True
+    img: np.ndarray, polys: np.ndarray, dtype=np.float32, channels_last: bool = True, assume_horizontal: bool = False
 ) -> List[np.ndarray]:
     """Created cropped images from list of rotated bounding boxes


@@ -441,6 +462,7 @@ def extract_rcrops(
         polys: bounding boxes of shape (N, 4, 2)
         dtype: target data type of bounding boxes
         channels_last: whether the channel dimensions is the last one instead of the last one
+        assume_horizontal: whether the boxes are assumed to be only horizontally oriented

     Returns:
     -------

@@ -458,22 +480,87 @@ def extract_rcrops(
         _boxes[:, :, 0] *= width
         _boxes[:, :, 1] *= height

-    src_pts = _boxes[:, :3].astype(np.float32)
-    # Preserve size
-    d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
-    d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
-    # (N, 3, 2)
-    dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
-    dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
-    dst_pts[:, 2, 1] = d2 - 1
-    # Use a warp transformation to extract the crop
-    crops = [
-        cv2.warpAffine(
-            img if channels_last else img.transpose(1, 2, 0),
-            # Transformation matrix
-            cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
-            (int(d1[idx]), int(d2[idx])),
-        )
-        for idx in range(_boxes.shape[0])
-    ]
+    src_img = img if channels_last else img.transpose(1, 2, 0)
+
+    # Handle only horizontal oriented boxes
+    if assume_horizontal:
+        crops = []
+
+        for box in _boxes:
+            # Calculate the centroid of the quadrilateral
+            centroid = np.mean(box, axis=0)
+
+            # Divide the points into left and right
+            left_points = box[box[:, 0] < centroid[0]]
+            right_points = box[box[:, 0] >= centroid[0]]
+
+            # Sort the left points according to the y-axis
+            left_points = left_points[np.argsort(left_points[:, 1])]
+            top_left_pt = left_points[0]
+            bottom_left_pt = left_points[-1]
+            # Sort the right points according to the y-axis
+            right_points = right_points[np.argsort(right_points[:, 1])]
+            top_right_pt = right_points[0]
+            bottom_right_pt = right_points[-1]
+            box_points = np.array(
+                [top_left_pt, bottom_left_pt, top_right_pt, bottom_right_pt],
+                dtype=dtype,
+            )
+
+            # Get the width and height of the rectangle that will contain the warped quadrilateral
+            width_upper = np.linalg.norm(top_right_pt - top_left_pt)
+            width_lower = np.linalg.norm(bottom_right_pt - bottom_left_pt)
+            height_left = np.linalg.norm(bottom_left_pt - top_left_pt)
+            height_right = np.linalg.norm(bottom_right_pt - top_right_pt)
+
+            # Get the maximum width and height
+            rect_width = max(int(width_upper), int(width_lower))
+            rect_height = max(int(height_left), int(height_right))
+
+            dst_pts = np.array(
+                [
+                    [0, 0],  # top-left
+                    # bottom-left
+                    [0, rect_height - 1],
+                    # top-right
+                    [rect_width - 1, 0],
+                    # bottom-right
+                    [rect_width - 1, rect_height - 1],
+                ],
+                dtype=dtype,
+            )
+
+            # Get the perspective transform matrix using the box points
+            affine_mat = cv2.getPerspectiveTransform(box_points, dst_pts)
+
+            # Perform the perspective warp to get the rectified crop
+            crop = cv2.warpPerspective(
+                src_img,
+                affine_mat,
+                (rect_width, rect_height),
+            )
+
+            # Add the crop to the list of crops
+            crops.append(crop)
+
+    # Handle any oriented boxes
+    else:
+        src_pts = _boxes[:, :3].astype(np.float32)
+        # Preserve size
+        d1 = np.linalg.norm(src_pts[:, 0] - src_pts[:, 1], axis=-1)
+        d2 = np.linalg.norm(src_pts[:, 1] - src_pts[:, 2], axis=-1)
+        # (N, 3, 2)
+        dst_pts = np.zeros((_boxes.shape[0], 3, 2), dtype=dtype)
+        dst_pts[:, 1, 0] = dst_pts[:, 2, 0] = d1 - 1
+        dst_pts[:, 2, 1] = d2 - 1
+        # Use a warp transformation to extract the crop
+        crops = [
+            cv2.warpAffine(
+                src_img,
+                # Transformation matrix
+                cv2.getAffineTransform(src_pts[idx], dst_pts[idx]),
+                (int(d1[idx]), int(d2[idx])),
+            )
+            for idx in range(_boxes.shape[0])
+        ]
     return crops  # type: ignore[return-value]
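
A small sketch exercising the two additions above (array values are made up):

import numpy as np
from doctr.utils.geometry import extract_rcrops, remove_image_padding

page = np.zeros((256, 256, 3), dtype=np.uint8)
page[32:224, 16:240] = 255  # content surrounded by black padding
trimmed = remove_image_padding(page)  # -> shape (192, 224, 3)

# One roughly horizontal quadrilateral in relative (N, 4, 2) coordinates
polys = np.array([[[0.1, 0.2], [0.6, 0.22], [0.6, 0.32], [0.1, 0.3]]], dtype=np.float32)
crops = extract_rcrops(trimmed, polys, assume_horizontal=True)  # takes the new perspective-warp branch
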
doctr/utils/metrics.py
CHANGED

@@ -149,7 +149,7 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray:
         right = np.minimum(r1, r2.T)
         bot = np.minimum(b1, b2.T)

-        intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf)
+        intersection = np.clip(right - left, 0, np.inf) * np.clip(bot - top, 0, np.inf)
         union = (r1 - l1) * (b1 - t1) + ((r2 - l2) * (b2 - t2)).T - intersection
         iou_mat = intersection / union

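A quick sketch of the function being patched (box values are illustrative):

import numpy as np
from doctr.utils.metrics import box_iou

a = np.array([[0, 0, 10, 10]], dtype=np.float32)  # (xmin, ymin, xmax, ymax)
b = np.array([[5, 5, 15, 15]], dtype=np.float32)
iou = box_iou(a, b)  # 25 / 175 ≈ 0.143, now computed with np.inf since NumPy 2.x drops np.Inf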