python-doctr 0.8.1__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. doctr/__init__.py +1 -1
  2. doctr/contrib/__init__.py +0 -0
  3. doctr/contrib/artefacts.py +131 -0
  4. doctr/contrib/base.py +105 -0
  5. doctr/datasets/cord.py +10 -1
  6. doctr/datasets/datasets/pytorch.py +2 -2
  7. doctr/datasets/funsd.py +11 -1
  8. doctr/datasets/generator/base.py +6 -5
  9. doctr/datasets/ic03.py +11 -1
  10. doctr/datasets/ic13.py +10 -1
  11. doctr/datasets/iiit5k.py +26 -16
  12. doctr/datasets/imgur5k.py +11 -2
  13. doctr/datasets/loader.py +1 -6
  14. doctr/datasets/sroie.py +11 -1
  15. doctr/datasets/svhn.py +11 -1
  16. doctr/datasets/svt.py +11 -1
  17. doctr/datasets/synthtext.py +11 -1
  18. doctr/datasets/utils.py +9 -3
  19. doctr/datasets/vocabs.py +15 -4
  20. doctr/datasets/wildreceipt.py +12 -1
  21. doctr/file_utils.py +45 -12
  22. doctr/io/elements.py +52 -10
  23. doctr/io/html.py +2 -2
  24. doctr/io/image/pytorch.py +6 -8
  25. doctr/io/image/tensorflow.py +1 -1
  26. doctr/io/pdf.py +5 -2
  27. doctr/io/reader.py +6 -0
  28. doctr/models/__init__.py +0 -1
  29. doctr/models/_utils.py +57 -20
  30. doctr/models/builder.py +73 -15
  31. doctr/models/classification/magc_resnet/tensorflow.py +13 -6
  32. doctr/models/classification/mobilenet/pytorch.py +47 -9
  33. doctr/models/classification/mobilenet/tensorflow.py +51 -14
  34. doctr/models/classification/predictor/pytorch.py +28 -17
  35. doctr/models/classification/predictor/tensorflow.py +26 -16
  36. doctr/models/classification/resnet/tensorflow.py +21 -8
  37. doctr/models/classification/textnet/pytorch.py +3 -3
  38. doctr/models/classification/textnet/tensorflow.py +11 -5
  39. doctr/models/classification/vgg/tensorflow.py +9 -3
  40. doctr/models/classification/vit/tensorflow.py +10 -4
  41. doctr/models/classification/zoo.py +55 -19
  42. doctr/models/detection/_utils/__init__.py +1 -0
  43. doctr/models/detection/_utils/base.py +66 -0
  44. doctr/models/detection/differentiable_binarization/base.py +4 -3
  45. doctr/models/detection/differentiable_binarization/pytorch.py +2 -2
  46. doctr/models/detection/differentiable_binarization/tensorflow.py +34 -12
  47. doctr/models/detection/fast/base.py +6 -5
  48. doctr/models/detection/fast/pytorch.py +4 -4
  49. doctr/models/detection/fast/tensorflow.py +15 -12
  50. doctr/models/detection/linknet/base.py +4 -3
  51. doctr/models/detection/linknet/tensorflow.py +23 -11
  52. doctr/models/detection/predictor/pytorch.py +15 -1
  53. doctr/models/detection/predictor/tensorflow.py +17 -3
  54. doctr/models/detection/zoo.py +7 -2
  55. doctr/models/factory/hub.py +8 -18
  56. doctr/models/kie_predictor/base.py +13 -3
  57. doctr/models/kie_predictor/pytorch.py +45 -20
  58. doctr/models/kie_predictor/tensorflow.py +44 -17
  59. doctr/models/modules/layers/pytorch.py +2 -3
  60. doctr/models/modules/layers/tensorflow.py +6 -8
  61. doctr/models/modules/transformer/pytorch.py +2 -2
  62. doctr/models/modules/transformer/tensorflow.py +0 -2
  63. doctr/models/modules/vision_transformer/pytorch.py +1 -1
  64. doctr/models/modules/vision_transformer/tensorflow.py +1 -1
  65. doctr/models/predictor/base.py +97 -58
  66. doctr/models/predictor/pytorch.py +35 -20
  67. doctr/models/predictor/tensorflow.py +35 -18
  68. doctr/models/preprocessor/pytorch.py +4 -4
  69. doctr/models/preprocessor/tensorflow.py +3 -2
  70. doctr/models/recognition/crnn/tensorflow.py +8 -6
  71. doctr/models/recognition/master/pytorch.py +2 -2
  72. doctr/models/recognition/master/tensorflow.py +9 -4
  73. doctr/models/recognition/parseq/pytorch.py +4 -3
  74. doctr/models/recognition/parseq/tensorflow.py +14 -11
  75. doctr/models/recognition/sar/pytorch.py +7 -6
  76. doctr/models/recognition/sar/tensorflow.py +10 -12
  77. doctr/models/recognition/vitstr/pytorch.py +1 -1
  78. doctr/models/recognition/vitstr/tensorflow.py +9 -4
  79. doctr/models/recognition/zoo.py +1 -1
  80. doctr/models/utils/pytorch.py +1 -1
  81. doctr/models/utils/tensorflow.py +15 -15
  82. doctr/models/zoo.py +2 -2
  83. doctr/py.typed +0 -0
  84. doctr/transforms/functional/base.py +1 -1
  85. doctr/transforms/functional/pytorch.py +5 -5
  86. doctr/transforms/modules/base.py +37 -15
  87. doctr/transforms/modules/pytorch.py +73 -14
  88. doctr/transforms/modules/tensorflow.py +78 -19
  89. doctr/utils/fonts.py +7 -5
  90. doctr/utils/geometry.py +141 -31
  91. doctr/utils/metrics.py +34 -175
  92. doctr/utils/reconstitution.py +212 -0
  93. doctr/utils/visualization.py +5 -118
  94. doctr/version.py +1 -1
  95. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/METADATA +85 -81
  96. python_doctr-0.10.0.dist-info/RECORD +173 -0
  97. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/WHEEL +1 -1
  98. doctr/models/artefacts/__init__.py +0 -2
  99. doctr/models/artefacts/barcode.py +0 -74
  100. doctr/models/artefacts/face.py +0 -63
  101. doctr/models/obj_detection/__init__.py +0 -1
  102. doctr/models/obj_detection/faster_rcnn/__init__.py +0 -4
  103. doctr/models/obj_detection/faster_rcnn/pytorch.py +0 -81
  104. python_doctr-0.8.1.dist-info/RECORD +0 -173
  105. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/LICENSE +0 -0
  106. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/top_level.txt +0 -0
  107. {python_doctr-0.8.1.dist-info → python_doctr-0.10.0.dist-info}/zip-safe +0 -0
doctr/models/recognition/vitstr/tensorflow.py CHANGED
@@ -12,7 +12,7 @@ from tensorflow.keras import Model, layers
 from doctr.datasets import VOCABS
 
 from ...classification import vit_b, vit_s
-from ...utils.tensorflow import _bf16_to_float32, load_pretrained_params
+from ...utils.tensorflow import _bf16_to_float32, _build_model, load_pretrained_params
 from .base import _ViTSTR, _ViTSTRPostProcessor
 
 __all__ = ["ViTSTR", "vitstr_small", "vitstr_base"]
@@ -23,14 +23,14 @@ default_cfgs: Dict[str, Dict[str, Any]] = {
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.6.0/vitstr_small-358fab2e.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_small-d28b8d92.weights.h5&src=0",
     },
     "vitstr_base": {
         "mean": (0.694, 0.695, 0.693),
         "std": (0.299, 0.296, 0.301),
         "input_shape": (32, 128, 3),
         "vocab": VOCABS["french"],
-        "url": "https://doctr-static.mindee.com/models?id=v0.6.0/vitstr_base-2889159a.zip&src=0",
+        "url": "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_base-9ad6eb84.weights.h5&src=0",
     },
 }
 
@@ -216,9 +216,14 @@ def _vitstr(
 
     # Build the model
     model = ViTSTR(feat_extractor, cfg=_cfg, **kwargs)
+    _build_model(model)
+
     # Load pretrained parameters
    if pretrained:
-        load_pretrained_params(model, default_cfgs[arch]["url"])
+        # The given vocab differs from the pretrained model => skip the mismatching layers for fine tuning
+        load_pretrained_params(
+            model, default_cfgs[arch]["url"], skip_mismatch=kwargs["vocab"] != default_cfgs[arch]["vocab"]
+        )
 
     return model
 
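Net effect of this file's changes: TensorFlow ViTSTR checkpoints now ship as single .weights.h5 files, the model is built eagerly, and requesting a non-default vocab loads the pretrained weights with skip_mismatch so only the shape-mismatched head is re-initialized. A minimal fine-tuning sketch (the vocab choice is illustrative):

    from doctr.datasets import VOCABS
    from doctr.models import vitstr_small

    # Same vocab as the checkpoint: weights load strictly
    model = vitstr_small(pretrained=True)
    # Different vocab: mismatched classification layers are skipped, the rest is loaded
    model_ft = vitstr_small(pretrained=True, vocab=VOCABS["english"])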
doctr/models/recognition/zoo.py CHANGED
@@ -45,7 +45,7 @@ def _predictor(arch: Any, pretrained: bool, **kwargs: Any) -> RecognitionPredictor:
 
     kwargs["mean"] = kwargs.get("mean", _model.cfg["mean"])
     kwargs["std"] = kwargs.get("std", _model.cfg["std"])
-    kwargs["batch_size"] = kwargs.get("batch_size", 32)
+    kwargs["batch_size"] = kwargs.get("batch_size", 128)
     input_shape = _model.cfg["input_shape"][:2] if is_tf_available() else _model.cfg["input_shape"][-2:]
     predictor = RecognitionPredictor(PreProcessor(input_shape, preserve_aspect_ratio=True, **kwargs), _model)
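The only change here is the default recognition batch size, raised from 32 to 128. Callers that tuned memory or throughput around the old default can restore it explicitly; a hedged sketch:

    from doctr.models import recognition_predictor

    # batch_size is forwarded to the PreProcessor; 32 was the 0.8.x default
    predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=32)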
doctr/models/utils/pytorch.py CHANGED
@@ -157,7 +157,7 @@ def export_model_to_onnx(model: nn.Module, model_name: str, dummy_input: torch.Tensor
     """
     torch.onnx.export(
         model,
-        dummy_input,
+        dummy_input,  # type: ignore[arg-type]
         f"{model_name}.onnx",
         input_names=["input"],
         output_names=["logits"],
doctr/models/utils/tensorflow.py CHANGED
@@ -4,9 +4,7 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import logging
-import os
 from typing import Any, Callable, List, Optional, Tuple, Union
-from zipfile import ZipFile
 
 import tensorflow as tf
 import tf2onnx
@@ -19,6 +17,7 @@ logging.getLogger("tensorflow").setLevel(logging.DEBUG)
 
 __all__ = [
     "load_pretrained_params",
+    "_build_model",
     "conv_sequence",
     "IntermediateLayerGetter",
     "export_model_to_onnx",
@@ -36,41 +35,42 @@ def _bf16_to_float32(x: tf.Tensor) -> tf.Tensor:
     return tf.cast(x, tf.float32) if x.dtype == tf.bfloat16 else x
 
 
+def _build_model(model: Model):
+    """Build a model by calling it once with dummy input
+
+    Args:
+    ----
+        model: the model to be built
+    """
+    model(tf.zeros((1, *model.cfg["input_shape"])), training=False)
+
+
 def load_pretrained_params(
     model: Model,
     url: Optional[str] = None,
     hash_prefix: Optional[str] = None,
-    overwrite: bool = False,
-    internal_name: str = "weights",
+    skip_mismatch: bool = False,
     **kwargs: Any,
 ) -> None:
     """Load a set of parameters onto a model
 
     >>> from doctr.models import load_pretrained_params
-    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.zip")
+    >>> load_pretrained_params(model, "https://yoursource.com/yourcheckpoint-yourhash.weights.h5")
 
     Args:
     ----
         model: the keras model to be loaded
         url: URL of the zipped set of parameters
         hash_prefix: first characters of SHA256 expected hash
-        overwrite: should the zip extraction be enforced if the archive has already been extracted
-        internal_name: name of the ckpt files
+        skip_mismatch: skip loading layers with mismatched shapes
         **kwargs: additional arguments to be passed to `doctr.utils.data.download_from_url`
     """
     if url is None:
         logging.warning("Invalid model URL, using default initialization.")
     else:
         archive_path = download_from_url(url, hash_prefix=hash_prefix, cache_subdir="models", **kwargs)
-
-        # Unzip the archive
-        params_path = archive_path.parent.joinpath(archive_path.stem)
-        if not params_path.is_dir() or overwrite:
-            with ZipFile(archive_path, "r") as f:
-                f.extractall(path=params_path)
-
         # Load weights
-        model.load_weights(f"{params_path}{os.sep}{internal_name}")
+        model.load_weights(archive_path, skip_mismatch=skip_mismatch)
 
 
 def conv_sequence(
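Two behavioral shifts are visible above: checkpoints are single .weights.h5 files loaded directly (no zip extraction, hence the removal of overwrite and internal_name), and a Keras model must be built before weights can be loaded, which _build_model does with one dummy forward pass. A hedged sketch of manual loading (URL taken from the vitstr config diff above):

    from doctr.models import vitstr_small
    from doctr.models.utils.tensorflow import load_pretrained_params

    model = vitstr_small(pretrained=False)  # _vitstr already calls _build_model internally
    load_pretrained_params(
        model,
        "https://doctr-static.mindee.com/models?id=v0.9.0/vitstr_small-d28b8d92.weights.h5&src=0",
        skip_mismatch=False,  # set True to tolerate a head resized for another vocab
    )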
doctr/models/zoo.py CHANGED
@@ -61,7 +61,7 @@ def _predictor(
 
 
 def ocr_predictor(
-    det_arch: Any = "db_resnet50",
+    det_arch: Any = "fast_base",
     reco_arch: Any = "crnn_vgg16_bn",
     pretrained: bool = False,
     pretrained_backbone: bool = True,
@@ -175,7 +175,7 @@ def _kie_predictor(
 
 
 def kie_predictor(
-    det_arch: Any = "db_resnet50",
+    det_arch: Any = "fast_base",
     reco_arch: Any = "crnn_vgg16_bn",
     pretrained: bool = False,
     pretrained_backbone: bool = True,
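Both high-level entry points now default to the lighter fast_base detector instead of db_resnet50. Pin det_arch to reproduce the previous pipeline:

    from doctr.models import ocr_predictor

    # Explicit det_arch keeps the 0.8.x detection backbone
    predictor = ocr_predictor(det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True)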
doctr/py.typed ADDED
File without changes
doctr/transforms/functional/base.py CHANGED
@@ -200,4 +200,4 @@ def create_shadow_mask(
     mask: np.ndarray = np.zeros((*target_shape, 1), dtype=np.uint8)
     mask = cv2.fillPoly(mask, [final_contour], (255,), lineType=cv2.LINE_AA)[..., 0]
 
-    return (mask / 255).astype(np.float32).clip(0, 1) * intensity_mask.astype(np.float32)  # type: ignore[operator]
+    return (mask / 255).astype(np.float32).clip(0, 1) * intensity_mask.astype(np.float32)
doctr/transforms/functional/pytorch.py CHANGED
@@ -35,9 +35,9 @@ def invert_colors(img: torch.Tensor, min_val: float = 0.6) -> torch.Tensor:
     rgb_shift = min_val + (1 - min_val) * torch.rand(shift_shape)
     # Inverse the color
     if out.dtype == torch.uint8:
-        out = (out.to(dtype=rgb_shift.dtype) * rgb_shift).to(dtype=torch.uint8)  # type: ignore[attr-defined]
+        out = (out.to(dtype=rgb_shift.dtype) * rgb_shift).to(dtype=torch.uint8)
     else:
-        out = out * rgb_shift.to(dtype=out.dtype)  # type: ignore[attr-defined]
+        out = out * rgb_shift.to(dtype=out.dtype)
     # Inverse the color
     out = 255 - out if out.dtype == torch.uint8 else 1 - out
     return out
@@ -81,7 +81,7 @@ def rotate_sample(
     rotated_geoms: np.ndarray = rotate_abs_geoms(
         _geoms,
         angle,
-        img.shape[1:],
+        img.shape[1:],  # type: ignore[arg-type]
         expand,
     ).astype(np.float32)
 
@@ -89,7 +89,7 @@ def rotate_sample(
     rotated_geoms[..., 0] = rotated_geoms[..., 0] / rotated_img.shape[2]
     rotated_geoms[..., 1] = rotated_geoms[..., 1] / rotated_img.shape[1]
 
-    return rotated_img, np.clip(rotated_geoms, 0, 1)
+    return rotated_img, np.clip(np.around(rotated_geoms, decimals=15), 0, 1)
 
 
 def crop_detection(
@@ -132,7 +132,7 @@ def random_shadow(img: torch.Tensor, opacity_range: Tuple[float, float], **kwargs
     -------
         shaded image
     """
-    shadow_mask = create_shadow_mask(img.shape[1:], **kwargs)
+    shadow_mask = create_shadow_mask(img.shape[1:], **kwargs)  # type: ignore[arg-type]
 
     opacity = np.random.uniform(*opacity_range)
     shadow_tensor = 1 - torch.from_numpy(shadow_mask[None, ...])
doctr/transforms/modules/base.py CHANGED
@@ -5,7 +5,7 @@
 
 import math
 import random
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -168,11 +168,11 @@ class OneOf(NestedObject):
     def __init__(self, transforms: List[Callable[[Any], Any]]) -> None:
         self.transforms = transforms
 
-    def __call__(self, img: Any) -> Any:
+    def __call__(self, img: Any, target: Optional[np.ndarray] = None) -> Union[Any, Tuple[Any, np.ndarray]]:
         # Pick transformation
         transfo = self.transforms[int(random.random() * len(self.transforms))]
         # Apply
-        return transfo(img)
+        return transfo(img) if target is None else transfo(img, target)  # type: ignore[call-arg]
 
 
 class RandomApply(NestedObject):
@@ -261,17 +261,39 @@ class RandomCrop(NestedObject):
     def extra_repr(self) -> str:
         return f"scale={self.scale}, ratio={self.ratio}"
 
-    def __call__(self, img: Any, target: Dict[str, np.ndarray]) -> Tuple[Any, Dict[str, np.ndarray]]:
+    def __call__(self, img: Any, target: np.ndarray) -> Tuple[Any, np.ndarray]:
         scale = random.uniform(self.scale[0], self.scale[1])
         ratio = random.uniform(self.ratio[0], self.ratio[1])
-        # Those might overflow
-        crop_h = math.sqrt(scale * ratio)
-        crop_w = math.sqrt(scale / ratio)
-        xmin, ymin = random.uniform(0, 1 - crop_w), random.uniform(0, 1 - crop_h)
-        xmax, ymax = xmin + crop_w, ymin + crop_h
-        # Clip them
-        xmin, ymin = max(xmin, 0), max(ymin, 0)
-        xmax, ymax = min(xmax, 1), min(ymax, 1)
-
-        croped_img, crop_boxes = F.crop_detection(img, target["boxes"], (xmin, ymin, xmax, ymax))
-        return croped_img, dict(boxes=crop_boxes)
+
+        height, width = img.shape[:2]
+
+        # Calculate crop size
+        crop_area = scale * width * height
+        aspect_ratio = ratio * (width / height)
+        crop_width = int(round(math.sqrt(crop_area * aspect_ratio)))
+        crop_height = int(round(math.sqrt(crop_area / aspect_ratio)))
+
+        # Ensure crop size does not exceed image dimensions
+        crop_width = min(crop_width, width)
+        crop_height = min(crop_height, height)
+
+        # Randomly select crop position
+        x = random.randint(0, width - crop_width)
+        y = random.randint(0, height - crop_height)
+
+        # relative crop box
+        crop_box = (x / width, y / height, (x + crop_width) / width, (y + crop_height) / height)
+        if target.shape[1:] == (4, 2):
+            min_xy = np.min(target, axis=1)
+            max_xy = np.max(target, axis=1)
+            _target = np.concatenate((min_xy, max_xy), axis=1)
+        else:
+            _target = target
+
+        # Crop image and targets
+        croped_img, crop_boxes = F.crop_detection(img, _target, crop_box)
+        # hard fallback if no box is kept
+        if crop_boxes.shape[0] == 0:
+            return img, target
+        # clip boxes
+        return croped_img, np.clip(crop_boxes, 0, 1)
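The rewritten RandomCrop works in absolute pixels, reduces (N, 4, 2) polygon targets to axis-aligned boxes via per-box min/max corners, and returns the sample unchanged if the crop would drop every box. A hedged sketch of the new call contract, assuming the TensorFlow backend and an (H, W, C) image (scale/ratio values are illustrative):

    import numpy as np
    import tensorflow as tf
    from doctr.transforms import RandomCrop

    transfo = RandomCrop(scale=(0.08, 1.0), ratio=(0.75, 1.33))
    img = tf.random.uniform((64, 64, 3), 0, 1)  # img.shape[:2] == (height, width)
    boxes = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)  # relative (N, 4) boxes
    cropped_img, cropped_boxes = transfo(img, boxes)  # boxes come back clipped to [0, 1]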
doctr/transforms/modules/pytorch.py CHANGED
@@ -4,7 +4,7 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -15,7 +15,7 @@ from torchvision.transforms import transforms as T
 
 from ..functional.pytorch import random_shadow
 
-__all__ = ["Resize", "GaussianNoise", "ChannelShuffle", "RandomHorizontalFlip", "RandomShadow"]
+__all__ = ["Resize", "GaussianNoise", "ChannelShuffle", "RandomHorizontalFlip", "RandomShadow", "RandomResize"]
 
 
 class Resize(T.Resize):
@@ -74,16 +74,18 @@ class Resize(T.Resize):
             if self.symmetric_pad:
                 half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2))
                 _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1])
+            # Pad image
             img = pad(img, _pad)
 
         # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
         if target is not None:
+            if self.symmetric_pad:
+                offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
+
             if self.preserve_aspect_ratio:
                 # Get absolute coords
                 if target.shape[1:] == (4,):
                     if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                         target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1]
                         target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2]
                     else:
@@ -91,16 +93,15 @@ class Resize(T.Resize):
                         target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2]
                 elif target.shape[1:] == (4, 2):
                     if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                         target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1]
                         target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2]
                     else:
                         target[..., 0] *= raw_shape[-1] / img.shape[-1]
                         target[..., 1] *= raw_shape[-2] / img.shape[-2]
                 else:
-                    raise AssertionError
-            return img, target
+                    raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+            return img, np.clip(target, 0, 1)
 
         return img
 
@@ -135,9 +136,9 @@ class GaussianNoise(torch.nn.Module):
         # Reshape the distribution
         noise = self.mean + 2 * self.std * torch.rand(x.shape, device=x.device) - self.std
         if x.dtype == torch.uint8:
-            return (x + 255 * noise).round().clamp(0, 255).to(dtype=torch.uint8)  # type: ignore[attr-defined]
+            return (x + 255 * noise).round().clamp(0, 255).to(dtype=torch.uint8)
         else:
-            return (x + noise.to(dtype=x.dtype)).clamp(0, 1)  # type: ignore[attr-defined]
+            return (x + noise.to(dtype=x.dtype)).clamp(0, 1)
 
     def extra_repr(self) -> str:
         return f"mean={self.mean}, std={self.std}"
@@ -159,13 +160,16 @@ class RandomHorizontalFlip(T.RandomHorizontalFlip):
     """Randomly flip the input image horizontally"""
 
     def forward(
-        self, img: Union[torch.Tensor, Image], target: Dict[str, Any]
-    ) -> Tuple[Union[torch.Tensor, Image], Dict[str, Any]]:
+        self, img: Union[torch.Tensor, Image], target: np.ndarray
+    ) -> Tuple[Union[torch.Tensor, Image], np.ndarray]:
         if torch.rand(1) < self.p:
             _img = F.hflip(img)
             _target = target.copy()
             # Changing the relative bbox coordinates
-            _target["boxes"][:, ::2] = 1 - target["boxes"][:, [2, 0]]
+            if target.shape[1:] == (4,):
+                _target[:, ::2] = 1 - target[:, [2, 0]]
+            else:
+                _target[..., 0] = 1 - target[..., 0]
             return _img, _target
         return img, target
 
@@ -199,7 +203,7 @@ class RandomShadow(torch.nn.Module):
                         self.opacity_range,
                     )
                 )
-                .round()  # type: ignore[attr-defined]
+                .round()
                .clip(0, 255)
                 .to(dtype=torch.uint8)
             )
@@ -210,3 +214,58 @@ class RandomShadow(torch.nn.Module):
 
     def extra_repr(self) -> str:
         return f"opacity_range={self.opacity_range}"
+
+
+class RandomResize(torch.nn.Module):
+    """Randomly resize the input image and align corresponding targets
+
+    >>> import torch
+    >>> from doctr.transforms import RandomResize
+    >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5)
+    >>> out = transfo(torch.rand((3, 64, 64)))
+
+    Args:
+    ----
+        scale_range: range of the resizing factor for width and height (independently)
+        preserve_aspect_ratio: whether to preserve the aspect ratio of the image,
+            given a float value, the aspect ratio will be preserved with this probability
+        symmetric_pad: whether to symmetrically pad the image,
+            given a float value, the symmetric padding will be applied with this probability
+        p: probability to apply the transformation
+    """
+
+    def __init__(
+        self,
+        scale_range: Tuple[float, float] = (0.3, 0.9),
+        preserve_aspect_ratio: Union[bool, float] = False,
+        symmetric_pad: Union[bool, float] = False,
+        p: float = 0.5,
+    ) -> None:
+        super().__init__()
+        self.scale_range = scale_range
+        self.preserve_aspect_ratio = preserve_aspect_ratio
+        self.symmetric_pad = symmetric_pad
+        self.p = p
+        self._resize = Resize
+
+    def forward(self, img: torch.Tensor, target: np.ndarray) -> Tuple[torch.Tensor, np.ndarray]:
+        if torch.rand(1) < self.p:
+            scale_h = np.random.uniform(*self.scale_range)
+            scale_w = np.random.uniform(*self.scale_range)
+            new_size = (int(img.shape[-2] * scale_h), int(img.shape[-1] * scale_w))
+
+            _img, _target = self._resize(
+                new_size,
+                preserve_aspect_ratio=self.preserve_aspect_ratio
+                if isinstance(self.preserve_aspect_ratio, bool)
+                else bool(torch.rand(1) <= self.symmetric_pad),
+                symmetric_pad=self.symmetric_pad
+                if isinstance(self.symmetric_pad, bool)
+                else bool(torch.rand(1) <= self.symmetric_pad),
+            )(img, target)
+
+            return _img, _target
+        return img, target
+
+    def extra_repr(self) -> str:
+        return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}"  # noqa: E501
doctr/transforms/modules/tensorflow.py CHANGED
@@ -4,7 +4,7 @@
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 import tensorflow as tf
@@ -30,6 +30,7 @@ __all__ = [
     "GaussianNoise",
     "RandomHorizontalFlip",
     "RandomShadow",
+    "RandomResize",
 ]
 
 
@@ -106,29 +107,34 @@ class Resize(NestedObject):
         target: Optional[np.ndarray] = None,
     ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]:
         input_dtype = img.dtype
+        self.output_size = (
+            (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size
+        )
 
         img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias)
         # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio
         raw_shape = img.shape[:2]
+        if self.symmetric_pad:
+            half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0)
         if self.preserve_aspect_ratio:
             if isinstance(self.output_size, (tuple, list)):
                 # In that case we need to pad because we want to enforce both width and height
                 if not self.symmetric_pad:
-                    offset = (0, 0)
+                    half_pad = (0, 0)
                 elif self.output_size[0] == img.shape[0]:
-                    offset = (0, int((self.output_size[1] - img.shape[1]) / 2))
-                else:
-                    offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
-                img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
+                    half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2))
+                # Pad image
+                img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size)
 
         # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
         if target is not None:
+            if self.symmetric_pad:
+                offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1]
+
             if self.preserve_aspect_ratio:
                 # Get absolute coords
                 if target.shape[1:] == (4,):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1]
                         target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0]
                     else:
@@ -136,16 +142,15 @@ class Resize(NestedObject):
                         target[:, [1, 3]] *= raw_shape[0] / img.shape[0]
                 elif target.shape[1:] == (4, 2):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1]
                         target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0]
                     else:
                         target[..., 0] *= raw_shape[1] / img.shape[1]
                         target[..., 1] *= raw_shape[0] / img.shape[0]
                 else:
-                    raise AssertionError
-            return tf.cast(img, dtype=input_dtype), target
+                    raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+            return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1)
 
         return tf.cast(img, dtype=input_dtype)
 
@@ -394,7 +399,6 @@ class GaussianBlur(NestedObject):
     def extra_repr(self) -> str:
         return f"kernel_shape={self.kernel_shape}, std={self.std}"
 
-    @tf.function
     def __call__(self, img: tf.Tensor) -> tf.Tensor:
         return tf.squeeze(
             _gaussian_filter(
@@ -457,10 +461,7 @@ class RandomHorizontalFlip(NestedObject):
     >>> from doctr.transforms import RandomHorizontalFlip
     >>> transfo = RandomHorizontalFlip(p=0.5)
     >>> image = tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1)
-    >>> target = {
-    >>>     "boxes": np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32),
-    >>>     "labels": np.ones(1, dtype=np.int64)
-    >>> }
+    >>> target = np.array([[0.1, 0.1, 0.4, 0.5]], dtype=np.float32)
     >>> out = transfo(image, target)
 
     Args:
@@ -472,12 +473,15 @@ class RandomHorizontalFlip(NestedObject):
         super().__init__()
         self.p = p
 
-    def __call__(self, img: Union[tf.Tensor, np.ndarray], target: Dict[str, Any]) -> Tuple[tf.Tensor, Dict[str, Any]]:
+    def __call__(self, img: Union[tf.Tensor, np.ndarray], target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
         if np.random.rand(1) <= self.p:
             _img = tf.image.flip_left_right(img)
             _target = target.copy()
             # Changing the relative bbox coordinates
-            _target["boxes"][:, ::2] = 1 - target["boxes"][:, [2, 0]]
+            if target.shape[1:] == (4,):
+                _target[:, ::2] = 1 - target[:, [2, 0]]
+            else:
+                _target[..., 0] = 1 - target[..., 0]
             return _img, _target
         return img, target
 
@@ -515,3 +519,58 @@ class RandomShadow(NestedObject):
 
     def extra_repr(self) -> str:
         return f"opacity_range={self.opacity_range}"
+
+
+class RandomResize(NestedObject):
+    """Randomly resize the input image and align corresponding targets
+
+    >>> import tensorflow as tf
+    >>> from doctr.transforms import RandomResize
+    >>> transfo = RandomResize((0.3, 0.9), preserve_aspect_ratio=True, symmetric_pad=True, p=0.5)
+    >>> out = transfo(tf.random.uniform(shape=[64, 64, 3], minval=0, maxval=1))
+
+    Args:
+    ----
+        scale_range: range of the resizing factor for width and height (independently)
+        preserve_aspect_ratio: whether to preserve the aspect ratio of the image,
+            given a float value, the aspect ratio will be preserved with this probability
+        symmetric_pad: whether to symmetrically pad the image,
+            given a float value, the symmetric padding will be applied with this probability
+        p: probability to apply the transformation
+    """
+
+    def __init__(
+        self,
+        scale_range: Tuple[float, float] = (0.3, 0.9),
+        preserve_aspect_ratio: Union[bool, float] = False,
+        symmetric_pad: Union[bool, float] = False,
+        p: float = 0.5,
+    ):
+        super().__init__()
+        self.scale_range = scale_range
+        self.preserve_aspect_ratio = preserve_aspect_ratio
+        self.symmetric_pad = symmetric_pad
+        self.p = p
+        self._resize = Resize
+
+    def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]:
+        if np.random.rand(1) <= self.p:
+            scale_h = random.uniform(*self.scale_range)
+            scale_w = random.uniform(*self.scale_range)
+            new_size = (int(img.shape[-3] * scale_h), int(img.shape[-2] * scale_w))
+
+            _img, _target = self._resize(
+                new_size,
+                preserve_aspect_ratio=self.preserve_aspect_ratio
+                if isinstance(self.preserve_aspect_ratio, bool)
+                else bool(np.random.rand(1) <= self.symmetric_pad),
+                symmetric_pad=self.symmetric_pad
+                if isinstance(self.symmetric_pad, bool)
+                else bool(np.random.rand(1) <= self.symmetric_pad),
+            )(img, target)
+
+            return _img, _target
+        return img, target
+
+    def extra_repr(self) -> str:
+        return f"scale_range={self.scale_range}, preserve_aspect_ratio={self.preserve_aspect_ratio}, symmetric_pad={self.symmetric_pad}, p={self.p}"  # noqa: E501
doctr/utils/fonts.py CHANGED
@@ -5,14 +5,16 @@
 
 import logging
 import platform
-from typing import Optional
+from typing import Optional, Union
 
 from PIL import ImageFont
 
 __all__ = ["get_font"]
 
 
-def get_font(font_family: Optional[str] = None, font_size: int = 13) -> ImageFont.ImageFont:
+def get_font(
+    font_family: Optional[str] = None, font_size: int = 13
+) -> Union[ImageFont.FreeTypeFont, ImageFont.ImageFont]:
     """Resolves a compatible ImageFont for the system
 
     Args:
@@ -28,14 +30,14 @@ def get_font(font_family: Optional[str] = None, font_size: int = 13) -> ImageFont.ImageFont:
     if font_family is None:
         try:
             font = ImageFont.truetype("FreeMono.ttf" if platform.system() == "Linux" else "Arial.ttf", font_size)
-        except OSError:
-            font = ImageFont.load_default()
+        except OSError:  # pragma: no cover
+            font = ImageFont.load_default()  # type: ignore[assignment]
             logging.warning(
                 "unable to load recommended font family. Loading default PIL font,"
                 "font size issues may be expected."
                 "To prevent this, it is recommended to specify the value of 'font_family'."
             )
-    else:
+    else:  # pragma: no cover
         font = ImageFont.truetype(font_family, font_size)
 
     return font
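The widened return annotation reflects what PIL actually hands back: a FreeTypeFont when a TTF is resolvable, or the bitmap default font as a fallback. A hedged sketch (the font name is illustrative):

    from doctr.utils.fonts import get_font

    font = get_font(font_size=16)  # tries FreeMono.ttf (Linux) or Arial.ttf, falls back to PIL default
    custom = get_font("DejaVuSans.ttf", 13)  # any TTF that PIL can resolve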