python-doctr 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/datasets/__init__.py +1 -0
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +2 -1
- doctr/datasets/funsd.py +2 -2
- doctr/datasets/ic03.py +1 -1
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +4 -1
- doctr/datasets/imgur5k.py +9 -2
- doctr/datasets/loader.py +1 -1
- doctr/datasets/ocr.py +1 -1
- doctr/datasets/recognition.py +1 -1
- doctr/datasets/svhn.py +1 -1
- doctr/datasets/svt.py +2 -2
- doctr/datasets/synthtext.py +15 -2
- doctr/datasets/utils.py +7 -6
- doctr/datasets/vocabs.py +1102 -54
- doctr/file_utils.py +9 -0
- doctr/io/elements.py +37 -3
- doctr/models/_utils.py +1 -1
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/pytorch.py +1 -2
- doctr/models/classification/magc_resnet/tensorflow.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +15 -1
- doctr/models/classification/mobilenet/tensorflow.py +11 -2
- doctr/models/classification/predictor/pytorch.py +1 -1
- doctr/models/classification/resnet/pytorch.py +26 -3
- doctr/models/classification/resnet/tensorflow.py +25 -4
- doctr/models/classification/textnet/pytorch.py +10 -1
- doctr/models/classification/textnet/tensorflow.py +11 -2
- doctr/models/classification/vgg/pytorch.py +16 -1
- doctr/models/classification/vgg/tensorflow.py +11 -2
- doctr/models/classification/vip/__init__.py +4 -0
- doctr/models/classification/vip/layers/__init__.py +4 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/pytorch.py +10 -1
- doctr/models/classification/vit/tensorflow.py +9 -0
- doctr/models/classification/zoo.py +4 -0
- doctr/models/detection/differentiable_binarization/base.py +3 -4
- doctr/models/detection/differentiable_binarization/pytorch.py +10 -1
- doctr/models/detection/differentiable_binarization/tensorflow.py +11 -4
- doctr/models/detection/fast/base.py +2 -3
- doctr/models/detection/fast/pytorch.py +13 -4
- doctr/models/detection/fast/tensorflow.py +10 -2
- doctr/models/detection/linknet/base.py +2 -3
- doctr/models/detection/linknet/pytorch.py +10 -1
- doctr/models/detection/linknet/tensorflow.py +10 -2
- doctr/models/factory/hub.py +3 -3
- doctr/models/kie_predictor/pytorch.py +1 -1
- doctr/models/kie_predictor/tensorflow.py +1 -1
- doctr/models/modules/layers/pytorch.py +49 -1
- doctr/models/predictor/pytorch.py +1 -1
- doctr/models/predictor/tensorflow.py +1 -1
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/crnn/pytorch.py +10 -1
- doctr/models/recognition/crnn/tensorflow.py +10 -1
- doctr/models/recognition/master/pytorch.py +10 -1
- doctr/models/recognition/master/tensorflow.py +10 -3
- doctr/models/recognition/parseq/pytorch.py +23 -5
- doctr/models/recognition/parseq/tensorflow.py +13 -5
- doctr/models/recognition/predictor/_utils.py +107 -45
- doctr/models/recognition/predictor/pytorch.py +3 -3
- doctr/models/recognition/predictor/tensorflow.py +3 -3
- doctr/models/recognition/sar/pytorch.py +10 -1
- doctr/models/recognition/sar/tensorflow.py +10 -3
- doctr/models/recognition/utils.py +56 -47
- doctr/models/recognition/viptr/__init__.py +4 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/pytorch.py +10 -1
- doctr/models/recognition/vitstr/tensorflow.py +10 -3
- doctr/models/recognition/zoo.py +5 -0
- doctr/models/utils/pytorch.py +28 -18
- doctr/models/utils/tensorflow.py +15 -8
- doctr/utils/data.py +1 -1
- doctr/utils/geometry.py +1 -1
- doctr/version.py +1 -1
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +19 -3
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/RECORD +82 -75
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.11.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/datasets/__init__.py
CHANGED
doctr/datasets/coco_text.py
ADDED
@@ -0,0 +1,139 @@
+# Copyright (C) 2021-2025, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from tqdm import tqdm
+
+from .datasets import AbstractDataset
+from .utils import convert_target_to_relative, crop_bboxes_from_image
+
+__all__ = ["COCOTEXT"]
+
+
+class COCOTEXT(AbstractDataset):
+    """
+    COCO-Text dataset from `"COCO-Text: Dataset and Benchmark for Text Detection and Recognition in Natural Images"
+    <https://arxiv.org/pdf/1601.07140v2>`_ |
+    `"homepage" <https://bgshih.github.io/cocotext/>`_.
+
+    >>> # NOTE: You need to download the dataset first.
+    >>> from doctr.datasets import COCOTEXT
+    >>> train_set = COCOTEXT(train=True, img_folder="/path/to/coco_text/train2014/",
+    >>>                      label_path="/path/to/coco_text/cocotext.v2.json")
+    >>> img, target = train_set[0]
+    >>> test_set = COCOTEXT(train=False, img_folder="/path/to/coco_text/train2014/",
+    >>>                     label_path="/path/to/coco_text/cocotext.v2.json")
+    >>> img, target = test_set[0]
+
+    Args:
+        img_folder: folder with all the images of the dataset
+        label_path: path to the annotations file of the dataset
+        train: whether the subset should be the training one
+        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
+        **kwargs: keyword arguments from `AbstractDataset`.
+    """
+
+    def __init__(
+        self,
+        img_folder: str,
+        label_path: str,
+        train: bool = True,
+        use_polygons: bool = False,
+        recognition_task: bool = False,
+        detection_task: bool = False,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(
+            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
+        )
+        # Task check
+        if recognition_task and detection_task:
+            raise ValueError(
+                " 'recognition' and 'detection task' cannot be set to True simultaneously. "
+                + " To get the whole dataset with boxes and labels leave both parameters to False "
+            )
+
+        # File existence check
+        if not os.path.exists(label_path) or not os.path.exists(img_folder):
+            raise FileNotFoundError(f"unable to find {label_path if not os.path.exists(label_path) else img_folder}")
+
+        tmp_root = img_folder
+        self.train = train
+        np_dtype = np.float32
+        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
+
+        with open(label_path, "r") as file:
+            data = json.load(file)
+
+        # Filter images based on the set
+        img_items = [img for img in data["imgs"].items() if (img[1]["set"] == "train") == train]
+        box: list[float] | np.ndarray
+
+        for img_id, img_info in tqdm(img_items, desc="Preparing and Loading COCOTEXT", total=len(img_items)):
+            img_path = os.path.join(img_folder, img_info["file_name"])
+
+            # File existence check
+            if not os.path.exists(img_path):  # pragma: no cover
+                raise FileNotFoundError(f"Unable to locate {img_path}")
+
+            # Get annotations for the current image (only legible text)
+            annotations = [
+                ann
+                for ann in data["anns"].values()
+                if ann["image_id"] == int(img_id) and ann["legibility"] == "legible"
+            ]
+
+            # Some images have no annotations with readable text
+            if not annotations:  # pragma: no cover
+                continue
+
+            _targets = []
+
+            for annotation in annotations:
+                x, y, w, h = annotation["bbox"]
+                if use_polygons:
+                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+                    box = np.array(
+                        [
+                            [x, y],
+                            [x + w, y],
+                            [x + w, y + h],
+                            [x, y + h],
+                        ],
+                        dtype=np_dtype,
+                    )
+                else:
+                    # (xmin, ymin, xmax, ymax) coordinates
+                    box = [x, y, x + w, y + h]
+                _targets.append((annotation["utf8_string"], box))
+            text_targets, box_targets = zip(*_targets)
+
+            if recognition_task:
+                crops = crop_bboxes_from_image(
+                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
+                )
+                for crop, label in zip(crops, list(text_targets)):
+                    if label and " " not in label:
+                        self.data.append((crop, label))
+
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
+            else:
+                self.data.append((
+                    img_path,
+                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
+                ))
+
+        self.root = tmp_root
+
+    def extra_repr(self) -> str:
+        return f"train={self.train}"
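To make the new dataset's three loading modes concrete, here is a minimal usage sketch based only on the constructor above; the paths are placeholders and the behavior notes are inferred from the code in this diff:

from doctr.datasets import COCOTEXT

# Default mode: each sample is (image path, {"boxes": int array of shape (N, 4), "labels": list of N strings})
train_set = COCOTEXT(
    img_folder="/path/to/coco_text/train2014/",
    label_path="/path/to/coco_text/cocotext.v2.json",
    train=True,
)
img, target = train_set[0]

# Recognition mode: samples are (word-crop array, label); labels that are empty or contain a space are skipped
reco_set = COCOTEXT(
    img_folder="/path/to/coco_text/train2014/",
    label_path="/path/to/coco_text/cocotext.v2.json",
    recognition_task=True,
)

# Passing recognition_task=True and detection_task=True together raises a ValueError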
doctr/datasets/cord.py
CHANGED
@@ -116,7 +116,8 @@ class CORD(VisionDataset):
                 img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
             )
             for crop, label in zip(crops, list(text_targets)):
-                self.data.append((crop, label))
+                if " " not in label:
+                    self.data.append((crop, label))
         elif detection_task:
             self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
         else:
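The one-line guard above is part of a pattern repeated across the dataset loaders in this release (FUNSD, IC03, IC13, IIIT5K, IMGUR5K, SVHN, SVT, SynthText below): recognition crops whose label contains a space are dropped. A minimal sketch of the effect, with hypothetical label values:

# Labels produced by the box/text extraction step (hypothetical values)
labels = ["TOTAL", "12 000", "Invoice"]

# The 0.12.0 recognition filter keeps only space-free labels
kept = [label for label in labels if " " not in label]
assert kept == ["TOTAL", "Invoice"]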
doctr/datasets/funsd.py
CHANGED
@@ -107,8 +107,8 @@ class FUNSD(VisionDataset):
             )
             for crop, label in zip(crops, list(text_targets)):
                 # filter labels with unknown characters
-                if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
-                    self.data.append((crop, label))
+                if not any(char in label for char in ["☑", "☐", "\u03bf", "\uf703", "\uf702", " "]):
+                    self.data.append((crop, label.replace("–", "-")))
         elif detection_task:
             self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
         else:
doctr/datasets/ic03.py
CHANGED
@@ -122,7 +122,7 @@ class IC03(VisionDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                 for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label:
                         self.data.append((crop, label))
             elif detection_task:
                 self.data.append((name.text, boxes))
doctr/datasets/ic13.py
CHANGED
@@ -100,7 +100,8 @@ class IC13(AbstractDataset):
         if recognition_task:
             crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
             for crop, label in zip(crops, labels):
-                self.data.append((crop, label))
+                if " " not in label:
+                    self.data.append((crop, label))
         elif detection_task:
             self.data.append((img_path, box_targets))
         else:
doctr/datasets/iiit5k.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Any
 
 import numpy as np
 import scipy.io as sio
+from PIL import Image
 from tqdm import tqdm
 
 from .datasets import VisionDataset
@@ -98,7 +99,9 @@ class IIIT5K(VisionDataset):
             box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
 
             if recognition_task:
-                self.data.append((_raw_path, _raw_label))
+                if " " not in _raw_label:
+                    with Image.open(os.path.join(tmp_root, _raw_path)) as pil_img:
+                        self.data.append((np.array(pil_img.convert("RGB")), _raw_label))
             elif detection_task:
                 self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
doctr/datasets/imgur5k.py
CHANGED
@@ -133,7 +133,13 @@ class IMGUR5K(AbstractDataset):
                 img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
             )
             for crop, label in zip(crops, labels):
-                if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                if (
+                    crop.shape[0] > 0
+                    and crop.shape[1] > 0
+                    and len(label) > 0
+                    and len(label) < 30
+                    and " " not in label
+                ):
                     # write data to disk
                     with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
                         f.write(label)
@@ -152,6 +158,7 @@ class IMGUR5K(AbstractDataset):
         return f"train={self.train}"
 
     def _read_from_folder(self, path: str) -> None:
-        for img_path in glob.glob(os.path.join(path, "*.png")):
+        img_paths = glob.glob(os.path.join(path, "*.png"))
+        for img_path in tqdm(iterable=img_paths, desc="Preparing and Loading IMGUR5K", total=len(img_paths)):
             with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
                 self.data.append((img_path, f.read()))
doctr/datasets/loader.py
CHANGED
doctr/datasets/ocr.py
CHANGED
@@ -40,7 +40,7 @@ class OCRDataset(AbstractDataset):
         super().__init__(img_folder, **kwargs)
 
         # List images
-        self.data: list[tuple[
+        self.data: list[tuple[Path, dict[str, Any]]] = []
         np_dtype = np.float32
         with open(label_file, "rb") as f:
             data = json.load(f)
doctr/datasets/recognition.py
CHANGED
@@ -23,7 +23,7 @@ class RecognitionDataset(AbstractDataset):
 
     Args:
         img_folder: path to the images folder
-        labels_path:
+        labels_path: path to the json file containing all labels (character sequences)
         **kwargs: keyword arguments from `AbstractDataset`.
     """
 
doctr/datasets/svhn.py
CHANGED
@@ -129,7 +129,7 @@ class SVHN(VisionDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_name), geoms=box_targets)
                 for crop, label in zip(crops, label_targets):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label:
                         self.data.append((crop, label))
             elif detection_task:
                 self.data.append((img_name, box_targets))
doctr/datasets/svt.py
CHANGED
@@ -35,7 +35,7 @@ class SVT(VisionDataset):
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
-    URL = "http://
+    URL = "http://www.iapr-tc11.org/dataset/SVT/svt.zip"
     SHA256 = "63b3d55e6b6d1e036e2a844a20c034fe3af3c32e4d914d6e0c4a3cd43df3bebf"
 
     def __init__(
@@ -113,7 +113,7 @@ class SVT(VisionDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                 for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label:
                         self.data.append((crop, label))
             elif detection_task:
                 self.data.append((name.text, boxes))
doctr/datasets/synthtext.py
CHANGED
@@ -41,6 +41,12 @@ class SynthText(VisionDataset):
     URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip"
     SHA256 = "28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1"
 
+    # filter corrupted or missing images
+    BLACKLIST = (
+        "67/fruits_129_",
+        "194/window_19_",
+    )
+
     def __init__(
         self,
         train: bool = True,
@@ -111,7 +117,13 @@ class SynthText(VisionDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path[0]), geoms=word_boxes)
                 for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                    if (
+                        crop.shape[0] > 0
+                        and crop.shape[1] > 0
+                        and len(label) > 0
+                        and len(label) < 30
+                        and " " not in label
+                    ):
                         # write data to disk
                         with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
                             f.write(label)
@@ -132,6 +144,7 @@ class SynthText(VisionDataset):
         return f"train={self.train}"
 
     def _read_from_folder(self, path: str) -> None:
-        for img_path in glob.glob(os.path.join(path, "*.png")):
+        img_paths = glob.glob(os.path.join(path, "*.png"))
+        for img_path in tqdm(iterable=img_paths, desc="Preparing and Loading SynthText", total=len(img_paths)):
             with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
                 self.data.append((img_path, f.read()))
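The diff adds the BLACKLIST tuple but does not show where it is consumed; presumably image paths beginning with these prefixes are skipped during loading. A hedged sketch of that assumption (the img_path value and the skip site are illustrative, not taken from the diff):

BLACKLIST = (
    "67/fruits_129_",
    "194/window_19_",
)

# str.startswith accepts a tuple, so a single check can match any blacklisted prefix
img_path = "67/fruits_129_42.jpg"  # hypothetical SynthText-relative path
if img_path.startswith(BLACKLIST):
    pass  # skip corrupted or missing image (assumed usage)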
doctr/datasets/utils.py
CHANGED
@@ -48,7 +48,7 @@ def translate(
         A string translated in a given vocab
     """
     if VOCABS.get(vocab_name) is None:
-        raise KeyError("output vocabulary must be in vocabs
+        raise KeyError("output vocabulary must be in vocabs dictionary")
 
     translated = ""
     for char in input_string:
@@ -81,11 +81,12 @@ def encode_string(
     """
     try:
        return list(map(vocab.index, input_string))
-    except ValueError:
+    except ValueError as e:
+        missing_chars = [char for char in input_string if char not in vocab]
         raise ValueError(
-            f"
-
-        )
+            f"Some characters cannot be found in 'vocab': {set(missing_chars)}.\n"
+            f"Please check the input string `{input_string}` and the vocabulary `{vocab}`"
+        ) from e
 
 
 def decode_sequence(
@@ -199,7 +200,7 @@ def crop_bboxes_from_image(img_path: str | Path, geoms: np.ndarray) -> list[np.ndarray]:
         a list of cropped images
     """
     with Image.open(img_path) as pil_img:
-        img: np.ndarray = np.
+        img: np.ndarray = np.asarray(pil_img.convert("RGB"))
     # Polygon
     if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
         return extract_rcrops(img, geoms.astype(dtype=int))
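For illustration, a small sketch of how the reworked encode_string error path behaves, based solely on the hunk above (the vocab string is a toy example):

from doctr.datasets.utils import encode_string

vocab = "abc"

print(encode_string("cab", vocab))  # [2, 0, 1] — each character mapped to its index in vocab

try:
    encode_string("cab!", vocab)
except ValueError as e:
    # The 0.12.0 message now lists the offending characters, e.g. {'!'}
    print(e)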