python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/__init__.py +1 -0
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +10 -8
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +9 -8
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +5 -6
- doctr/datasets/ic13.py +6 -6
- doctr/datasets/iiit5k.py +10 -6
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -7
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +4 -5
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +7 -6
- doctr/datasets/svt.py +6 -7
- doctr/datasets/synthtext.py +19 -7
- doctr/datasets/utils.py +41 -35
- doctr/datasets/vocabs.py +1107 -49
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +11 -7
- doctr/io/elements.py +96 -82
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +15 -23
- doctr/models/builder.py +30 -48
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +11 -15
- doctr/models/classification/magc_resnet/tensorflow.py +11 -14
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +20 -18
- doctr/models/classification/mobilenet/tensorflow.py +19 -23
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +7 -9
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +47 -34
- doctr/models/classification/resnet/tensorflow.py +45 -35
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +20 -18
- doctr/models/classification/textnet/tensorflow.py +19 -17
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +21 -8
- doctr/models/classification/vgg/tensorflow.py +20 -14
- doctr/models/classification/vip/__init__.py +4 -0
- doctr/models/classification/vip/layers/__init__.py +4 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +18 -15
- doctr/models/classification/vit/tensorflow.py +15 -12
- doctr/models/classification/zoo.py +23 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +10 -21
- doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
- doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +8 -17
- doctr/models/detection/fast/pytorch.py +37 -35
- doctr/models/detection/fast/tensorflow.py +24 -28
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +8 -18
- doctr/models/detection/linknet/pytorch.py +34 -28
- doctr/models/detection/linknet/tensorflow.py +24 -25
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +6 -10
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +19 -20
- doctr/models/kie_predictor/tensorflow.py +14 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +55 -10
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +13 -14
- doctr/models/predictor/tensorflow.py +9 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +30 -29
- doctr/models/recognition/crnn/tensorflow.py +21 -24
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +32 -25
- doctr/models/recognition/master/tensorflow.py +22 -25
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +47 -29
- doctr/models/recognition/parseq/tensorflow.py +29 -27
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +111 -52
- doctr/models/recognition/predictor/pytorch.py +9 -9
- doctr/models/recognition/predictor/tensorflow.py +8 -9
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +30 -22
- doctr/models/recognition/sar/tensorflow.py +22 -24
- doctr/models/recognition/utils.py +57 -53
- doctr/models/recognition/viptr/__init__.py +4 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +28 -21
- doctr/models/recognition/vitstr/tensorflow.py +22 -23
- doctr/models/recognition/zoo.py +27 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +41 -34
- doctr/models/utils/tensorflow.py +31 -23
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +9 -13
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +17 -48
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
- python_doctr-0.12.0.dist-info/RECORD +180 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/datasets/funsd.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import numpy as np
 from tqdm import tqdm
@@ -29,7 +29,6 @@ class FUNSD(VisionDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
@@ -69,10 +68,12 @@ class FUNSD(VisionDataset):
         # Use the subset
         subfolder = os.path.join("dataset", "training_data" if train else "testing_data")
 
-        # #
+        # # list images
         tmp_root = os.path.join(self.root, subfolder, "images")
-        self.data:
-        for img_path in tqdm(
+        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
+        for img_path in tqdm(
+            iterable=os.listdir(tmp_root), desc="Preparing and Loading FUNSD", total=len(os.listdir(tmp_root))
+        ):
             # File existence check
             if not os.path.exists(os.path.join(tmp_root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
@@ -106,8 +107,8 @@ class FUNSD(VisionDataset):
                 )
                 for crop, label in zip(crops, list(text_targets)):
                     # filter labels with unknown characters
-                    if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
-                        self.data.append((crop, label))
+                    if not any(char in label for char in ["☑", "☐", "\u03bf", "\uf703", "\uf702", " "]):
+                        self.data.append((crop, label.replace("–", "-")))
             elif detection_task:
                 self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
```
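The new crop filter above rejects labels containing characters outside the recognition vocabulary (checkbox glyphs, a Greek omicron, private-use glyphs, spaces) and normalizes en dashes. A standalone sketch of that predicate; the sample labels are invented for illustration:

```python
# Standalone sketch of the filter logic shown in the hunk above; sample labels are invented.
UNSUPPORTED = ["☑", "☐", "\u03bf", "\uf703", "\uf702", " "]  # \u03bf is a Greek small omicron

def keep(label: str) -> bool:
    # Reject any label containing a character the recognition vocab cannot represent
    return not any(char in label for char in UNSUPPORTED)

for label in ["Total–due", "☑ yes", "two words"]:
    if keep(label):
        print(label.replace("–", "-"))  # en dash normalized -> "Total-due"
```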
doctr/datasets/generator/__init__.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 from doctr.file_utils import is_tf_available, is_torch_available
 
-if
-    from .
-elif
-    from .
+if is_torch_available():
+    from .pytorch import *
+elif is_tf_available():
+    from .tensorflow import *  # type: ignore[assignment]
```
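With this dispatch in place, importing the generators resolves to the PyTorch implementation whenever torch is importable, falling back to TensorFlow otherwise. A hedged usage sketch, with constructor arguments taken from the shared `_WordGenerator` base shown below:

```python
# Sketch only: which concrete class you get depends on the installed backend.
from doctr.datasets import WordGenerator

ds = WordGenerator(vocab="abcdef", min_chars=2, max_chars=5, num_samples=10)
img, target = ds[0]  # a backend-native image tensor and the generated word string
print(type(img), target)
```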
doctr/datasets/generator/base.py
CHANGED

```diff
@@ -1,10 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
-from
+from collections.abc import Callable
+from typing import Any
 
 from PIL import Image, ImageDraw
 
@@ -17,14 +18,13 @@ from ..datasets import AbstractDataset
 def synthesize_text_img(
     text: str,
     font_size: int = 32,
-    font_family:
-    background_color:
-    text_color:
+    font_family: str | None = None,
+    background_color: tuple[int, int, int] | None = None,
+    text_color: tuple[int, int, int] | None = None,
 ) -> Image.Image:
     """Generate a synthetic text image
 
     Args:
-    ----
         text: the text to render as an image
         font_size: the size of the font
         font_family: the font family (has to be installed on your system)
@@ -32,7 +32,6 @@ def synthesize_text_img(
         text_color: text color on the final image
 
     Returns:
-    -------
         PIL image of the text
     """
     background_color = (0, 0, 0) if background_color is None else background_color
@@ -61,9 +60,9 @@ class _CharacterGenerator(AbstractDataset):
         vocab: str,
         num_samples: int,
         cache_samples: bool = False,
-        font_family:
-        img_transforms:
-        sample_transforms:
+        font_family: str | list[str] | None = None,
+        img_transforms: Callable[[Any], Any] | None = None,
+        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
     ) -> None:
         self.vocab = vocab
         self._num_samples = num_samples
@@ -78,7 +77,7 @@ class _CharacterGenerator(AbstractDataset):
         self.img_transforms = img_transforms
         self.sample_transforms = sample_transforms
 
-        self._data:
+        self._data: list[Image.Image] = []
         if cache_samples:
             self._data = [
                 (synthesize_text_img(char, font_family=font), idx)  # type: ignore[misc]
@@ -89,7 +88,7 @@ class _CharacterGenerator(AbstractDataset):
     def __len__(self) -> int:
         return self._num_samples
 
-    def _read_sample(self, index: int) ->
+    def _read_sample(self, index: int) -> tuple[Any, int]:
         # Samples are already cached
         if len(self._data) > 0:
             idx = index % len(self._data)
@@ -110,9 +109,9 @@ class _WordGenerator(AbstractDataset):
         max_chars: int,
         num_samples: int,
         cache_samples: bool = False,
-        font_family:
-        img_transforms:
-        sample_transforms:
+        font_family: str | list[str] | None = None,
+        img_transforms: Callable[[Any], Any] | None = None,
+        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
     ) -> None:
         self.vocab = vocab
         self.wordlen_range = (min_chars, max_chars)
@@ -128,7 +127,7 @@ class _WordGenerator(AbstractDataset):
         self.img_transforms = img_transforms
         self.sample_transforms = sample_transforms
 
-        self._data:
+        self._data: list[Image.Image] = []
         if cache_samples:
             _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
             self._data = [
@@ -143,7 +142,7 @@ class _WordGenerator(AbstractDataset):
     def __len__(self) -> int:
         return self._num_samples
 
-    def _read_sample(self, index: int) ->
+    def _read_sample(self, index: int) -> tuple[Any, str]:
         # Samples are already cached
         if len(self._data) > 0:
             pil_img, target = self._data[index]  # type: ignore[misc]
```
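For reference, `synthesize_text_img`, whose signature is being modernized here, renders a string onto a solid background and returns a PIL image. A minimal usage sketch under the defaults visible above (the colors are illustrative):

```python
# Minimal sketch against the signature shown above; the import path mirrors this file's location.
from doctr.datasets.generator.base import synthesize_text_img

img = synthesize_text_img("doctr", font_size=32, text_color=(255, 255, 255))
print(img.size, img.mode)  # a PIL.Image.Image; background defaults to black per the code above
```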
doctr/datasets/generator/pytorch.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         num_samples: number of samples that will be generated iterating over the dataset
         cache_samples: whether generated images should be cached firsthand
@@ -40,7 +39,6 @@ class WordGenerator(_WordGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         min_chars: minimum number of characters in a word
         max_chars: maximum number of characters in a word
```
doctr/datasets/generator/tensorflow.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         num_samples: number of samples that will be generated iterating over the dataset
         cache_samples: whether generated images should be cached firsthand
@@ -46,7 +45,6 @@ class WordGenerator(_WordGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         min_chars: minimum number of characters in a word
         max_chars: maximum number of characters in a word
```
doctr/datasets/ic03.py
CHANGED

```diff
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any
+from typing import Any
 
 import defusedxml.ElementTree as ET
 import numpy as np
@@ -28,7 +28,6 @@ class IC03(VisionDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
@@ -71,7 +70,7 @@ class IC03(VisionDataset):
         )
 
         self.train = train
-        self.data:
+        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
         # Load xml data
@@ -81,7 +80,7 @@ class IC03(VisionDataset):
         xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
         xml_root = xml_tree.getroot()
 
-        for image in tqdm(iterable=xml_root, desc="
+        for image in tqdm(iterable=xml_root, desc="Preparing and Loading IC03", total=len(xml_root)):
             name, _resolution, rectangles = image
 
             # File existence check
@@ -123,7 +122,7 @@ class IC03(VisionDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                 for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
+                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0 and " " not in label:
                         self.data.append((crop, label))
             elif detection_task:
                 self.data.append((name.text, boxes))
```
doctr/datasets/ic13.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import csv
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import numpy as np
 from tqdm import tqdm
@@ -33,7 +33,6 @@ class IC13(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_folder: folder with all annotation files for the images
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
@@ -66,12 +65,12 @@ class IC13(AbstractDataset):
                 f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
             )
 
-        self.data:
+        self.data: list[tuple[Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
         img_names = os.listdir(img_folder)
 
-        for img_name in tqdm(iterable=img_names, desc="
+        for img_name in tqdm(iterable=img_names, desc="Preparing and Loading IC13", total=len(img_names)):
             img_path = Path(img_folder, img_name)
             label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
 
@@ -101,7 +100,8 @@ class IC13(AbstractDataset):
             if recognition_task:
                 crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
                 for crop, label in zip(crops, labels):
-                    self.data.append((crop, label))
+                    if " " not in label:
+                        self.data.append((crop, label))
             elif detection_task:
                 self.data.append((img_path, box_targets))
             else:
```
doctr/datasets/iiit5k.py
CHANGED

```diff
@@ -1,13 +1,14 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any
+from typing import Any
 
 import numpy as np
 import scipy.io as sio
+from PIL import Image
 from tqdm import tqdm
 
 from .datasets import VisionDataset
@@ -30,7 +31,6 @@ class IIIT5K(VisionDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
@@ -70,10 +70,12 @@ class IIIT5K(VisionDataset):
         mat_file = "trainCharBound" if self.train else "testCharBound"
         mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
 
-        self.data:
+        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
-        for img_path, label, box_targets in tqdm(
+        for img_path, label, box_targets in tqdm(
+            iterable=mat_data, desc="Preparing and Loading IIIT5K", total=len(mat_data)
+        ):
             _raw_path = img_path[0]
             _raw_label = label[0]
 
@@ -97,7 +99,9 @@ class IIIT5K(VisionDataset):
             box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
 
             if recognition_task:
-
+                if " " not in _raw_label:
+                    with Image.open(os.path.join(tmp_root, _raw_path)) as pil_img:
+                        self.data.append((np.array(pil_img.convert("RGB")), _raw_label))
             elif detection_task:
                 self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
```
doctr/datasets/iiithws.py
CHANGED

```diff
@@ -1,11 +1,11 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
 from random import sample
-from typing import Any
+from typing import Any
 
 from tqdm import tqdm
 
@@ -32,7 +32,6 @@ class IIITHWS(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the file with the labels
         train: whether the subset should be the training one
@@ -52,7 +51,7 @@ class IIITHWS(AbstractDataset):
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data:
+        self.data: list[tuple[str, str]] = []
         self.train = train
 
         with open(label_path) as f:
@@ -64,7 +63,7 @@ class IIITHWS(AbstractDataset):
         set_slice = slice(train_samples) if self.train else slice(train_samples, None)
 
         for annotation in tqdm(
-            iterable=annotations[set_slice], desc="
+            iterable=annotations[set_slice], desc="Preparing and Loading IIITHWS", total=len(annotations[set_slice])
         ):
             img_path, label = annotation.split()[0:2]
             img_path = os.path.join(img_folder, img_path)
```
doctr/datasets/imgur5k.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -7,7 +7,7 @@ import glob
 import json
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import cv2
 import numpy as np
@@ -40,7 +40,6 @@ class IMGUR5K(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the annotations file of the dataset
         train: whether the subset should be the training one
@@ -73,7 +72,7 @@ class IMGUR5K(AbstractDataset):
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data:
+        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         self.train = train
         np_dtype = np.float32
 
@@ -96,7 +95,9 @@ class IMGUR5K(AbstractDataset):
         with open(label_path) as f:
             annotation_file = json.load(f)
 
-        for img_name in tqdm(
+        for img_name in tqdm(
+            iterable=img_names[set_slice], desc="Preparing and Loading IMGUR5K", total=len(img_names[set_slice])
+        ):
             img_path = Path(img_folder, img_name)
             img_id = img_name.split(".")[0]
 
@@ -132,7 +133,13 @@ class IMGUR5K(AbstractDataset):
                     img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
                 )
                 for crop, label in zip(crops, labels):
-                    if
+                    if (
+                        crop.shape[0] > 0
+                        and crop.shape[1] > 0
+                        and len(label) > 0
+                        and len(label) < 30
+                        and " " not in label
+                    ):
                         # write data to disk
                         with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
                             f.write(label)
@@ -151,6 +158,7 @@ class IMGUR5K(AbstractDataset):
         return f"train={self.train}"
 
     def _read_from_folder(self, path: str) -> None:
-
+        img_paths = glob.glob(os.path.join(path, "*.png"))
+        for img_path in tqdm(iterable=img_paths, desc="Preparing and Loading IMGUR5K", total=len(img_paths)):
             with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
                 self.data.append((img_path, f.read()))
```
doctr/datasets/loader.py
CHANGED

```diff
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-from
+from collections.abc import Callable
 
 import numpy as np
 import tensorflow as tf
@@ -16,12 +16,10 @@ def default_collate(samples):
     """Collate multiple elements into batches
 
     Args:
-    ----
         samples: list of N tuples containing M elements
 
     Returns:
-    -------
-        Tuple of M sequences contianing N elements each
+        tuple of M sequences containing N elements each
     """
     batch_data = zip(*samples)
 
@@ -40,7 +38,6 @@ class DataLoader:
     >>> images, targets = next(train_iter)
 
     Args:
-    ----
         dataset: the dataset
         shuffle: whether the samples should be shuffled before passing it to the iterator
         batch_size: number of elements in each batch
@@ -54,7 +51,7 @@ class DataLoader:
         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        collate_fn:
+        collate_fn: Callable | None = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
```
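The corrected `default_collate` docstring describes a transposition: `zip(*samples)` turns N sample tuples of M elements into M sequences of N elements each. A quick illustration with plain tuples:

```python
# Illustration of the zip(*samples) transposition used by default_collate.
samples = [("img0", "tgt0"), ("img1", "tgt1"), ("img2", "tgt2")]  # N=3 tuples of M=2 elements
images, targets = zip(*samples)  # M=2 sequences of N=3 elements each
print(images)   # ('img0', 'img1', 'img2')
print(targets)  # ('tgt0', 'tgt1', 'tgt2')
```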
doctr/datasets/mjsynth.py
CHANGED

```diff
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any
+from typing import Any
 
 from tqdm import tqdm
 
@@ -30,7 +30,6 @@ class MJSynth(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the file with the labels
         train: whether the subset should be the training one
@@ -86,7 +85,7 @@ class MJSynth(AbstractDataset):
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data:
+        self.data: list[tuple[str, str]] = []
         self.train = train
 
         with open(label_path) as f:
@@ -95,7 +94,9 @@ class MJSynth(AbstractDataset):
         train_samples = int(len(img_paths) * 0.9)
         set_slice = slice(train_samples) if self.train else slice(train_samples, None)
 
-        for path in tqdm(
+        for path in tqdm(
+            iterable=img_paths[set_slice], desc="Preparing and Loading MJSynth", total=len(img_paths[set_slice])
+        ):
             if path not in self.BLACKLIST:
                 label = path.split("_")[1]
                 img_path = os.path.join(img_folder, path[2:]).strip()
```
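Like IIITHWS above, MJSynth carves a 90/10 train/test split out of a single annotation list with a `slice` object, as the context lines show. A standalone sketch of the pattern; the paths are invented:

```python
# Standalone sketch of the slice-based 90/10 split visible in the hunk above; paths are invented.
paths = [f"img_{i}.jpg" for i in range(10)]
train = True

train_samples = int(len(paths) * 0.9)  # 9
set_slice = slice(train_samples) if train else slice(train_samples, None)
print(paths[set_slice])  # first 9 paths for the train split, the remaining 1 for test
```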
doctr/datasets/ocr.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import numpy as np
 
@@ -24,7 +24,6 @@ class OCRDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: local path to image folder (all jpg at the root)
         label_file: local path to the label file
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
@@ -41,7 +40,7 @@ class OCRDataset(AbstractDataset):
         super().__init__(img_folder, **kwargs)
 
         # List images
-        self.data:
+        self.data: list[tuple[Path, dict[str, Any]]] = []
         np_dtype = np.float32
         with open(label_file, "rb") as f:
             data = json.load(f)
```
doctr/datasets/orientation.py
CHANGED

```diff
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any
+from typing import Any
 
 import numpy as np
 
@@ -21,7 +21,6 @@ class OrientationDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         **kwargs: keyword arguments from `AbstractDataset`.
     """
@@ -37,4 +36,4 @@ class OrientationDataset(AbstractDataset):
         )
 
         # initialize dataset with 0 degree rotation targets
-        self.data:
+        self.data: list[tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
```
doctr/datasets/recognition.py
CHANGED

```diff
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from .datasets import AbstractDataset
 
@@ -22,9 +22,8 @@ class RecognitionDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: path to the images folder
-        labels_path:
+        labels_path: path to the json file containing all labels (character sequences)
         **kwargs: keyword arguments from `AbstractDataset`.
     """
 
@@ -36,7 +35,7 @@ class RecognitionDataset(AbstractDataset):
     ) -> None:
         super().__init__(img_folder, **kwargs)
 
-        self.data:
+        self.data: list[tuple[str, str]] = []
        with open(labels_path, encoding="utf-8") as f:
            labels = json.load(f)
 
```