python-doctr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/cord.py +17 -7
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +17 -6
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +14 -5
- doctr/datasets/ic13.py +13 -5
- doctr/datasets/iiit5k.py +31 -20
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -5
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +3 -4
- doctr/datasets/sroie.py +16 -5
- doctr/datasets/svhn.py +16 -5
- doctr/datasets/svt.py +14 -5
- doctr/datasets/synthtext.py +14 -5
- doctr/datasets/utils.py +37 -27
- doctr/datasets/vocabs.py +21 -7
- doctr/datasets/wildreceipt.py +25 -10
- doctr/file_utils.py +18 -4
- doctr/io/elements.py +69 -81
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +14 -22
- doctr/models/builder.py +32 -50
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +10 -13
- doctr/models/classification/magc_resnet/tensorflow.py +21 -17
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +7 -17
- doctr/models/classification/mobilenet/tensorflow.py +22 -29
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +13 -11
- doctr/models/classification/predictor/tensorflow.py +13 -11
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +21 -31
- doctr/models/classification/resnet/tensorflow.py +41 -39
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +10 -17
- doctr/models/classification/textnet/tensorflow.py +19 -20
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +5 -7
- doctr/models/classification/vgg/tensorflow.py +18 -15
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +8 -14
- doctr/models/classification/vit/tensorflow.py +16 -16
- doctr/models/classification/zoo.py +36 -19
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +7 -17
- doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
- doctr/models/detection/differentiable_binarization/tensorflow.py +49 -37
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +6 -14
- doctr/models/detection/fast/pytorch.py +24 -31
- doctr/models/detection/fast/tensorflow.py +28 -37
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +6 -15
- doctr/models/detection/linknet/pytorch.py +24 -27
- doctr/models/detection/linknet/tensorflow.py +36 -33
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +7 -8
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +8 -13
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +8 -5
- doctr/models/kie_predictor/pytorch.py +22 -19
- doctr/models/kie_predictor/tensorflow.py +21 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +6 -9
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -12
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +3 -4
- doctr/models/modules/vision_transformer/tensorflow.py +4 -4
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +52 -41
- doctr/models/predictor/pytorch.py +16 -13
- doctr/models/predictor/tensorflow.py +16 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +11 -15
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +20 -28
- doctr/models/recognition/crnn/tensorflow.py +19 -29
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +22 -24
- doctr/models/recognition/master/tensorflow.py +21 -26
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +26 -26
- doctr/models/recognition/parseq/tensorflow.py +26 -30
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +7 -10
- doctr/models/recognition/predictor/pytorch.py +6 -6
- doctr/models/recognition/predictor/tensorflow.py +5 -6
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +20 -21
- doctr/models/recognition/sar/tensorflow.py +19 -24
- doctr/models/recognition/utils.py +5 -10
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +18 -20
- doctr/models/recognition/vitstr/tensorflow.py +21 -24
- doctr/models/recognition/zoo.py +22 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +13 -16
- doctr/models/utils/tensorflow.py +31 -30
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +21 -29
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +65 -28
- doctr/transforms/modules/tensorflow.py +33 -44
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +8 -12
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +120 -64
- doctr/utils/metrics.py +18 -38
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +157 -75
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +59 -57
- python_doctr-0.11.0.dist-info/RECORD +173 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
- python_doctr-0.9.0.dist-info/RECORD +0 -173
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0
doctr/datasets/generator/base.py
CHANGED
@@ -1,10 +1,11 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import random
-from typing import Any, Callable, List, Optional, Tuple, Union
+from collections.abc import Callable
+from typing import Any
 
 from PIL import Image, ImageDraw
 
@@ -17,14 +18,13 @@ from ..datasets import AbstractDataset
 def synthesize_text_img(
     text: str,
     font_size: int = 32,
-    font_family: Optional[str] = None,
-    background_color: Optional[Tuple[int, int, int]] = None,
-    text_color: Optional[Tuple[int, int, int]] = None,
+    font_family: str | None = None,
+    background_color: tuple[int, int, int] | None = None,
+    text_color: tuple[int, int, int] | None = None,
 ) -> Image.Image:
     """Generate a synthetic text image
 
     Args:
-    ----
         text: the text to render as an image
         font_size: the size of the font
         font_family: the font family (has to be installed on your system)
@@ -32,7 +32,6 @@ def synthesize_text_img(
         text_color: text color on the final image
 
     Returns:
-    -------
         PIL image of the text
     """
     background_color = (0, 0, 0) if background_color is None else background_color
@@ -61,9 +60,9 @@ class _CharacterGenerator(AbstractDataset):
         vocab: str,
         num_samples: int,
         cache_samples: bool = False,
-        font_family: Optional[Union[str, List[str]]] = None,
-        img_transforms: Optional[Callable[[Any], Any]] = None,
-        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
+        font_family: str | list[str] | None = None,
+        img_transforms: Callable[[Any], Any] | None = None,
+        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
     ) -> None:
         self.vocab = vocab
         self._num_samples = num_samples
@@ -78,7 +77,7 @@ class _CharacterGenerator(AbstractDataset):
         self.img_transforms = img_transforms
         self.sample_transforms = sample_transforms
 
-        self._data: List[Image.Image] = []
+        self._data: list[Image.Image] = []
         if cache_samples:
             self._data = [
                 (synthesize_text_img(char, font_family=font), idx)  # type: ignore[misc]
@@ -89,7 +88,7 @@ class _CharacterGenerator(AbstractDataset):
     def __len__(self) -> int:
         return self._num_samples
 
-    def _read_sample(self, index: int) -> Tuple[Any, int]:
+    def _read_sample(self, index: int) -> tuple[Any, int]:
         # Samples are already cached
         if len(self._data) > 0:
             idx = index % len(self._data)
@@ -110,9 +109,9 @@ class _WordGenerator(AbstractDataset):
         max_chars: int,
         num_samples: int,
         cache_samples: bool = False,
-        font_family: Optional[Union[str, List[str]]] = None,
-        img_transforms: Optional[Callable[[Any], Any]] = None,
-        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
+        font_family: str | list[str] | None = None,
+        img_transforms: Callable[[Any], Any] | None = None,
+        sample_transforms: Callable[[Any, Any], tuple[Any, Any]] | None = None,
     ) -> None:
         self.vocab = vocab
         self.wordlen_range = (min_chars, max_chars)
@@ -128,7 +127,7 @@ class _WordGenerator(AbstractDataset):
         self.img_transforms = img_transforms
         self.sample_transforms = sample_transforms
 
-        self._data: List[Image.Image] = []
+        self._data: list[Image.Image] = []
         if cache_samples:
             _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
             self._data = [
@@ -143,7 +142,7 @@ class _WordGenerator(AbstractDataset):
     def __len__(self) -> int:
         return self._num_samples
 
-    def _read_sample(self, index: int) -> Tuple[Any, str]:
+    def _read_sample(self, index: int) -> tuple[Any, str]:
         # Samples are already cached
         if len(self._data) > 0:
             pil_img, target = self._data[index]  # type: ignore[misc]
doctr/datasets/generator/pytorch.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         num_samples: number of samples that will be generated iterating over the dataset
         cache_samples: whether generated images should be cached firsthand
@@ -40,7 +39,6 @@ class WordGenerator(_WordGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         min_chars: minimum number of characters in a word
         max_chars: maximum number of characters in a word
doctr/datasets/generator/tensorflow.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -18,7 +18,6 @@ class CharacterGenerator(_CharacterGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         num_samples: number of samples that will be generated iterating over the dataset
         cache_samples: whether generated images should be cached firsthand
@@ -46,7 +45,6 @@ class WordGenerator(_WordGenerator):
     >>> img, target = ds[0]
 
     Args:
-    ----
         vocab: vocabulary to take the character from
         min_chars: minimum number of characters in a word
         max_chars: maximum number of characters in a word
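The substantive change repeated across all three generator files recurs throughout this release: `Optional`/`Tuple`/`List` annotations from `typing` give way to PEP 604/585 syntax, and `Callable` is now imported from `collections.abc`. A minimal sketch of the equivalence (the function name is illustrative, not from the package; the new syntax needs Python >= 3.10 at runtime unless `from __future__ import annotations` is active):

from collections.abc import Callable
from typing import Any

# Old style (pre-0.11.0, typing module):
#   font_family: Optional[Union[str, List[str]]] = None
#   img_transforms: Optional[Callable[[Any], Any]] = None
#   return type: Tuple[Any, int]
def render(
    font_family: str | list[str] | None = None,
    img_transforms: Callable[[Any], Any] | None = None,
) -> tuple[Any, int]:
    # Placeholder body; only the signature style matters for this illustration.
    return None, 0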
doctr/datasets/ic03.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import defusedxml.ElementTree as ET
 import numpy as np
@@ -28,10 +28,10 @@ class IC03(VisionDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
@@ -51,6 +51,7 @@ class IC03(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         url, sha256, file_name = self.TRAIN if train else self.TEST
@@ -62,8 +63,14 @@ class IC03(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
+
         self.train = train
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
         # Load xml data
@@ -73,7 +80,7 @@ class IC03(VisionDataset):
         xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
         xml_root = xml_tree.getroot()
 
-        for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
+        for image in tqdm(iterable=xml_root, desc="Preparing and Loading IC03", total=len(xml_root)):
             name, _resolution, rectangles = image
 
             # File existence check
@@ -117,6 +124,8 @@ class IC03(VisionDataset):
                 for crop, label in zip(crops, labels):
                     if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                         self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((name.text, boxes))
             else:
                 self.data.append((name.text, dict(boxes=boxes, labels=labels)))
 
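The pattern introduced here recurs in the dataset files below: a new `detection_task` flag that yields bare box arrays, guarded so it cannot be combined with `recognition_task`. A usage sketch based on the docstring and the branches above (target shapes follow doctr's usual conventions and are not verified here):

from doctr.datasets import IC03

# Default mode: the target is a dict with boxes and labels
ds = IC03(train=True)
img, target = ds[0]  # target: {"boxes": np.ndarray, "labels": [...]}

# New in 0.11.0: detection mode returns only the box array
ds_det = IC03(train=True, detection_task=True)
img, boxes = ds_det[0]  # boxes: np.ndarray, (N, 4) or (N, 4, 2) with use_polygons=True

# Requesting both specialized modes now fails fast
try:
    IC03(train=True, recognition_task=True, detection_task=True)
except ValueError as err:
    print(err)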
doctr/datasets/ic13.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import csv
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import numpy as np
 from tqdm import tqdm
@@ -33,11 +33,11 @@ class IC13(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_folder: folder with all annotation files for the images
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `AbstractDataset`.
     """
 
@@ -47,11 +47,17 @@ class IC13(AbstractDataset):
         label_folder: str,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
 
         # File existence check
         if not os.path.exists(label_folder) or not os.path.exists(img_folder):
@@ -59,12 +65,12 @@ class IC13(AbstractDataset):
                 f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
             )
 
-        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+        self.data: list[tuple[Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
         img_names = os.listdir(img_folder)
 
-        for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
+        for img_name in tqdm(iterable=img_names, desc="Preparing and Loading IC13", total=len(img_names)):
             img_path = Path(img_folder, img_name)
             label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
 
@@ -95,5 +101,7 @@ class IC13(AbstractDataset):
                 crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
                 for crop, label in zip(crops, labels):
                     self.data.append((crop, label))
+            elif detection_task:
+                self.data.append((img_path, box_targets))
             else:
                 self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
doctr/datasets/iiit5k.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import numpy as np
 import scipy.io as sio
@@ -30,10 +30,10 @@ class IIIT5K(VisionDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `VisionDataset`.
     """
 
@@ -45,6 +45,7 @@ class IIIT5K(VisionDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
@@ -55,6 +56,12 @@ class IIIT5K(VisionDataset):
             pre_transforms=convert_target_to_relative if not recognition_task else None,
             **kwargs,
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
+
         self.train = train
 
         # Load mat data
@@ -62,10 +69,12 @@ class IIIT5K(VisionDataset):
         mat_file = "trainCharBound" if self.train else "testCharBound"
         mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
 
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+        self.data: list[tuple[str | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         np_dtype = np.float32
 
-        for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
+        for img_path, label, box_targets in tqdm(
+            iterable=mat_data, desc="Preparing and Loading IIIT5K", total=len(mat_data)
+        ):
             _raw_path = img_path[0]
             _raw_label = label[0]
 
@@ -73,24 +82,26 @@ class IIIT5K(VisionDataset):
             if not os.path.exists(os.path.join(tmp_root, _raw_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")
 
+            if use_polygons:
+                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+                box_targets = [
+                    [
+                        [box[0], box[1]],
+                        [box[0] + box[2], box[1]],
+                        [box[0] + box[2], box[1] + box[3]],
+                        [box[0], box[1] + box[3]],
+                    ]
+                    for box in box_targets
+                ]
+            else:
+                # xmin, ymin, xmax, ymax
+                box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
+
             if recognition_task:
                 self.data.append((_raw_path, _raw_label))
+            elif detection_task:
+                self.data.append((_raw_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
-                if use_polygons:
-                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                    box_targets = [
-                        [
-                            [box[0], box[1]],
-                            [box[0] + box[2], box[1]],
-                            [box[0] + box[2], box[1] + box[3]],
-                            [box[0], box[1] + box[3]],
-                        ]
-                        for box in box_targets
-                    ]
-                else:
-                    # xmin, ymin, xmax, ymax
-                    box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
-
                 # label are casted to list where each char corresponds to the character's bounding box
                 self.data.append((
                     _raw_path,
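Functionally, hoisting this block out of the `else` branch means IIIT5K's raw `(x, y, w, h)` character boxes are converted before the mode check, so the new detection branch receives the same geometry as the default one. The conversion arithmetic, worked on a single illustrative box:

box = [10, 20, 30, 40]  # x=10, y=20, width=30, height=40, as stored in the .mat file

# use_polygons=False: (xmin, ymin, xmax, ymax)
flat = [box[0], box[1], box[0] + box[2], box[1] + box[3]]
assert flat == [10, 20, 40, 60]

# use_polygons=True: four (x, y) corners, clockwise from top-left
poly = [
    [box[0], box[1]],                    # top left
    [box[0] + box[2], box[1]],           # top right
    [box[0] + box[2], box[1] + box[3]],  # bottom right
    [box[0], box[1] + box[3]],           # bottom left
]
assert poly == [[10, 20], [40, 20], [40, 60], [10, 60]]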
doctr/datasets/iiithws.py
CHANGED
@@ -1,11 +1,11 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
 from random import sample
-from typing import Any, List, Tuple
+from typing import Any
 
 from tqdm import tqdm
 
@@ -32,7 +32,6 @@ class IIITHWS(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the file with the labels
         train: whether the subset should be the training one
@@ -52,7 +51,7 @@ class IIITHWS(AbstractDataset):
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data: List[Tuple[str, str]] = []
+        self.data: list[tuple[str, str]] = []
         self.train = train
 
         with open(label_path) as f:
@@ -64,7 +63,7 @@ class IIITHWS(AbstractDataset):
         set_slice = slice(train_samples) if self.train else slice(train_samples, None)
 
         for annotation in tqdm(
-            iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
+            iterable=annotations[set_slice], desc="Preparing and Loading IIITHWS", total=len(annotations[set_slice])
         ):
             img_path, label = annotation.split()[0:2]
             img_path = os.path.join(img_folder, img_path)
doctr/datasets/imgur5k.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -7,7 +7,7 @@ import glob
 import json
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import cv2
 import numpy as np
@@ -40,12 +40,12 @@ class IMGUR5K(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the annotations file of the dataset
         train: whether the subset should be the training one
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
         recognition_task: whether the dataset should be used for recognition task
+        detection_task: whether the dataset should be used for detection task
         **kwargs: keyword arguments from `AbstractDataset`.
     """
 
@@ -56,17 +56,23 @@ class IMGUR5K(AbstractDataset):
         train: bool = True,
         use_polygons: bool = False,
         recognition_task: bool = False,
+        detection_task: bool = False,
         **kwargs: Any,
     ) -> None:
         super().__init__(
             img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
         )
+        if recognition_task and detection_task:
+            raise ValueError(
+                "`recognition_task` and `detection_task` cannot be set to True simultaneously. "
+                + "To get the whole dataset with boxes and labels leave both parameters to False."
+            )
 
         # File existence check
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any], np.ndarray]]] = []
+        self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
         self.train = train
         np_dtype = np.float32
 
@@ -89,7 +95,9 @@ class IMGUR5K(AbstractDataset):
         with open(label_path) as f:
             annotation_file = json.load(f)
 
-        for img_name in tqdm(iterable=img_names[set_slice], desc="Unpacking IMGUR5K", total=len(img_names[set_slice])):
+        for img_name in tqdm(
+            iterable=img_names[set_slice], desc="Preparing and Loading IMGUR5K", total=len(img_names[set_slice])
+        ):
             img_path = Path(img_folder, img_name)
             img_id = img_name.split(".")[0]
 
@@ -132,6 +140,8 @@ class IMGUR5K(AbstractDataset):
                     tmp_img = Image.fromarray(crop)
                     tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
                     reco_images_counter += 1
+            elif detection_task:
+                self.data.append((img_path, np.asarray(box_targets, dtype=np_dtype)))
             else:
                 self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
 
doctr/datasets/loader.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import math
-from typing import Callable, Optional
+from collections.abc import Callable
 
 import numpy as np
 import tensorflow as tf
@@ -16,12 +16,10 @@ def default_collate(samples):
     """Collate multiple elements into batches
 
     Args:
-    ----
         samples: list of N tuples containing M elements
 
     Returns:
-    -------
-        Tuple of M sequences contianing N elements each
+        tuple of M sequences contianing N elements each
     """
     batch_data = zip(*samples)
 
@@ -40,7 +38,6 @@ class DataLoader:
     >>> images, targets = next(train_iter)
 
     Args:
-    ----
         dataset: the dataset
         shuffle: whether the samples should be shuffled before passing it to the iterator
         batch_size: number of elements in each batch
@@ -54,7 +51,7 @@ class DataLoader:
         shuffle: bool = True,
         batch_size: int = 1,
         drop_last: bool = False,
-        collate_fn: Optional[Callable] = None,
+        collate_fn: Callable | None = None,
     ) -> None:
         self.dataset = dataset
         self.shuffle = shuffle
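`default_collate` is untouched apart from its docstring, but the `zip(*samples)` transpose it is built on is easy to misread. A self-contained illustration of the N-tuples-to-M-sequences flip (plain Python; the real function goes on to stack each sequence with TensorFlow):

# Three samples (N=3), each an (image, target) pair (M=2)
samples = [("img0", 0), ("img1", 1), ("img2", 2)]

images, targets = zip(*samples)  # M sequences of N elements each
assert images == ("img0", "img1", "img2")
assert targets == (0, 1, 2)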
doctr/datasets/mjsynth.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any, List, Tuple
+from typing import Any
 
 from tqdm import tqdm
 
@@ -30,7 +30,6 @@ class MJSynth(AbstractDataset):
     >>> img, target = test_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         label_path: path to the file with the labels
         train: whether the subset should be the training one
@@ -86,7 +85,7 @@ class MJSynth(AbstractDataset):
         if not os.path.exists(label_path) or not os.path.exists(img_folder):
             raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
 
-        self.data: List[Tuple[str, str]] = []
+        self.data: list[tuple[str, str]] = []
         self.train = train
 
         with open(label_path) as f:
@@ -95,7 +94,9 @@ class MJSynth(AbstractDataset):
         train_samples = int(len(img_paths) * 0.9)
         set_slice = slice(train_samples) if self.train else slice(train_samples, None)
 
-        for path in tqdm(iterable=img_paths[set_slice], desc="Unpacking MJSynth", total=len(img_paths[set_slice])):
+        for path in tqdm(
+            iterable=img_paths[set_slice], desc="Preparing and Loading MJSynth", total=len(img_paths[set_slice])
+        ):
             if path not in self.BLACKLIST:
                 label = path.split("_")[1]
                 img_path = os.path.join(img_folder, path[2:]).strip()
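MJSynth, like IIITHWS and IMGUR5K above, carves its train and test subsets out of one annotation list with a 90/10 slice rather than separate label files. The slice arithmetic, isolated (the paths are illustrative):

img_paths = [f"./word_{i}.jpg" for i in range(10)]

train_samples = int(len(img_paths) * 0.9)  # 9
set_slice = slice(train_samples)           # train: first 90%
test_slice = slice(train_samples, None)    # test: remaining 10%

assert img_paths[set_slice] == img_paths[:9]
assert img_paths[test_slice] == img_paths[9:]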
doctr/datasets/ocr.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Tuple
+from typing import Any
 
 import numpy as np
 
@@ -24,7 +24,6 @@ class OCRDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: local path to image folder (all jpg at the root)
         label_file: local path to the label file
         use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
@@ -41,7 +40,7 @@ class OCRDataset(AbstractDataset):
         super().__init__(img_folder, **kwargs)
 
         # List images
-        self.data: List[Tuple[str, Dict[str, Any]]] = []
+        self.data: list[tuple[str, dict[str, Any]]] = []
         np_dtype = np.float32
         with open(label_file, "rb") as f:
             data = json.load(f)
doctr/datasets/orientation.py
CHANGED
@@ -1,10 +1,10 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 import os
-from typing import Any, List, Tuple
+from typing import Any
 
 import numpy as np
 
@@ -21,7 +21,6 @@ class OrientationDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: folder with all the images of the dataset
         **kwargs: keyword arguments from `AbstractDataset`.
     """
@@ -37,4 +36,4 @@ class OrientationDataset(AbstractDataset):
         )
 
         # initialize dataset with 0 degree rotation targets
-        self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
+        self.data: list[tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
doctr/datasets/recognition.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2024, Mindee.
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -6,7 +6,7 @@
 import json
 import os
 from pathlib import Path
-from typing import Any, List, Tuple
+from typing import Any
 
 from .datasets import AbstractDataset
 
@@ -22,7 +22,6 @@ class RecognitionDataset(AbstractDataset):
     >>> img, target = train_set[0]
 
     Args:
-    ----
         img_folder: path to the images folder
         labels_path: pathe to the json file containing all labels (character sequences)
         **kwargs: keyword arguments from `AbstractDataset`.
@@ -36,7 +35,7 @@ class RecognitionDataset(AbstractDataset):
     ) -> None:
         super().__init__(img_folder, **kwargs)
 
-        self.data: List[Tuple[str, str]] = []
+        self.data: list[tuple[str, str]] = []
        with open(labels_path, encoding="utf-8") as f:
            labels = json.load(f)