python-doctr 0.7.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/datasets/__init__.py +2 -0
- doctr/datasets/cord.py +6 -4
- doctr/datasets/datasets/base.py +3 -2
- doctr/datasets/datasets/pytorch.py +4 -2
- doctr/datasets/datasets/tensorflow.py +4 -2
- doctr/datasets/detection.py +6 -3
- doctr/datasets/doc_artefacts.py +2 -1
- doctr/datasets/funsd.py +7 -8
- doctr/datasets/generator/base.py +3 -2
- doctr/datasets/generator/pytorch.py +3 -1
- doctr/datasets/generator/tensorflow.py +3 -1
- doctr/datasets/ic03.py +3 -2
- doctr/datasets/ic13.py +2 -1
- doctr/datasets/iiit5k.py +6 -4
- doctr/datasets/iiithws.py +2 -1
- doctr/datasets/imgur5k.py +3 -2
- doctr/datasets/loader.py +4 -2
- doctr/datasets/mjsynth.py +2 -1
- doctr/datasets/ocr.py +2 -1
- doctr/datasets/orientation.py +40 -0
- doctr/datasets/recognition.py +3 -2
- doctr/datasets/sroie.py +2 -1
- doctr/datasets/svhn.py +2 -1
- doctr/datasets/svt.py +3 -2
- doctr/datasets/synthtext.py +2 -1
- doctr/datasets/utils.py +27 -11
- doctr/datasets/vocabs.py +26 -1
- doctr/datasets/wildreceipt.py +111 -0
- doctr/file_utils.py +3 -1
- doctr/io/elements.py +52 -35
- doctr/io/html.py +5 -3
- doctr/io/image/base.py +5 -4
- doctr/io/image/pytorch.py +12 -7
- doctr/io/image/tensorflow.py +11 -6
- doctr/io/pdf.py +5 -4
- doctr/io/reader.py +13 -5
- doctr/models/_utils.py +30 -53
- doctr/models/artefacts/barcode.py +4 -3
- doctr/models/artefacts/face.py +4 -2
- doctr/models/builder.py +58 -43
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/pytorch.py +5 -2
- doctr/models/classification/magc_resnet/tensorflow.py +5 -2
- doctr/models/classification/mobilenet/pytorch.py +16 -4
- doctr/models/classification/mobilenet/tensorflow.py +29 -20
- doctr/models/classification/predictor/pytorch.py +3 -2
- doctr/models/classification/predictor/tensorflow.py +2 -1
- doctr/models/classification/resnet/pytorch.py +23 -13
- doctr/models/classification/resnet/tensorflow.py +33 -26
- doctr/models/classification/textnet/__init__.py +6 -0
- doctr/models/classification/textnet/pytorch.py +275 -0
- doctr/models/classification/textnet/tensorflow.py +267 -0
- doctr/models/classification/vgg/pytorch.py +4 -2
- doctr/models/classification/vgg/tensorflow.py +5 -2
- doctr/models/classification/vit/pytorch.py +9 -3
- doctr/models/classification/vit/tensorflow.py +9 -3
- doctr/models/classification/zoo.py +7 -2
- doctr/models/core.py +1 -1
- doctr/models/detection/__init__.py +1 -0
- doctr/models/detection/_utils/pytorch.py +7 -1
- doctr/models/detection/_utils/tensorflow.py +7 -3
- doctr/models/detection/core.py +9 -3
- doctr/models/detection/differentiable_binarization/base.py +37 -25
- doctr/models/detection/differentiable_binarization/pytorch.py +80 -104
- doctr/models/detection/differentiable_binarization/tensorflow.py +74 -55
- doctr/models/detection/fast/__init__.py +6 -0
- doctr/models/detection/fast/base.py +256 -0
- doctr/models/detection/fast/pytorch.py +442 -0
- doctr/models/detection/fast/tensorflow.py +428 -0
- doctr/models/detection/linknet/base.py +12 -5
- doctr/models/detection/linknet/pytorch.py +28 -15
- doctr/models/detection/linknet/tensorflow.py +68 -88
- doctr/models/detection/predictor/pytorch.py +16 -6
- doctr/models/detection/predictor/tensorflow.py +13 -5
- doctr/models/detection/zoo.py +19 -16
- doctr/models/factory/hub.py +20 -10
- doctr/models/kie_predictor/base.py +2 -1
- doctr/models/kie_predictor/pytorch.py +28 -36
- doctr/models/kie_predictor/tensorflow.py +27 -27
- doctr/models/modules/__init__.py +1 -0
- doctr/models/modules/layers/__init__.py +6 -0
- doctr/models/modules/layers/pytorch.py +166 -0
- doctr/models/modules/layers/tensorflow.py +175 -0
- doctr/models/modules/transformer/pytorch.py +24 -22
- doctr/models/modules/transformer/tensorflow.py +6 -4
- doctr/models/modules/vision_transformer/pytorch.py +2 -4
- doctr/models/modules/vision_transformer/tensorflow.py +2 -4
- doctr/models/obj_detection/faster_rcnn/pytorch.py +4 -2
- doctr/models/predictor/base.py +14 -3
- doctr/models/predictor/pytorch.py +26 -29
- doctr/models/predictor/tensorflow.py +25 -22
- doctr/models/preprocessor/pytorch.py +14 -9
- doctr/models/preprocessor/tensorflow.py +10 -5
- doctr/models/recognition/core.py +4 -1
- doctr/models/recognition/crnn/pytorch.py +23 -16
- doctr/models/recognition/crnn/tensorflow.py +25 -17
- doctr/models/recognition/master/base.py +4 -1
- doctr/models/recognition/master/pytorch.py +20 -9
- doctr/models/recognition/master/tensorflow.py +20 -8
- doctr/models/recognition/parseq/base.py +4 -1
- doctr/models/recognition/parseq/pytorch.py +28 -22
- doctr/models/recognition/parseq/tensorflow.py +22 -11
- doctr/models/recognition/predictor/_utils.py +3 -2
- doctr/models/recognition/predictor/pytorch.py +3 -2
- doctr/models/recognition/predictor/tensorflow.py +2 -1
- doctr/models/recognition/sar/pytorch.py +14 -7
- doctr/models/recognition/sar/tensorflow.py +23 -14
- doctr/models/recognition/utils.py +5 -1
- doctr/models/recognition/vitstr/base.py +4 -1
- doctr/models/recognition/vitstr/pytorch.py +22 -13
- doctr/models/recognition/vitstr/tensorflow.py +21 -10
- doctr/models/recognition/zoo.py +4 -2
- doctr/models/utils/pytorch.py +24 -6
- doctr/models/utils/tensorflow.py +22 -3
- doctr/models/zoo.py +21 -3
- doctr/transforms/functional/base.py +8 -3
- doctr/transforms/functional/pytorch.py +23 -6
- doctr/transforms/functional/tensorflow.py +25 -5
- doctr/transforms/modules/base.py +12 -5
- doctr/transforms/modules/pytorch.py +10 -12
- doctr/transforms/modules/tensorflow.py +17 -9
- doctr/utils/common_types.py +1 -1
- doctr/utils/data.py +4 -2
- doctr/utils/fonts.py +3 -2
- doctr/utils/geometry.py +95 -26
- doctr/utils/metrics.py +36 -22
- doctr/utils/multithreading.py +5 -3
- doctr/utils/repr.py +3 -1
- doctr/utils/visualization.py +31 -8
- doctr/version.py +1 -1
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/METADATA +67 -31
- python_doctr-0.8.1.dist-info/RECORD +173 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/WHEEL +1 -1
- python_doctr-0.7.0.dist-info/RECORD +0 -161
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/LICENSE +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/top_level.txt +0 -0
- {python_doctr-0.7.0.dist-info → python_doctr-0.8.1.dist-info}/zip-safe +0 -0
doctr/datasets/utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -8,9 +8,8 @@ import unicodedata
|
|
|
8
8
|
from collections.abc import Sequence
|
|
9
9
|
from functools import partial
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Any, Dict, List, Optional
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
|
|
12
12
|
from typing import Sequence as SequenceType
|
|
13
|
-
from typing import Tuple, TypeVar, Union
|
|
14
13
|
|
|
15
14
|
import numpy as np
|
|
16
15
|
from PIL import Image
|
|
@@ -33,13 +32,15 @@ def translate(
|
|
|
33
32
|
"""Translate a string input in a given vocabulary
|
|
34
33
|
|
|
35
34
|
Args:
|
|
35
|
+
----
|
|
36
36
|
input_string: input string to translate
|
|
37
37
|
vocab_name: vocabulary to use (french, latin, ...)
|
|
38
38
|
unknown_char: unknown character for non-translatable characters
|
|
39
39
|
|
|
40
40
|
Returns:
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
-------
|
|
42
|
+
A string translated in a given vocab
|
|
43
|
+
"""
|
|
43
44
|
if VOCABS.get(vocab_name) is None:
|
|
44
45
|
raise KeyError("output vocabulary must be in vocabs dictionnary")
|
|
45
46
|
|
|
@@ -66,16 +67,21 @@ def encode_string(
|
|
|
66
67
|
"""Given a predefined mapping, encode the string to a sequence of numbers
|
|
67
68
|
|
|
68
69
|
Args:
|
|
70
|
+
----
|
|
69
71
|
input_string: string to encode
|
|
70
72
|
vocab: vocabulary (string), the encoding is given by the indexing of the character sequence
|
|
71
73
|
|
|
72
74
|
Returns:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
+
-------
|
|
76
|
+
A list encoding the input_string
|
|
77
|
+
"""
|
|
75
78
|
try:
|
|
76
79
|
return list(map(vocab.index, input_string))
|
|
77
80
|
except ValueError:
|
|
78
|
-
raise ValueError(
|
|
81
|
+
raise ValueError(
|
|
82
|
+
f"some characters cannot be found in 'vocab'. \
|
|
83
|
+
Please check the input string {input_string} and the vocabulary {vocab}"
|
|
84
|
+
)
|
|
79
85
|
|
|
80
86
|
|
|
81
87
|
def decode_sequence(
|
|
@@ -85,13 +91,14 @@ def decode_sequence(
|
|
|
85
91
|
"""Given a predefined mapping, decode the sequence of numbers to a string
|
|
86
92
|
|
|
87
93
|
Args:
|
|
94
|
+
----
|
|
88
95
|
input_seq: array to decode
|
|
89
96
|
mapping: vocabulary (string), the encoding is given by the indexing of the character sequence
|
|
90
97
|
|
|
91
98
|
Returns:
|
|
99
|
+
-------
|
|
92
100
|
A string, decoded from input_seq
|
|
93
101
|
"""
|
|
94
|
-
|
|
95
102
|
if not isinstance(input_seq, (Sequence, np.ndarray)):
|
|
96
103
|
raise TypeError("Invalid sequence type")
|
|
97
104
|
if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
|
|
@@ -108,11 +115,11 @@ def encode_sequences(
|
|
|
108
115
|
sos: Optional[int] = None,
|
|
109
116
|
pad: Optional[int] = None,
|
|
110
117
|
dynamic_seq_length: bool = False,
|
|
111
|
-
**kwargs: Any,
|
|
112
118
|
) -> np.ndarray:
|
|
113
119
|
"""Encode character sequences using a given vocab as mapping
|
|
114
120
|
|
|
115
121
|
Args:
|
|
122
|
+
----
|
|
116
123
|
sequences: the list of character sequences of size N
|
|
117
124
|
vocab: the ordered vocab to use for encoding
|
|
118
125
|
target_size: maximum length of the encoded data
|
|
@@ -122,9 +129,9 @@ def encode_sequences(
|
|
|
122
129
|
dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size
|
|
123
130
|
|
|
124
131
|
Returns:
|
|
132
|
+
-------
|
|
125
133
|
the padded encoded data as a tensor
|
|
126
134
|
"""
|
|
127
|
-
|
|
128
135
|
if 0 <= eos < len(vocab):
|
|
129
136
|
raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
|
|
130
137
|
|
|
@@ -169,10 +176,14 @@ def convert_target_to_relative(img: ImageTensor, target: Dict[str, Any]) -> Tupl
|
|
|
169
176
|
|
|
170
177
|
def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]:
|
|
171
178
|
"""Crop a set of bounding boxes from an image
|
|
179
|
+
|
|
172
180
|
Args:
|
|
181
|
+
----
|
|
173
182
|
img_path: path to the image
|
|
174
183
|
geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)
|
|
184
|
+
|
|
175
185
|
Returns:
|
|
186
|
+
-------
|
|
176
187
|
a list of cropped images
|
|
177
188
|
"""
|
|
178
189
|
img: np.ndarray = np.array(Image.open(img_path).convert("RGB"))
|
|
@@ -188,8 +199,13 @@ def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.n
|
|
|
188
199
|
"""Converts multiclass target to relative coordinates.
|
|
189
200
|
|
|
190
201
|
Args:
|
|
202
|
+
----
|
|
191
203
|
img: Image
|
|
192
204
|
target: tuple of target polygons and their classes names
|
|
205
|
+
|
|
206
|
+
Returns:
|
|
207
|
+
-------
|
|
208
|
+
Image and dictionary of boxes, with class names as keys
|
|
193
209
|
"""
|
|
194
210
|
boxes = convert_to_relative_coords(target[0], get_img_shape(img))
|
|
195
211
|
boxes_classes = target[1]
|
doctr/datasets/vocabs.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -28,6 +28,7 @@ VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂ
|
|
|
28
28
|
VOCABS["french"] = VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ"
|
|
29
29
|
VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ"
|
|
30
30
|
VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿"
|
|
31
|
+
VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ"
|
|
31
32
|
VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ"
|
|
32
33
|
VOCABS["arabic"] = (
|
|
33
34
|
VOCABS["digits"]
|
|
@@ -39,8 +40,32 @@ VOCABS["arabic"] = (
|
|
|
39
40
|
+ VOCABS["punctuation"]
|
|
40
41
|
)
|
|
41
42
|
VOCABS["czech"] = VOCABS["english"] + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"
|
|
43
|
+
VOCABS["polish"] = VOCABS["english"] + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"
|
|
44
|
+
VOCABS["dutch"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ"
|
|
45
|
+
VOCABS["norwegian"] = VOCABS["english"] + "æøåÆØÅ"
|
|
46
|
+
VOCABS["danish"] = VOCABS["english"] + "æøåÆØÅ"
|
|
47
|
+
VOCABS["finnish"] = VOCABS["english"] + "äöÄÖ"
|
|
48
|
+
VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ"
|
|
42
49
|
VOCABS["vietnamese"] = (
|
|
43
50
|
VOCABS["english"]
|
|
44
51
|
+ "áàảạãăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ"
|
|
45
52
|
+ "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
|
|
46
53
|
)
|
|
54
|
+
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
|
|
55
|
+
VOCABS["multilingual"] = "".join(
|
|
56
|
+
dict.fromkeys(
|
|
57
|
+
VOCABS["french"]
|
|
58
|
+
+ VOCABS["portuguese"]
|
|
59
|
+
+ VOCABS["spanish"]
|
|
60
|
+
+ VOCABS["german"]
|
|
61
|
+
+ VOCABS["czech"]
|
|
62
|
+
+ VOCABS["polish"]
|
|
63
|
+
+ VOCABS["dutch"]
|
|
64
|
+
+ VOCABS["italian"]
|
|
65
|
+
+ VOCABS["norwegian"]
|
|
66
|
+
+ VOCABS["danish"]
|
|
67
|
+
+ VOCABS["finnish"]
|
|
68
|
+
+ VOCABS["swedish"]
|
|
69
|
+
+ "§"
|
|
70
|
+
)
|
|
71
|
+
)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Dict, List, Tuple, Union
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from .datasets import AbstractDataset
|
|
14
|
+
from .utils import convert_target_to_relative, crop_bboxes_from_image
|
|
15
|
+
|
|
16
|
+
__all__ = ["WILDRECEIPT"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WILDRECEIPT(AbstractDataset):
|
|
20
|
+
"""WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
|
|
21
|
+
<https://arxiv.org/abs/2103.14470v1>`_ |
|
|
22
|
+
`repository <https://download.openmmlab.com/mmocr/data/wildreceipt.tar>`_.
|
|
23
|
+
|
|
24
|
+
.. image:: https://doctr-static.mindee.com/models?id=v0.7.0/wildreceipt-dataset.jpg&src=0
|
|
25
|
+
:align: center
|
|
26
|
+
|
|
27
|
+
>>> # NOTE: You need to download the dataset first.
|
|
28
|
+
>>> from doctr.datasets import WILDRECEIPT
|
|
29
|
+
>>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/",
|
|
30
|
+
>>> label_path="/path/to/wildreceipt/train.txt")
|
|
31
|
+
>>> img, target = train_set[0]
|
|
32
|
+
>>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/",
|
|
33
|
+
>>> label_path="/path/to/wildreceipt/test.txt")
|
|
34
|
+
>>> img, target = test_set[0]
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
----
|
|
38
|
+
img_folder: folder with all the images of the dataset
|
|
39
|
+
label_path: path to the annotations file of the dataset
|
|
40
|
+
train: whether the subset should be the training one
|
|
41
|
+
use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
|
|
42
|
+
recognition_task: whether the dataset should be used for recognition task
|
|
43
|
+
**kwargs: keyword arguments from `AbstractDataset`.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
def __init__(
|
|
47
|
+
self,
|
|
48
|
+
img_folder: str,
|
|
49
|
+
label_path: str,
|
|
50
|
+
train: bool = True,
|
|
51
|
+
use_polygons: bool = False,
|
|
52
|
+
recognition_task: bool = False,
|
|
53
|
+
**kwargs: Any,
|
|
54
|
+
) -> None:
|
|
55
|
+
super().__init__(
|
|
56
|
+
img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
|
|
57
|
+
)
|
|
58
|
+
# File existence check
|
|
59
|
+
if not os.path.exists(label_path) or not os.path.exists(img_folder):
|
|
60
|
+
raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
|
|
61
|
+
|
|
62
|
+
tmp_root = img_folder
|
|
63
|
+
self.train = train
|
|
64
|
+
np_dtype = np.float32
|
|
65
|
+
self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
|
|
66
|
+
|
|
67
|
+
with open(label_path, "r") as file:
|
|
68
|
+
data = file.read()
|
|
69
|
+
# Split the text file into separate JSON strings
|
|
70
|
+
json_strings = data.strip().split("\n")
|
|
71
|
+
box: Union[List[float], np.ndarray]
|
|
72
|
+
_targets = []
|
|
73
|
+
for json_string in json_strings:
|
|
74
|
+
json_data = json.loads(json_string)
|
|
75
|
+
img_path = json_data["file_name"]
|
|
76
|
+
annotations = json_data["annotations"]
|
|
77
|
+
for annotation in annotations:
|
|
78
|
+
coordinates = annotation["box"]
|
|
79
|
+
if use_polygons:
|
|
80
|
+
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
|
|
81
|
+
box = np.array(
|
|
82
|
+
[
|
|
83
|
+
[coordinates[0], coordinates[1]],
|
|
84
|
+
[coordinates[2], coordinates[3]],
|
|
85
|
+
[coordinates[4], coordinates[5]],
|
|
86
|
+
[coordinates[6], coordinates[7]],
|
|
87
|
+
],
|
|
88
|
+
dtype=np_dtype,
|
|
89
|
+
)
|
|
90
|
+
else:
|
|
91
|
+
x, y = coordinates[::2], coordinates[1::2]
|
|
92
|
+
box = [min(x), min(y), max(x), max(y)]
|
|
93
|
+
_targets.append((annotation["text"], box))
|
|
94
|
+
text_targets, box_targets = zip(*_targets)
|
|
95
|
+
|
|
96
|
+
if recognition_task:
|
|
97
|
+
crops = crop_bboxes_from_image(
|
|
98
|
+
img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
|
|
99
|
+
)
|
|
100
|
+
for crop, label in zip(crops, list(text_targets)):
|
|
101
|
+
if label and " " not in label:
|
|
102
|
+
self.data.append((crop, label))
|
|
103
|
+
else:
|
|
104
|
+
self.data.append((
|
|
105
|
+
img_path,
|
|
106
|
+
dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
|
|
107
|
+
))
|
|
108
|
+
self.root = tmp_root
|
|
109
|
+
|
|
110
|
+
def extra_repr(self) -> str:
|
|
111
|
+
return f"train={self.train}"
|
doctr/file_utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -83,8 +83,10 @@ if not _torch_available and not _tf_available: # pragma: no cover
|
|
|
83
83
|
|
|
84
84
|
|
|
85
85
|
def is_torch_available():
|
|
86
|
+
"""Whether PyTorch is installed."""
|
|
86
87
|
return _torch_available
|
|
87
88
|
|
|
88
89
|
|
|
89
90
|
def is_tf_available():
|
|
91
|
+
"""Whether TensorFlow is installed."""
|
|
90
92
|
return _tf_available
|
doctr/io/elements.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -39,7 +39,6 @@ class Element(NestedObject):
|
|
|
39
39
|
|
|
40
40
|
def export(self) -> Dict[str, Any]:
|
|
41
41
|
"""Exports the object into a nested dict format"""
|
|
42
|
-
|
|
43
42
|
export_dict = {k: getattr(self, k) for k in self._exported_keys}
|
|
44
43
|
for children_name in self._children_names:
|
|
45
44
|
if children_name in ["predictions"]:
|
|
@@ -63,6 +62,7 @@ class Word(Element):
|
|
|
63
62
|
"""Implements a word element
|
|
64
63
|
|
|
65
64
|
Args:
|
|
65
|
+
----
|
|
66
66
|
value: the text string of the word
|
|
67
67
|
confidence: the confidence associated with the text prediction
|
|
68
68
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -95,6 +95,7 @@ class Artefact(Element):
|
|
|
95
95
|
"""Implements a non-textual element
|
|
96
96
|
|
|
97
97
|
Args:
|
|
98
|
+
----
|
|
98
99
|
artefact_type: the type of artefact
|
|
99
100
|
confidence: the confidence of the type prediction
|
|
100
101
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -127,6 +128,7 @@ class Line(Element):
|
|
|
127
128
|
"""Implements a line element as a collection of words
|
|
128
129
|
|
|
129
130
|
Args:
|
|
131
|
+
----
|
|
130
132
|
words: list of word elements
|
|
131
133
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
132
134
|
the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
|
|
@@ -158,11 +160,9 @@ class Line(Element):
|
|
|
158
160
|
@classmethod
|
|
159
161
|
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
|
|
160
162
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
161
|
-
kwargs.update(
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
}
|
|
165
|
-
)
|
|
163
|
+
kwargs.update({
|
|
164
|
+
"words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
|
|
165
|
+
})
|
|
166
166
|
return cls(**kwargs)
|
|
167
167
|
|
|
168
168
|
|
|
@@ -181,6 +181,7 @@ class Block(Element):
|
|
|
181
181
|
"""Implements a block element as a collection of lines and artefacts
|
|
182
182
|
|
|
183
183
|
Args:
|
|
184
|
+
----
|
|
184
185
|
lines: list of line elements
|
|
185
186
|
artefacts: list of artefacts
|
|
186
187
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -218,12 +219,10 @@ class Block(Element):
|
|
|
218
219
|
@classmethod
|
|
219
220
|
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
|
|
220
221
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
221
|
-
kwargs.update(
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
}
|
|
226
|
-
)
|
|
222
|
+
kwargs.update({
|
|
223
|
+
"lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
|
|
224
|
+
"artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
|
|
225
|
+
})
|
|
227
226
|
return cls(**kwargs)
|
|
228
227
|
|
|
229
228
|
|
|
@@ -231,6 +230,8 @@ class Page(Element):
|
|
|
231
230
|
"""Implements a page element as a collection of blocks
|
|
232
231
|
|
|
233
232
|
Args:
|
|
233
|
+
----
|
|
234
|
+
page: image encoded as a numpy array in uint8
|
|
234
235
|
blocks: list of block elements
|
|
235
236
|
page_idx: the index of the page in the input raw document
|
|
236
237
|
dimensions: the page size in pixels in format (height, width)
|
|
@@ -244,6 +245,7 @@ class Page(Element):
|
|
|
244
245
|
|
|
245
246
|
def __init__(
|
|
246
247
|
self,
|
|
248
|
+
page: np.ndarray,
|
|
247
249
|
blocks: List[Block],
|
|
248
250
|
page_idx: int,
|
|
249
251
|
dimensions: Tuple[int, int],
|
|
@@ -251,6 +253,7 @@ class Page(Element):
|
|
|
251
253
|
language: Optional[Dict[str, Any]] = None,
|
|
252
254
|
) -> None:
|
|
253
255
|
super().__init__(blocks=blocks)
|
|
256
|
+
self.page = page
|
|
254
257
|
self.page_idx = page_idx
|
|
255
258
|
self.dimensions = dimensions
|
|
256
259
|
self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
|
|
@@ -263,24 +266,24 @@ class Page(Element):
|
|
|
263
266
|
def extra_repr(self) -> str:
|
|
264
267
|
return f"dimensions={self.dimensions}"
|
|
265
268
|
|
|
266
|
-
def show(self,
|
|
269
|
+
def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
|
|
267
270
|
"""Overlay the result on a given image
|
|
268
271
|
|
|
269
272
|
Args:
|
|
270
|
-
page: image encoded as a numpy array in uint8
|
|
271
273
|
interactive: whether the display should be interactive
|
|
272
274
|
preserve_aspect_ratio: pass True if you passed True to the predictor
|
|
275
|
+
**kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
|
|
273
276
|
"""
|
|
274
|
-
visualize_page(self.export(), page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
|
|
277
|
+
visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
|
|
275
278
|
plt.show(**kwargs)
|
|
276
279
|
|
|
277
280
|
def synthesize(self, **kwargs) -> np.ndarray:
|
|
278
281
|
"""Synthesize the page from the predictions
|
|
279
282
|
|
|
280
|
-
Returns
|
|
283
|
+
Returns
|
|
284
|
+
-------
|
|
281
285
|
synthesized page
|
|
282
286
|
"""
|
|
283
|
-
|
|
284
287
|
return synthesize_page(self.export(), **kwargs)
|
|
285
288
|
|
|
286
289
|
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
|
|
@@ -288,9 +291,11 @@ class Page(Element):
|
|
|
288
291
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
289
292
|
|
|
290
293
|
Args:
|
|
294
|
+
----
|
|
291
295
|
file_title: the title of the XML file
|
|
292
296
|
|
|
293
297
|
Returns:
|
|
298
|
+
-------
|
|
294
299
|
a tuple of the XML byte string, and its ElementTree
|
|
295
300
|
"""
|
|
296
301
|
p_idx = self.page_idx
|
|
@@ -398,7 +403,9 @@ class KIEPage(Element):
|
|
|
398
403
|
"""Implements a KIE page element as a collection of predictions
|
|
399
404
|
|
|
400
405
|
Args:
|
|
406
|
+
----
|
|
401
407
|
predictions: Dictionary with list of block elements for each detection class
|
|
408
|
+
page: image encoded as a numpy array in uint8
|
|
402
409
|
page_idx: the index of the page in the input raw document
|
|
403
410
|
dimensions: the page size in pixels in format (height, width)
|
|
404
411
|
orientation: a dictionary with the value of the rotation angle in degress and confidence of the prediction
|
|
@@ -411,6 +418,7 @@ class KIEPage(Element):
|
|
|
411
418
|
|
|
412
419
|
def __init__(
|
|
413
420
|
self,
|
|
421
|
+
page: np.ndarray,
|
|
414
422
|
predictions: Dict[str, List[Prediction]],
|
|
415
423
|
page_idx: int,
|
|
416
424
|
dimensions: Tuple[int, int],
|
|
@@ -418,6 +426,7 @@ class KIEPage(Element):
|
|
|
418
426
|
language: Optional[Dict[str, Any]] = None,
|
|
419
427
|
) -> None:
|
|
420
428
|
super().__init__(predictions=predictions)
|
|
429
|
+
self.page = page
|
|
421
430
|
self.page_idx = page_idx
|
|
422
431
|
self.dimensions = dimensions
|
|
423
432
|
self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
|
|
@@ -432,24 +441,30 @@ class KIEPage(Element):
|
|
|
432
441
|
def extra_repr(self) -> str:
|
|
433
442
|
return f"dimensions={self.dimensions}"
|
|
434
443
|
|
|
435
|
-
def show(self,
|
|
444
|
+
def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
|
|
436
445
|
"""Overlay the result on a given image
|
|
437
446
|
|
|
438
447
|
Args:
|
|
439
|
-
page: image encoded as a numpy array in uint8
|
|
440
448
|
interactive: whether the display should be interactive
|
|
441
449
|
preserve_aspect_ratio: pass True if you passed True to the predictor
|
|
450
|
+
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
|
|
442
451
|
"""
|
|
443
|
-
visualize_kie_page(
|
|
452
|
+
visualize_kie_page(
|
|
453
|
+
self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
|
|
454
|
+
)
|
|
444
455
|
plt.show(**kwargs)
|
|
445
456
|
|
|
446
457
|
def synthesize(self, **kwargs) -> np.ndarray:
|
|
447
458
|
"""Synthesize the page from the predictions
|
|
448
459
|
|
|
460
|
+
Args:
|
|
461
|
+
----
|
|
462
|
+
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
|
|
463
|
+
|
|
449
464
|
Returns:
|
|
465
|
+
-------
|
|
450
466
|
synthesized page
|
|
451
467
|
"""
|
|
452
|
-
|
|
453
468
|
return synthesize_kie_page(self.export(), **kwargs)
|
|
454
469
|
|
|
455
470
|
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
|
|
@@ -457,9 +472,11 @@ class KIEPage(Element):
|
|
|
457
472
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
458
473
|
|
|
459
474
|
Args:
|
|
475
|
+
----
|
|
460
476
|
file_title: the title of the XML file
|
|
461
477
|
|
|
462
478
|
Returns:
|
|
479
|
+
-------
|
|
463
480
|
a tuple of the XML byte string, and its ElementTree
|
|
464
481
|
"""
|
|
465
482
|
p_idx = self.page_idx
|
|
@@ -517,9 +534,9 @@ class KIEPage(Element):
|
|
|
517
534
|
@classmethod
|
|
518
535
|
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
|
|
519
536
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
520
|
-
kwargs.update(
|
|
521
|
-
|
|
522
|
-
)
|
|
537
|
+
kwargs.update({
|
|
538
|
+
"predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
|
|
539
|
+
})
|
|
523
540
|
return cls(**kwargs)
|
|
524
541
|
|
|
525
542
|
|
|
@@ -527,6 +544,7 @@ class Document(Element):
|
|
|
527
544
|
"""Implements a document element as a collection of pages
|
|
528
545
|
|
|
529
546
|
Args:
|
|
547
|
+
----
|
|
530
548
|
pages: list of page elements
|
|
531
549
|
"""
|
|
532
550
|
|
|
@@ -543,31 +561,29 @@ class Document(Element):
|
|
|
543
561
|
"""Renders the full text of the element"""
|
|
544
562
|
return page_break.join(p.render() for p in self.pages)
|
|
545
563
|
|
|
546
|
-
def show(self,
|
|
547
|
-
"""Overlay the result on a given image
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
pages: list of images encoded as numpy arrays in uint8
|
|
551
|
-
"""
|
|
552
|
-
for img, result in zip(pages, self.pages):
|
|
553
|
-
result.show(img, **kwargs)
|
|
564
|
+
def show(self, **kwargs) -> None:
|
|
565
|
+
"""Overlay the result on a given image"""
|
|
566
|
+
for result in self.pages:
|
|
567
|
+
result.show(**kwargs)
|
|
554
568
|
|
|
555
569
|
def synthesize(self, **kwargs) -> List[np.ndarray]:
|
|
556
570
|
"""Synthesize all pages from their predictions
|
|
557
571
|
|
|
558
|
-
Returns
|
|
572
|
+
Returns
|
|
573
|
+
-------
|
|
559
574
|
list of synthesized pages
|
|
560
575
|
"""
|
|
561
|
-
|
|
562
576
|
return [page.synthesize() for page in self.pages]
|
|
563
577
|
|
|
564
578
|
def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
|
|
565
579
|
"""Export the document as XML (hOCR-format)
|
|
566
580
|
|
|
567
581
|
Args:
|
|
582
|
+
----
|
|
568
583
|
**kwargs: additional keyword arguments passed to the Page.export_as_xml method
|
|
569
584
|
|
|
570
585
|
Returns:
|
|
586
|
+
-------
|
|
571
587
|
list of tuple of (bytes, ElementTree)
|
|
572
588
|
"""
|
|
573
589
|
return [page.export_as_xml(**kwargs) for page in self.pages]
|
|
@@ -583,6 +599,7 @@ class KIEDocument(Document):
|
|
|
583
599
|
"""Implements a document element as a collection of pages
|
|
584
600
|
|
|
585
601
|
Args:
|
|
602
|
+
----
|
|
586
603
|
pages: list of page elements
|
|
587
604
|
"""
|
|
588
605
|
|
doctr/io/html.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -13,14 +13,16 @@ __all__ = ["read_html"]
|
|
|
13
13
|
def read_html(url: str, **kwargs: Any) -> bytes:
|
|
14
14
|
"""Read a PDF file and convert it into an image in numpy format
|
|
15
15
|
|
|
16
|
-
>>> from doctr.
|
|
16
|
+
>>> from doctr.io import read_html
|
|
17
17
|
>>> doc = read_html("https://www.yoursite.com")
|
|
18
18
|
|
|
19
19
|
Args:
|
|
20
|
+
----
|
|
20
21
|
url: URL of the target web page
|
|
22
|
+
**kwargs: keyword arguments from `weasyprint.HTML`
|
|
21
23
|
|
|
22
24
|
Returns:
|
|
25
|
+
-------
|
|
23
26
|
decoded PDF file as a bytes stream
|
|
24
27
|
"""
|
|
25
|
-
|
|
26
28
|
return HTML(url, **kwargs).write_pdf()
|
doctr/io/image/base.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -21,18 +21,19 @@ def read_img_as_numpy(
|
|
|
21
21
|
) -> np.ndarray:
|
|
22
22
|
"""Read an image file into numpy format
|
|
23
23
|
|
|
24
|
-
>>> from doctr.
|
|
25
|
-
>>> page =
|
|
24
|
+
>>> from doctr.io import read_img_as_numpy
|
|
25
|
+
>>> page = read_img_as_numpy("path/to/your/doc.jpg")
|
|
26
26
|
|
|
27
27
|
Args:
|
|
28
|
+
----
|
|
28
29
|
file: the path to the image file
|
|
29
30
|
output_size: the expected output size of each page in format H x W
|
|
30
31
|
rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
|
|
31
32
|
|
|
32
33
|
Returns:
|
|
34
|
+
-------
|
|
33
35
|
the page decoded as numpy ndarray of shape H x W x 3
|
|
34
36
|
"""
|
|
35
|
-
|
|
36
37
|
if isinstance(file, (str, Path)):
|
|
37
38
|
if not Path(file).is_file():
|
|
38
39
|
raise FileNotFoundError(f"unable to access {file}")
|