python-doctr 0.9.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/cord.py +17 -7
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +17 -6
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +14 -5
- doctr/datasets/ic13.py +13 -5
- doctr/datasets/iiit5k.py +31 -20
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -5
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +3 -4
- doctr/datasets/sroie.py +16 -5
- doctr/datasets/svhn.py +16 -5
- doctr/datasets/svt.py +14 -5
- doctr/datasets/synthtext.py +14 -5
- doctr/datasets/utils.py +37 -27
- doctr/datasets/vocabs.py +21 -7
- doctr/datasets/wildreceipt.py +25 -10
- doctr/file_utils.py +18 -4
- doctr/io/elements.py +69 -81
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +14 -22
- doctr/models/builder.py +32 -50
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +10 -13
- doctr/models/classification/magc_resnet/tensorflow.py +21 -17
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +7 -17
- doctr/models/classification/mobilenet/tensorflow.py +22 -29
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +13 -11
- doctr/models/classification/predictor/tensorflow.py +13 -11
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +21 -31
- doctr/models/classification/resnet/tensorflow.py +41 -39
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +10 -17
- doctr/models/classification/textnet/tensorflow.py +19 -20
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +5 -7
- doctr/models/classification/vgg/tensorflow.py +18 -15
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +8 -14
- doctr/models/classification/vit/tensorflow.py +16 -16
- doctr/models/classification/zoo.py +36 -19
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +7 -17
- doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
- doctr/models/detection/differentiable_binarization/tensorflow.py +49 -37
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +6 -14
- doctr/models/detection/fast/pytorch.py +24 -31
- doctr/models/detection/fast/tensorflow.py +28 -37
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +6 -15
- doctr/models/detection/linknet/pytorch.py +24 -27
- doctr/models/detection/linknet/tensorflow.py +36 -33
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +7 -8
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +8 -13
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +8 -5
- doctr/models/kie_predictor/pytorch.py +22 -19
- doctr/models/kie_predictor/tensorflow.py +21 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +6 -9
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -12
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +3 -4
- doctr/models/modules/vision_transformer/tensorflow.py +4 -4
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +52 -41
- doctr/models/predictor/pytorch.py +16 -13
- doctr/models/predictor/tensorflow.py +16 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +11 -15
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +20 -28
- doctr/models/recognition/crnn/tensorflow.py +19 -29
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +22 -24
- doctr/models/recognition/master/tensorflow.py +21 -26
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +26 -26
- doctr/models/recognition/parseq/tensorflow.py +26 -30
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +7 -10
- doctr/models/recognition/predictor/pytorch.py +6 -6
- doctr/models/recognition/predictor/tensorflow.py +5 -6
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +20 -21
- doctr/models/recognition/sar/tensorflow.py +19 -24
- doctr/models/recognition/utils.py +5 -10
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +18 -20
- doctr/models/recognition/vitstr/tensorflow.py +21 -24
- doctr/models/recognition/zoo.py +22 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +13 -16
- doctr/models/utils/tensorflow.py +31 -30
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +21 -29
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +65 -28
- doctr/transforms/modules/tensorflow.py +33 -44
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +8 -12
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +120 -64
- doctr/utils/metrics.py +18 -38
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +157 -75
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +59 -57
- python_doctr-0.11.0.dist-info/RECORD +173 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
- python_doctr-0.9.0.dist-info/RECORD +0 -173
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.9.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0
doctr/io/elements.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any
|
|
7
7
|
|
|
8
8
|
from defusedxml import defuse_stdlib
|
|
9
9
|
|
|
@@ -32,8 +32,8 @@ __all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page",
|
|
|
32
32
|
class Element(NestedObject):
|
|
33
33
|
"""Implements an abstract document element with exporting and text rendering capabilities"""
|
|
34
34
|
|
|
35
|
-
_children_names:
|
|
36
|
-
_exported_keys:
|
|
35
|
+
_children_names: list[str] = []
|
|
36
|
+
_exported_keys: list[str] = []
|
|
37
37
|
|
|
38
38
|
def __init__(self, **kwargs: Any) -> None:
|
|
39
39
|
for k, v in kwargs.items():
|
|
@@ -42,7 +42,7 @@ class Element(NestedObject):
|
|
|
42
42
|
else:
|
|
43
43
|
raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{k}'")
|
|
44
44
|
|
|
45
|
-
def export(self) ->
|
|
45
|
+
def export(self) -> dict[str, Any]:
|
|
46
46
|
"""Exports the object into a nested dict format"""
|
|
47
47
|
export_dict = {k: getattr(self, k) for k in self._exported_keys}
|
|
48
48
|
for children_name in self._children_names:
|
|
@@ -56,7 +56,7 @@ class Element(NestedObject):
|
|
|
56
56
|
return export_dict
|
|
57
57
|
|
|
58
58
|
@classmethod
|
|
59
|
-
def from_dict(cls, save_dict:
|
|
59
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
60
60
|
raise NotImplementedError
|
|
61
61
|
|
|
62
62
|
def render(self) -> str:
|
|
@@ -67,7 +67,6 @@ class Word(Element):
|
|
|
67
67
|
"""Implements a word element
|
|
68
68
|
|
|
69
69
|
Args:
|
|
70
|
-
----
|
|
71
70
|
value: the text string of the word
|
|
72
71
|
confidence: the confidence associated with the text prediction
|
|
73
72
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -76,16 +75,16 @@ class Word(Element):
|
|
|
76
75
|
crop_orientation: the general orientation of the crop in degrees and its confidence
|
|
77
76
|
"""
|
|
78
77
|
|
|
79
|
-
_exported_keys:
|
|
80
|
-
_children_names:
|
|
78
|
+
_exported_keys: list[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
|
|
79
|
+
_children_names: list[str] = []
|
|
81
80
|
|
|
82
81
|
def __init__(
|
|
83
82
|
self,
|
|
84
83
|
value: str,
|
|
85
84
|
confidence: float,
|
|
86
|
-
geometry:
|
|
85
|
+
geometry: BoundingBox | np.ndarray,
|
|
87
86
|
objectness_score: float,
|
|
88
|
-
crop_orientation:
|
|
87
|
+
crop_orientation: dict[str, Any],
|
|
89
88
|
) -> None:
|
|
90
89
|
super().__init__()
|
|
91
90
|
self.value = value
|
|
@@ -102,7 +101,7 @@ class Word(Element):
|
|
|
102
101
|
return f"value='{self.value}', confidence={self.confidence:.2}"
|
|
103
102
|
|
|
104
103
|
@classmethod
|
|
105
|
-
def from_dict(cls, save_dict:
|
|
104
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
106
105
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
107
106
|
return cls(**kwargs)
|
|
108
107
|
|
|
@@ -111,15 +110,14 @@ class Artefact(Element):
|
|
|
111
110
|
"""Implements a non-textual element
|
|
112
111
|
|
|
113
112
|
Args:
|
|
114
|
-
----
|
|
115
113
|
artefact_type: the type of artefact
|
|
116
114
|
confidence: the confidence of the type prediction
|
|
117
115
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
118
116
|
the page's size.
|
|
119
117
|
"""
|
|
120
118
|
|
|
121
|
-
_exported_keys:
|
|
122
|
-
_children_names:
|
|
119
|
+
_exported_keys: list[str] = ["geometry", "type", "confidence"]
|
|
120
|
+
_children_names: list[str] = []
|
|
123
121
|
|
|
124
122
|
def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
|
|
125
123
|
super().__init__()
|
|
@@ -135,7 +133,7 @@ class Artefact(Element):
|
|
|
135
133
|
return f"type='{self.type}', confidence={self.confidence:.2}"
|
|
136
134
|
|
|
137
135
|
@classmethod
|
|
138
|
-
def from_dict(cls, save_dict:
|
|
136
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
139
137
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
140
138
|
return cls(**kwargs)
|
|
141
139
|
|
|
@@ -144,22 +142,21 @@ class Line(Element):
|
|
|
144
142
|
"""Implements a line element as a collection of words
|
|
145
143
|
|
|
146
144
|
Args:
|
|
147
|
-
----
|
|
148
145
|
words: list of word elements
|
|
149
146
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
150
147
|
the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
|
|
151
148
|
all words in it.
|
|
152
149
|
"""
|
|
153
150
|
|
|
154
|
-
_exported_keys:
|
|
155
|
-
_children_names:
|
|
156
|
-
words:
|
|
151
|
+
_exported_keys: list[str] = ["geometry", "objectness_score"]
|
|
152
|
+
_children_names: list[str] = ["words"]
|
|
153
|
+
words: list[Word] = []
|
|
157
154
|
|
|
158
155
|
def __init__(
|
|
159
156
|
self,
|
|
160
|
-
words:
|
|
161
|
-
geometry:
|
|
162
|
-
objectness_score:
|
|
157
|
+
words: list[Word],
|
|
158
|
+
geometry: BoundingBox | np.ndarray | None = None,
|
|
159
|
+
objectness_score: float | None = None,
|
|
163
160
|
) -> None:
|
|
164
161
|
# Compute the objectness score of the line
|
|
165
162
|
if objectness_score is None:
|
|
@@ -168,7 +165,7 @@ class Line(Element):
|
|
|
168
165
|
if geometry is None:
|
|
169
166
|
# Check whether this is a rotated or straight box
|
|
170
167
|
box_resolution_fn = resolve_enclosing_rbbox if len(words[0].geometry) == 4 else resolve_enclosing_bbox
|
|
171
|
-
geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[
|
|
168
|
+
geometry = box_resolution_fn([w.geometry for w in words]) # type: ignore[misc]
|
|
172
169
|
|
|
173
170
|
super().__init__(words=words)
|
|
174
171
|
self.geometry = geometry
|
|
@@ -179,7 +176,7 @@ class Line(Element):
|
|
|
179
176
|
return " ".join(w.render() for w in self.words)
|
|
180
177
|
|
|
181
178
|
@classmethod
|
|
182
|
-
def from_dict(cls, save_dict:
|
|
179
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
183
180
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
184
181
|
kwargs.update({
|
|
185
182
|
"words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
|
|
@@ -202,7 +199,6 @@ class Block(Element):
|
|
|
202
199
|
"""Implements a block element as a collection of lines and artefacts
|
|
203
200
|
|
|
204
201
|
Args:
|
|
205
|
-
----
|
|
206
202
|
lines: list of line elements
|
|
207
203
|
artefacts: list of artefacts
|
|
208
204
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -210,17 +206,17 @@ class Block(Element):
|
|
|
210
206
|
all lines and artefacts in it.
|
|
211
207
|
"""
|
|
212
208
|
|
|
213
|
-
_exported_keys:
|
|
214
|
-
_children_names:
|
|
215
|
-
lines:
|
|
216
|
-
artefacts:
|
|
209
|
+
_exported_keys: list[str] = ["geometry", "objectness_score"]
|
|
210
|
+
_children_names: list[str] = ["lines", "artefacts"]
|
|
211
|
+
lines: list[Line] = []
|
|
212
|
+
artefacts: list[Artefact] = []
|
|
217
213
|
|
|
218
214
|
def __init__(
|
|
219
215
|
self,
|
|
220
|
-
lines:
|
|
221
|
-
artefacts:
|
|
222
|
-
geometry:
|
|
223
|
-
objectness_score:
|
|
216
|
+
lines: list[Line] = [],
|
|
217
|
+
artefacts: list[Artefact] = [],
|
|
218
|
+
geometry: BoundingBox | np.ndarray | None = None,
|
|
219
|
+
objectness_score: float | None = None,
|
|
224
220
|
) -> None:
|
|
225
221
|
# Compute the objectness score of the line
|
|
226
222
|
if objectness_score is None:
|
|
@@ -232,7 +228,7 @@ class Block(Element):
|
|
|
232
228
|
box_resolution_fn = (
|
|
233
229
|
resolve_enclosing_rbbox if isinstance(lines[0].geometry, np.ndarray) else resolve_enclosing_bbox
|
|
234
230
|
)
|
|
235
|
-
geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore
|
|
231
|
+
geometry = box_resolution_fn(line_boxes + artefact_boxes) # type: ignore
|
|
236
232
|
|
|
237
233
|
super().__init__(lines=lines, artefacts=artefacts)
|
|
238
234
|
self.geometry = geometry
|
|
@@ -243,7 +239,7 @@ class Block(Element):
|
|
|
243
239
|
return line_break.join(line.render() for line in self.lines)
|
|
244
240
|
|
|
245
241
|
@classmethod
|
|
246
|
-
def from_dict(cls, save_dict:
|
|
242
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
247
243
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
248
244
|
kwargs.update({
|
|
249
245
|
"lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
|
|
@@ -256,7 +252,6 @@ class Page(Element):
|
|
|
256
252
|
"""Implements a page element as a collection of blocks
|
|
257
253
|
|
|
258
254
|
Args:
|
|
259
|
-
----
|
|
260
255
|
page: image encoded as a numpy array in uint8
|
|
261
256
|
blocks: list of block elements
|
|
262
257
|
page_idx: the index of the page in the input raw document
|
|
@@ -265,18 +260,18 @@ class Page(Element):
|
|
|
265
260
|
language: a dictionary with the language value and confidence of the prediction
|
|
266
261
|
"""
|
|
267
262
|
|
|
268
|
-
_exported_keys:
|
|
269
|
-
_children_names:
|
|
270
|
-
blocks:
|
|
263
|
+
_exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
|
|
264
|
+
_children_names: list[str] = ["blocks"]
|
|
265
|
+
blocks: list[Block] = []
|
|
271
266
|
|
|
272
267
|
def __init__(
|
|
273
268
|
self,
|
|
274
269
|
page: np.ndarray,
|
|
275
|
-
blocks:
|
|
270
|
+
blocks: list[Block],
|
|
276
271
|
page_idx: int,
|
|
277
|
-
dimensions:
|
|
278
|
-
orientation:
|
|
279
|
-
language:
|
|
272
|
+
dimensions: tuple[int, int],
|
|
273
|
+
orientation: dict[str, Any] | None = None,
|
|
274
|
+
language: dict[str, Any] | None = None,
|
|
280
275
|
) -> None:
|
|
281
276
|
super().__init__(blocks=blocks)
|
|
282
277
|
self.page = page
|
|
@@ -310,22 +305,22 @@ class Page(Element):
|
|
|
310
305
|
def synthesize(self, **kwargs) -> np.ndarray:
|
|
311
306
|
"""Synthesize the page from the predictions
|
|
312
307
|
|
|
313
|
-
|
|
314
|
-
|
|
308
|
+
Args:
|
|
309
|
+
**kwargs: keyword arguments passed to the `synthesize_page` method
|
|
310
|
+
|
|
311
|
+
Returns:
|
|
315
312
|
synthesized page
|
|
316
313
|
"""
|
|
317
314
|
return synthesize_page(self.export(), **kwargs)
|
|
318
315
|
|
|
319
|
-
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
|
|
316
|
+
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
|
|
320
317
|
"""Export the page as XML (hOCR-format)
|
|
321
318
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
322
319
|
|
|
323
320
|
Args:
|
|
324
|
-
----
|
|
325
321
|
file_title: the title of the XML file
|
|
326
322
|
|
|
327
323
|
Returns:
|
|
328
|
-
-------
|
|
329
324
|
a tuple of the XML byte string, and its ElementTree
|
|
330
325
|
"""
|
|
331
326
|
p_idx = self.page_idx
|
|
@@ -423,7 +418,7 @@ class Page(Element):
|
|
|
423
418
|
return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
|
|
424
419
|
|
|
425
420
|
@classmethod
|
|
426
|
-
def from_dict(cls, save_dict:
|
|
421
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
427
422
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
428
423
|
kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
|
|
429
424
|
return cls(**kwargs)
|
|
@@ -433,7 +428,6 @@ class KIEPage(Element):
|
|
|
433
428
|
"""Implements a KIE page element as a collection of predictions
|
|
434
429
|
|
|
435
430
|
Args:
|
|
436
|
-
----
|
|
437
431
|
predictions: Dictionary with list of block elements for each detection class
|
|
438
432
|
page: image encoded as a numpy array in uint8
|
|
439
433
|
page_idx: the index of the page in the input raw document
|
|
@@ -442,18 +436,18 @@ class KIEPage(Element):
|
|
|
442
436
|
language: a dictionary with the language value and confidence of the prediction
|
|
443
437
|
"""
|
|
444
438
|
|
|
445
|
-
_exported_keys:
|
|
446
|
-
_children_names:
|
|
447
|
-
predictions:
|
|
439
|
+
_exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
|
|
440
|
+
_children_names: list[str] = ["predictions"]
|
|
441
|
+
predictions: dict[str, list[Prediction]] = {}
|
|
448
442
|
|
|
449
443
|
def __init__(
|
|
450
444
|
self,
|
|
451
445
|
page: np.ndarray,
|
|
452
|
-
predictions:
|
|
446
|
+
predictions: dict[str, list[Prediction]],
|
|
453
447
|
page_idx: int,
|
|
454
|
-
dimensions:
|
|
455
|
-
orientation:
|
|
456
|
-
language:
|
|
448
|
+
dimensions: tuple[int, int],
|
|
449
|
+
orientation: dict[str, Any] | None = None,
|
|
450
|
+
language: dict[str, Any] | None = None,
|
|
457
451
|
) -> None:
|
|
458
452
|
super().__init__(predictions=predictions)
|
|
459
453
|
self.page = page
|
|
@@ -492,25 +486,21 @@ class KIEPage(Element):
|
|
|
492
486
|
"""Synthesize the page from the predictions
|
|
493
487
|
|
|
494
488
|
Args:
|
|
495
|
-
|
|
496
|
-
**kwargs: keyword arguments passed to the matplotlib.pyplot.show method
|
|
489
|
+
**kwargs: keyword arguments passed to the `synthesize_kie_page` method
|
|
497
490
|
|
|
498
491
|
Returns:
|
|
499
|
-
-------
|
|
500
492
|
synthesized page
|
|
501
493
|
"""
|
|
502
494
|
return synthesize_kie_page(self.export(), **kwargs)
|
|
503
495
|
|
|
504
|
-
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
|
|
496
|
+
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
|
|
505
497
|
"""Export the page as XML (hOCR-format)
|
|
506
498
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
507
499
|
|
|
508
500
|
Args:
|
|
509
|
-
----
|
|
510
501
|
file_title: the title of the XML file
|
|
511
502
|
|
|
512
503
|
Returns:
|
|
513
|
-
-------
|
|
514
504
|
a tuple of the XML byte string, and its ElementTree
|
|
515
505
|
"""
|
|
516
506
|
p_idx = self.page_idx
|
|
@@ -566,7 +556,7 @@ class KIEPage(Element):
|
|
|
566
556
|
return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
|
|
567
557
|
|
|
568
558
|
@classmethod
|
|
569
|
-
def from_dict(cls, save_dict:
|
|
559
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
570
560
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
571
561
|
kwargs.update({
|
|
572
562
|
"predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
|
|
@@ -578,16 +568,15 @@ class Document(Element):
|
|
|
578
568
|
"""Implements a document element as a collection of pages
|
|
579
569
|
|
|
580
570
|
Args:
|
|
581
|
-
----
|
|
582
571
|
pages: list of page elements
|
|
583
572
|
"""
|
|
584
573
|
|
|
585
|
-
_children_names:
|
|
586
|
-
pages:
|
|
574
|
+
_children_names: list[str] = ["pages"]
|
|
575
|
+
pages: list[Page] = []
|
|
587
576
|
|
|
588
577
|
def __init__(
|
|
589
578
|
self,
|
|
590
|
-
pages:
|
|
579
|
+
pages: list[Page],
|
|
591
580
|
) -> None:
|
|
592
581
|
super().__init__(pages=pages)
|
|
593
582
|
|
|
@@ -600,30 +589,30 @@ class Document(Element):
|
|
|
600
589
|
for result in self.pages:
|
|
601
590
|
result.show(**kwargs)
|
|
602
591
|
|
|
603
|
-
def synthesize(self, **kwargs) ->
|
|
592
|
+
def synthesize(self, **kwargs) -> list[np.ndarray]:
|
|
604
593
|
"""Synthesize all pages from their predictions
|
|
605
594
|
|
|
606
|
-
|
|
607
|
-
|
|
595
|
+
Args:
|
|
596
|
+
**kwargs: keyword arguments passed to the `Page.synthesize` method
|
|
597
|
+
|
|
598
|
+
Returns:
|
|
608
599
|
list of synthesized pages
|
|
609
600
|
"""
|
|
610
|
-
return [page.synthesize() for page in self.pages]
|
|
601
|
+
return [page.synthesize(**kwargs) for page in self.pages]
|
|
611
602
|
|
|
612
|
-
def export_as_xml(self, **kwargs) ->
|
|
603
|
+
def export_as_xml(self, **kwargs) -> list[tuple[bytes, ET.ElementTree]]:
|
|
613
604
|
"""Export the document as XML (hOCR-format)
|
|
614
605
|
|
|
615
606
|
Args:
|
|
616
|
-
----
|
|
617
607
|
**kwargs: additional keyword arguments passed to the Page.export_as_xml method
|
|
618
608
|
|
|
619
609
|
Returns:
|
|
620
|
-
-------
|
|
621
610
|
list of tuple of (bytes, ElementTree)
|
|
622
611
|
"""
|
|
623
612
|
return [page.export_as_xml(**kwargs) for page in self.pages]
|
|
624
613
|
|
|
625
614
|
@classmethod
|
|
626
|
-
def from_dict(cls, save_dict:
|
|
615
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
627
616
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
628
617
|
kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
|
|
629
618
|
return cls(**kwargs)
|
|
@@ -633,15 +622,14 @@ class KIEDocument(Document):
|
|
|
633
622
|
"""Implements a document element as a collection of pages
|
|
634
623
|
|
|
635
624
|
Args:
|
|
636
|
-
----
|
|
637
625
|
pages: list of page elements
|
|
638
626
|
"""
|
|
639
627
|
|
|
640
|
-
_children_names:
|
|
641
|
-
pages:
|
|
628
|
+
_children_names: list[str] = ["pages"]
|
|
629
|
+
pages: list[KIEPage] = [] # type: ignore[assignment]
|
|
642
630
|
|
|
643
631
|
def __init__(
|
|
644
632
|
self,
|
|
645
|
-
pages:
|
|
633
|
+
pages: list[KIEPage],
|
|
646
634
|
) -> None:
|
|
647
635
|
super().__init__(pages=pages) # type: ignore[arg-type]
|
doctr/io/html.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -15,12 +15,10 @@ def read_html(url: str, **kwargs: Any) -> bytes:
|
|
|
15
15
|
>>> doc = read_html("https://www.yoursite.com")
|
|
16
16
|
|
|
17
17
|
Args:
|
|
18
|
-
----
|
|
19
18
|
url: URL of the target web page
|
|
20
19
|
**kwargs: keyword arguments from `weasyprint.HTML`
|
|
21
20
|
|
|
22
21
|
Returns:
|
|
23
|
-
-------
|
|
24
22
|
decoded PDF file as a bytes stream
|
|
25
23
|
"""
|
|
26
24
|
from weasyprint import HTML
|
doctr/io/image/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ from doctr.file_utils import is_tf_available, is_torch_available
|
|
|
2
2
|
|
|
3
3
|
from .base import *
|
|
4
4
|
|
|
5
|
-
if
|
|
6
|
-
from .tensorflow import *
|
|
7
|
-
elif is_torch_available():
|
|
5
|
+
if is_torch_available():
|
|
8
6
|
from .pytorch import *
|
|
7
|
+
elif is_tf_available():
|
|
8
|
+
from .tensorflow import *
|
doctr/io/image/base.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Optional, Tuple
|
|
8
7
|
|
|
9
8
|
import cv2
|
|
10
9
|
import numpy as np
|
|
@@ -16,7 +15,7 @@ __all__ = ["read_img_as_numpy"]
|
|
|
16
15
|
|
|
17
16
|
def read_img_as_numpy(
|
|
18
17
|
file: AbstractFile,
|
|
19
|
-
output_size:
|
|
18
|
+
output_size: tuple[int, int] | None = None,
|
|
20
19
|
rgb_output: bool = True,
|
|
21
20
|
) -> np.ndarray:
|
|
22
21
|
"""Read an image file into numpy format
|
|
@@ -25,13 +24,11 @@ def read_img_as_numpy(
|
|
|
25
24
|
>>> page = read_img_as_numpy("path/to/your/doc.jpg")
|
|
26
25
|
|
|
27
26
|
Args:
|
|
28
|
-
----
|
|
29
27
|
file: the path to the image file
|
|
30
28
|
output_size: the expected output size of each page in format H x W
|
|
31
29
|
rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
|
|
32
30
|
|
|
33
31
|
Returns:
|
|
34
|
-
-------
|
|
35
32
|
the page decoded as numpy ndarray of shape H x W x 3
|
|
36
33
|
"""
|
|
37
34
|
if isinstance(file, (str, Path)):
|
doctr/io/image/pytorch.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
6
|
from io import BytesIO
|
|
7
|
-
from typing import Tuple
|
|
8
7
|
|
|
9
8
|
import numpy as np
|
|
10
9
|
import torch
|
|
@@ -20,12 +19,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) ->
|
|
|
20
19
|
"""Convert a PIL Image to a PyTorch tensor
|
|
21
20
|
|
|
22
21
|
Args:
|
|
23
|
-
----
|
|
24
22
|
pil_img: a PIL image
|
|
25
23
|
dtype: the output tensor data type
|
|
26
24
|
|
|
27
25
|
Returns:
|
|
28
|
-
-------
|
|
29
26
|
decoded image as tensor
|
|
30
27
|
"""
|
|
31
28
|
if dtype == torch.float32:
|
|
@@ -40,12 +37,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float3
|
|
|
40
37
|
"""Read an image file as a PyTorch tensor
|
|
41
38
|
|
|
42
39
|
Args:
|
|
43
|
-
----
|
|
44
40
|
img_path: location of the image file
|
|
45
41
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
46
42
|
|
|
47
43
|
Returns:
|
|
48
|
-
-------
|
|
49
44
|
decoded image as a tensor
|
|
50
45
|
"""
|
|
51
46
|
if dtype not in (torch.uint8, torch.float16, torch.float32):
|
|
@@ -59,12 +54,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32)
|
|
|
59
54
|
"""Read a byte stream as a PyTorch tensor
|
|
60
55
|
|
|
61
56
|
Args:
|
|
62
|
-
----
|
|
63
57
|
img_content: bytes of a decoded image
|
|
64
58
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
65
59
|
|
|
66
60
|
Returns:
|
|
67
|
-
-------
|
|
68
61
|
decoded image as a tensor
|
|
69
62
|
"""
|
|
70
63
|
if dtype not in (torch.uint8, torch.float16, torch.float32):
|
|
@@ -78,12 +71,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
|
|
|
78
71
|
"""Read an image file as a PyTorch tensor
|
|
79
72
|
|
|
80
73
|
Args:
|
|
81
|
-
----
|
|
82
74
|
npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
|
|
83
75
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
84
76
|
|
|
85
77
|
Returns:
|
|
86
|
-
-------
|
|
87
78
|
same image as a tensor of shape (C, H, W)
|
|
88
79
|
"""
|
|
89
80
|
if dtype not in (torch.uint8, torch.float16, torch.float32):
|
|
@@ -102,6 +93,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
|
|
|
102
93
|
return img
|
|
103
94
|
|
|
104
95
|
|
|
105
|
-
def get_img_shape(img: torch.Tensor) ->
|
|
96
|
+
def get_img_shape(img: torch.Tensor) -> tuple[int, int]:
|
|
106
97
|
"""Get the shape of an image"""
|
|
107
|
-
return img.shape[-2:]
|
|
98
|
+
return img.shape[-2:]
|
doctr/io/image/tensorflow.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
|
-
from typing import Tuple
|
|
7
6
|
|
|
8
7
|
import numpy as np
|
|
9
8
|
import tensorflow as tf
|
|
@@ -19,12 +18,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -
|
|
|
19
18
|
"""Convert a PIL Image to a TensorFlow tensor
|
|
20
19
|
|
|
21
20
|
Args:
|
|
22
|
-
----
|
|
23
21
|
pil_img: a PIL image
|
|
24
22
|
dtype: the output tensor data type
|
|
25
23
|
|
|
26
24
|
Returns:
|
|
27
|
-
-------
|
|
28
25
|
decoded image as tensor
|
|
29
26
|
"""
|
|
30
27
|
npy_img = img_to_array(pil_img)
|
|
@@ -36,12 +33,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float
|
|
|
36
33
|
"""Read an image file as a TensorFlow tensor
|
|
37
34
|
|
|
38
35
|
Args:
|
|
39
|
-
----
|
|
40
36
|
img_path: location of the image file
|
|
41
37
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
42
38
|
|
|
43
39
|
Returns:
|
|
44
|
-
-------
|
|
45
40
|
decoded image as a tensor
|
|
46
41
|
"""
|
|
47
42
|
if dtype not in (tf.uint8, tf.float16, tf.float32):
|
|
@@ -61,12 +56,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32
|
|
|
61
56
|
"""Read a byte stream as a TensorFlow tensor
|
|
62
57
|
|
|
63
58
|
Args:
|
|
64
|
-
----
|
|
65
59
|
img_content: bytes of a decoded image
|
|
66
60
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
67
61
|
|
|
68
62
|
Returns:
|
|
69
|
-
-------
|
|
70
63
|
decoded image as a tensor
|
|
71
64
|
"""
|
|
72
65
|
if dtype not in (tf.uint8, tf.float16, tf.float32):
|
|
@@ -85,12 +78,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
|
|
|
85
78
|
"""Read an image file as a TensorFlow tensor
|
|
86
79
|
|
|
87
80
|
Args:
|
|
88
|
-
----
|
|
89
81
|
npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
|
|
90
82
|
dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
|
|
91
83
|
|
|
92
84
|
Returns:
|
|
93
|
-
-------
|
|
94
85
|
same image as a tensor of shape (H, W, C)
|
|
95
86
|
"""
|
|
96
87
|
if dtype not in (tf.uint8, tf.float16, tf.float32):
|
|
@@ -105,6 +96,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
|
|
|
105
96
|
return img
|
|
106
97
|
|
|
107
98
|
|
|
108
|
-
def get_img_shape(img: tf.Tensor) ->
|
|
99
|
+
def get_img_shape(img: tf.Tensor) -> tuple[int, int]:
|
|
109
100
|
"""Get the shape of an image"""
|
|
110
101
|
return img.shape[:2]
|
doctr/io/pdf.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import pypdfium2 as pdfium
|
|
@@ -15,18 +15,17 @@ __all__ = ["read_pdf"]
|
|
|
15
15
|
|
|
16
16
|
def read_pdf(
|
|
17
17
|
file: AbstractFile,
|
|
18
|
-
scale:
|
|
18
|
+
scale: int = 2,
|
|
19
19
|
rgb_mode: bool = True,
|
|
20
|
-
password:
|
|
20
|
+
password: str | None = None,
|
|
21
21
|
**kwargs: Any,
|
|
22
|
-
) ->
|
|
22
|
+
) -> list[np.ndarray]:
|
|
23
23
|
"""Read a PDF file and convert it into an image in numpy format
|
|
24
24
|
|
|
25
25
|
>>> from doctr.io import read_pdf
|
|
26
26
|
>>> doc = read_pdf("path/to/your/doc.pdf")
|
|
27
27
|
|
|
28
28
|
Args:
|
|
29
|
-
----
|
|
30
29
|
file: the path to the PDF file
|
|
31
30
|
scale: rendering scale (1 corresponds to 72dpi)
|
|
32
31
|
rgb_mode: if True, the output will be RGB, otherwise BGR
|
|
@@ -34,7 +33,6 @@ def read_pdf(
|
|
|
34
33
|
**kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
|
|
35
34
|
|
|
36
35
|
Returns:
|
|
37
|
-
-------
|
|
38
36
|
the list of pages decoded as numpy ndarray of shape H x W x C
|
|
39
37
|
"""
|
|
40
38
|
# Rasterise pages to numpy ndarrays with pypdfium2
|