python-doctr 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/cord.py +8 -7
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +7 -6
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +4 -5
- doctr/datasets/ic13.py +4 -5
- doctr/datasets/iiit5k.py +6 -5
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +6 -5
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +3 -4
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +6 -5
- doctr/datasets/svt.py +4 -5
- doctr/datasets/synthtext.py +4 -5
- doctr/datasets/utils.py +34 -29
- doctr/datasets/vocabs.py +17 -7
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +2 -7
- doctr/io/elements.py +59 -79
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +14 -22
- doctr/models/builder.py +30 -48
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +10 -13
- doctr/models/classification/magc_resnet/tensorflow.py +8 -11
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +5 -17
- doctr/models/classification/mobilenet/tensorflow.py +8 -21
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +6 -8
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +21 -31
- doctr/models/classification/resnet/tensorflow.py +20 -31
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +10 -17
- doctr/models/classification/textnet/tensorflow.py +8 -15
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +5 -7
- doctr/models/classification/vgg/tensorflow.py +9 -12
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +8 -14
- doctr/models/classification/vit/tensorflow.py +6 -12
- doctr/models/classification/zoo.py +19 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +7 -17
- doctr/models/detection/differentiable_binarization/pytorch.py +27 -30
- doctr/models/detection/differentiable_binarization/tensorflow.py +15 -25
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +6 -14
- doctr/models/detection/fast/pytorch.py +24 -31
- doctr/models/detection/fast/tensorflow.py +14 -26
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +6 -15
- doctr/models/detection/linknet/pytorch.py +24 -27
- doctr/models/detection/linknet/tensorflow.py +14 -23
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +3 -7
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +18 -19
- doctr/models/kie_predictor/tensorflow.py +13 -14
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +6 -9
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +12 -13
- doctr/models/predictor/tensorflow.py +8 -9
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +20 -28
- doctr/models/recognition/crnn/tensorflow.py +11 -23
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +22 -24
- doctr/models/recognition/master/tensorflow.py +12 -22
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +26 -26
- doctr/models/recognition/parseq/tensorflow.py +16 -22
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +7 -10
- doctr/models/recognition/predictor/pytorch.py +6 -6
- doctr/models/recognition/predictor/tensorflow.py +5 -6
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +20 -21
- doctr/models/recognition/sar/tensorflow.py +12 -21
- doctr/models/recognition/utils.py +5 -10
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +18 -20
- doctr/models/recognition/vitstr/tensorflow.py +12 -20
- doctr/models/recognition/zoo.py +22 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +14 -17
- doctr/models/utils/tensorflow.py +17 -16
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +8 -12
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +16 -47
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/METADATA +54 -52
- python_doctr-0.11.0.dist-info/RECORD +173 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.11.0.dist-info}/zip-safe +0 -0
doctr/io/elements.py
CHANGED
@@ -1,9 +1,9 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any
 
 from defusedxml import defuse_stdlib
 
@@ -32,8 +32,8 @@ __all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page",
 class Element(NestedObject):
     """Implements an abstract document element with exporting and text rendering capabilities"""
 
-    _children_names:
-    _exported_keys:
+    _children_names: list[str] = []
+    _exported_keys: list[str] = []
 
     def __init__(self, **kwargs: Any) -> None:
         for k, v in kwargs.items():
@@ -42,7 +42,7 @@ class Element(NestedObject):
             else:
                 raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{k}'")
 
-    def export(self) ->
+    def export(self) -> dict[str, Any]:
         """Exports the object into a nested dict format"""
         export_dict = {k: getattr(self, k) for k in self._exported_keys}
         for children_name in self._children_names:
@@ -56,7 +56,7 @@
         return export_dict
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         raise NotImplementedError
 
     def render(self) -> str:
@@ -67,7 +67,6 @@ class Word(Element):
     """Implements a word element
 
     Args:
-    ----
         value: the text string of the word
         confidence: the confidence associated with the text prediction
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -76,16 +75,16 @@
         crop_orientation: the general orientation of the crop in degrees and its confidence
     """
 
-    _exported_keys:
-    _children_names:
+    _exported_keys: list[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
+    _children_names: list[str] = []
 
     def __init__(
         self,
         value: str,
         confidence: float,
-        geometry:
+        geometry: BoundingBox | np.ndarray,
         objectness_score: float,
-        crop_orientation:
+        crop_orientation: dict[str, Any],
     ) -> None:
         super().__init__()
         self.value = value
@@ -102,7 +101,7 @@ class Word(Element):
         return f"value='{self.value}', confidence={self.confidence:.2}"
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         return cls(**kwargs)
 
@@ -111,15 +110,14 @@ class Artefact(Element):
     """Implements a non-textual element
 
     Args:
-    ----
         artefact_type: the type of artefact
         confidence: the confidence of the type prediction
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
             the page's size.
     """
 
-    _exported_keys:
-    _children_names:
+    _exported_keys: list[str] = ["geometry", "type", "confidence"]
+    _children_names: list[str] = []
 
     def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
         super().__init__()
@@ -135,7 +133,7 @@ class Artefact(Element):
         return f"type='{self.type}', confidence={self.confidence:.2}"
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         return cls(**kwargs)
 
@@ -144,22 +142,21 @@ class Line(Element):
    """Implements a line element as a collection of words
 
     Args:
-    ----
         words: list of word elements
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
             the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
             all words in it.
     """
 
-    _exported_keys:
-    _children_names:
-    words:
+    _exported_keys: list[str] = ["geometry", "objectness_score"]
+    _children_names: list[str] = ["words"]
+    words: list[Word] = []
 
     def __init__(
         self,
-        words:
-        geometry:
-        objectness_score:
+        words: list[Word],
+        geometry: BoundingBox | np.ndarray | None = None,
+        objectness_score: float | None = None,
     ) -> None:
         # Compute the objectness score of the line
         if objectness_score is None:
@@ -179,7 +176,7 @@ class Line(Element):
         return " ".join(w.render() for w in self.words)
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         kwargs.update({
             "words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
@@ -202,7 +199,6 @@ class Block(Element):
     """Implements a block element as a collection of lines and artefacts
 
     Args:
-    ----
         lines: list of line elements
         artefacts: list of artefacts
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
@@ -210,17 +206,17 @@ class Block(Element):
             all lines and artefacts in it.
     """
 
-    _exported_keys:
-    _children_names:
-    lines:
-    artefacts:
+    _exported_keys: list[str] = ["geometry", "objectness_score"]
+    _children_names: list[str] = ["lines", "artefacts"]
+    lines: list[Line] = []
+    artefacts: list[Artefact] = []
 
     def __init__(
         self,
-        lines:
-        artefacts:
-        geometry:
-        objectness_score:
+        lines: list[Line] = [],
+        artefacts: list[Artefact] = [],
+        geometry: BoundingBox | np.ndarray | None = None,
+        objectness_score: float | None = None,
     ) -> None:
         # Compute the objectness score of the line
         if objectness_score is None:
@@ -243,7 +239,7 @@ class Block(Element):
         return line_break.join(line.render() for line in self.lines)
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         kwargs.update({
             "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
@@ -256,7 +252,6 @@ class Page(Element):
     """Implements a page element as a collection of blocks
 
     Args:
-    ----
         page: image encoded as a numpy array in uint8
         blocks: list of block elements
         page_idx: the index of the page in the input raw document
@@ -265,18 +260,18 @@ class Page(Element):
         language: a dictionary with the language value and confidence of the prediction
     """
 
-    _exported_keys:
-    _children_names:
-    blocks:
+    _exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
+    _children_names: list[str] = ["blocks"]
+    blocks: list[Block] = []
 
     def __init__(
         self,
         page: np.ndarray,
-        blocks:
+        blocks: list[Block],
         page_idx: int,
-        dimensions:
-        orientation:
-        language:
+        dimensions: tuple[int, int],
+        orientation: dict[str, Any] | None = None,
+        language: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(blocks=blocks)
         self.page = page
@@ -311,25 +306,21 @@ class Page(Element):
         """Synthesize the page from the predictions
 
         Args:
-        ----
             **kwargs: keyword arguments passed to the `synthesize_page` method
 
-        Returns
-        -------
+        Returns:
             synthesized page
         """
         return synthesize_page(self.export(), **kwargs)
 
-    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
+    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
         """Export the page as XML (hOCR-format)
         convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
 
         Args:
-        ----
             file_title: the title of the XML file
 
         Returns:
-        -------
             a tuple of the XML byte string, and its ElementTree
         """
         p_idx = self.page_idx
@@ -427,7 +418,7 @@ class Page(Element):
         return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
         return cls(**kwargs)
@@ -437,7 +428,6 @@ class KIEPage(Element):
     """Implements a KIE page element as a collection of predictions
 
     Args:
-    ----
         predictions: Dictionary with list of block elements for each detection class
         page: image encoded as a numpy array in uint8
         page_idx: the index of the page in the input raw document
@@ -446,18 +436,18 @@ class KIEPage(Element):
         language: a dictionary with the language value and confidence of the prediction
     """
 
-    _exported_keys:
-    _children_names:
-    predictions:
+    _exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
+    _children_names: list[str] = ["predictions"]
+    predictions: dict[str, list[Prediction]] = {}
 
     def __init__(
         self,
         page: np.ndarray,
-        predictions:
+        predictions: dict[str, list[Prediction]],
         page_idx: int,
-        dimensions:
-        orientation:
-        language:
+        dimensions: tuple[int, int],
+        orientation: dict[str, Any] | None = None,
+        language: dict[str, Any] | None = None,
     ) -> None:
         super().__init__(predictions=predictions)
         self.page = page
@@ -496,25 +486,21 @@ class KIEPage(Element):
         """Synthesize the page from the predictions
 
         Args:
-        ----
             **kwargs: keyword arguments passed to the `synthesize_kie_page` method
 
         Returns:
-        -------
             synthesized page
         """
         return synthesize_kie_page(self.export(), **kwargs)
 
-    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
+    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
         """Export the page as XML (hOCR-format)
         convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
 
         Args:
-        ----
             file_title: the title of the XML file
 
         Returns:
-        -------
             a tuple of the XML byte string, and its ElementTree
         """
         p_idx = self.page_idx
@@ -570,7 +556,7 @@ class KIEPage(Element):
         return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         kwargs.update({
             "predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
@@ -582,16 +568,15 @@ class Document(Element):
     """Implements a document element as a collection of pages
 
     Args:
-    ----
         pages: list of page elements
     """
 
-    _children_names:
-    pages:
+    _children_names: list[str] = ["pages"]
+    pages: list[Page] = []
 
     def __init__(
         self,
-        pages:
+        pages: list[Page],
     ) -> None:
         super().__init__(pages=pages)
 
@@ -604,34 +589,30 @@ class Document(Element):
         for result in self.pages:
             result.show(**kwargs)
 
-    def synthesize(self, **kwargs) ->
+    def synthesize(self, **kwargs) -> list[np.ndarray]:
         """Synthesize all pages from their predictions
 
         Args:
-        ----
             **kwargs: keyword arguments passed to the `Page.synthesize` method
 
-        Returns
-        -------
+        Returns:
             list of synthesized pages
         """
         return [page.synthesize(**kwargs) for page in self.pages]
 
-    def export_as_xml(self, **kwargs) ->
+    def export_as_xml(self, **kwargs) -> list[tuple[bytes, ET.ElementTree]]:
         """Export the document as XML (hOCR-format)
 
         Args:
-        ----
             **kwargs: additional keyword arguments passed to the Page.export_as_xml method
 
         Returns:
-        -------
             list of tuple of (bytes, ElementTree)
         """
         return [page.export_as_xml(**kwargs) for page in self.pages]
 
     @classmethod
-    def from_dict(cls, save_dict:
+    def from_dict(cls, save_dict: dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
         kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
         return cls(**kwargs)
@@ -641,15 +622,14 @@ class KIEDocument(Document):
     """Implements a document element as a collection of pages
 
     Args:
-    ----
         pages: list of page elements
     """
 
-    _children_names:
-    pages:
+    _children_names: list[str] = ["pages"]
+    pages: list[KIEPage] = []  # type: ignore[assignment]
 
     def __init__(
         self,
-        pages:
+        pages: list[KIEPage],
     ) -> None:
         super().__init__(pages=pages)  # type: ignore[arg-type]
doctr/io/html.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
@@ -15,12 +15,10 @@ def read_html(url: str, **kwargs: Any) -> bytes:
     >>> doc = read_html("https://www.yoursite.com")
 
     Args:
-    ----
         url: URL of the target web page
         **kwargs: keyword arguments from `weasyprint.HTML`
 
     Returns:
-    -------
         decoded PDF file as a bytes stream
     """
     from weasyprint import HTML
doctr/io/image/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from doctr.file_utils import is_tf_available, is_torch_available
 
 from .base import *
 
-if
-    from .tensorflow import *
-elif is_torch_available():
+if is_torch_available():
     from .pytorch import *
+elif is_tf_available():
+    from .tensorflow import *
doctr/io/image/base.py
CHANGED
@@ -1,10 +1,9 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from pathlib import Path
-from typing import Optional, Tuple
 
 import cv2
 import numpy as np
@@ -16,7 +15,7 @@ __all__ = ["read_img_as_numpy"]
 
 def read_img_as_numpy(
     file: AbstractFile,
-    output_size:
+    output_size: tuple[int, int] | None = None,
     rgb_output: bool = True,
 ) -> np.ndarray:
     """Read an image file into numpy format
@@ -25,13 +24,11 @@ def read_img_as_numpy(
     >>> page = read_img_as_numpy("path/to/your/doc.jpg")
 
     Args:
-    ----
         file: the path to the image file
         output_size: the expected output size of each page in format H x W
         rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
 
     Returns:
-    -------
         the page decoded as numpy ndarray of shape H x W x 3
     """
     if isinstance(file, (str, Path)):
doctr/io/image/pytorch.py
CHANGED
@@ -1,10 +1,9 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
 from io import BytesIO
-from typing import Tuple
 
 import numpy as np
 import torch
@@ -20,12 +19,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) ->
     """Convert a PIL Image to a PyTorch tensor
 
     Args:
-    ----
         pil_img: a PIL image
         dtype: the output tensor data type
 
     Returns:
-    -------
         decoded image as tensor
     """
     if dtype == torch.float32:
@@ -40,12 +37,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: torch.dtype = torch.float3
     """Read an image file as a PyTorch tensor
 
     Args:
-    ----
         img_path: location of the image file
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -59,12 +54,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32)
     """Read a byte stream as a PyTorch tensor
 
     Args:
-    ----
         img_content: bytes of a decoded image
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -78,12 +71,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
     """Read an image file as a PyTorch tensor
 
     Args:
-    ----
         npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         same image as a tensor of shape (C, H, W)
     """
     if dtype not in (torch.uint8, torch.float16, torch.float32):
@@ -102,6 +93,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -
     return img
 
 
-def get_img_shape(img: torch.Tensor) ->
+def get_img_shape(img: torch.Tensor) -> tuple[int, int]:
     """Get the shape of an image"""
-    return img.shape[-2:]
+    return img.shape[-2:]
doctr/io/image/tensorflow.py
CHANGED
@@ -1,9 +1,8 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Tuple
 
 import numpy as np
 import tensorflow as tf
@@ -19,12 +18,10 @@ def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -
     """Convert a PIL Image to a TensorFlow tensor
 
     Args:
-    ----
         pil_img: a PIL image
         dtype: the output tensor data type
 
     Returns:
-    -------
         decoded image as tensor
     """
     npy_img = img_to_array(pil_img)
@@ -36,12 +33,10 @@ def read_img_as_tensor(img_path: AbstractPath, dtype: tf.dtypes.DType = tf.float
     """Read an image file as a TensorFlow tensor
 
     Args:
-    ----
         img_path: location of the image file
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -61,12 +56,10 @@ def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32
     """Read a byte stream as a TensorFlow tensor
 
     Args:
-    ----
         img_content: bytes of a decoded image
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         decoded image as a tensor
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -85,12 +78,10 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
     """Read an image file as a TensorFlow tensor
 
     Args:
-    ----
         npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
         dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.
 
     Returns:
-    -------
         same image as a tensor of shape (H, W, C)
     """
     if dtype not in (tf.uint8, tf.float16, tf.float32):
@@ -105,6 +96,6 @@ def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32)
     return img
 
 
-def get_img_shape(img: tf.Tensor) ->
+def get_img_shape(img: tf.Tensor) -> tuple[int, int]:
     """Get the shape of an image"""
     return img.shape[:2]
doctr/io/pdf.py
CHANGED
@@ -1,9 +1,9 @@
-# Copyright (C) 2021-
+# Copyright (C) 2021-2025, Mindee.
 
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Any
+from typing import Any
 
 import numpy as np
 import pypdfium2 as pdfium
@@ -15,18 +15,17 @@ __all__ = ["read_pdf"]
 
 def read_pdf(
     file: AbstractFile,
-    scale:
+    scale: int = 2,
     rgb_mode: bool = True,
-    password:
+    password: str | None = None,
     **kwargs: Any,
-) ->
+) -> list[np.ndarray]:
     """Read a PDF file and convert it into an image in numpy format
 
     >>> from doctr.io import read_pdf
     >>> doc = read_pdf("path/to/your/doc.pdf")
 
     Args:
-    ----
         file: the path to the PDF file
         scale: rendering scale (1 corresponds to 72dpi)
         rgb_mode: if True, the output will be RGB, otherwise BGR
@@ -34,7 +33,6 @@ def read_pdf(
         **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`
 
     Returns:
-    -------
         the list of pages decoded as numpy ndarray of shape H x W x C
     """
     # Rasterise pages to numpy ndarrays with pypdfium2