python-doctr 0.10.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctr/contrib/__init__.py +1 -0
- doctr/contrib/artefacts.py +7 -9
- doctr/contrib/base.py +8 -17
- doctr/datasets/__init__.py +1 -0
- doctr/datasets/coco_text.py +139 -0
- doctr/datasets/cord.py +10 -8
- doctr/datasets/datasets/__init__.py +4 -4
- doctr/datasets/datasets/base.py +16 -16
- doctr/datasets/datasets/pytorch.py +12 -12
- doctr/datasets/datasets/tensorflow.py +10 -10
- doctr/datasets/detection.py +6 -9
- doctr/datasets/doc_artefacts.py +3 -4
- doctr/datasets/funsd.py +9 -8
- doctr/datasets/generator/__init__.py +4 -4
- doctr/datasets/generator/base.py +16 -17
- doctr/datasets/generator/pytorch.py +1 -3
- doctr/datasets/generator/tensorflow.py +1 -3
- doctr/datasets/ic03.py +5 -6
- doctr/datasets/ic13.py +6 -6
- doctr/datasets/iiit5k.py +10 -6
- doctr/datasets/iiithws.py +4 -5
- doctr/datasets/imgur5k.py +15 -7
- doctr/datasets/loader.py +4 -7
- doctr/datasets/mjsynth.py +6 -5
- doctr/datasets/ocr.py +3 -4
- doctr/datasets/orientation.py +3 -4
- doctr/datasets/recognition.py +4 -5
- doctr/datasets/sroie.py +6 -5
- doctr/datasets/svhn.py +7 -6
- doctr/datasets/svt.py +6 -7
- doctr/datasets/synthtext.py +19 -7
- doctr/datasets/utils.py +41 -35
- doctr/datasets/vocabs.py +1107 -49
- doctr/datasets/wildreceipt.py +14 -10
- doctr/file_utils.py +11 -7
- doctr/io/elements.py +96 -82
- doctr/io/html.py +1 -3
- doctr/io/image/__init__.py +3 -3
- doctr/io/image/base.py +2 -5
- doctr/io/image/pytorch.py +3 -12
- doctr/io/image/tensorflow.py +2 -11
- doctr/io/pdf.py +5 -7
- doctr/io/reader.py +5 -11
- doctr/models/_utils.py +15 -23
- doctr/models/builder.py +30 -48
- doctr/models/classification/__init__.py +1 -0
- doctr/models/classification/magc_resnet/__init__.py +3 -3
- doctr/models/classification/magc_resnet/pytorch.py +11 -15
- doctr/models/classification/magc_resnet/tensorflow.py +11 -14
- doctr/models/classification/mobilenet/__init__.py +3 -3
- doctr/models/classification/mobilenet/pytorch.py +20 -18
- doctr/models/classification/mobilenet/tensorflow.py +19 -23
- doctr/models/classification/predictor/__init__.py +4 -4
- doctr/models/classification/predictor/pytorch.py +7 -9
- doctr/models/classification/predictor/tensorflow.py +6 -8
- doctr/models/classification/resnet/__init__.py +4 -4
- doctr/models/classification/resnet/pytorch.py +47 -34
- doctr/models/classification/resnet/tensorflow.py +45 -35
- doctr/models/classification/textnet/__init__.py +3 -3
- doctr/models/classification/textnet/pytorch.py +20 -18
- doctr/models/classification/textnet/tensorflow.py +19 -17
- doctr/models/classification/vgg/__init__.py +3 -3
- doctr/models/classification/vgg/pytorch.py +21 -8
- doctr/models/classification/vgg/tensorflow.py +20 -14
- doctr/models/classification/vip/__init__.py +4 -0
- doctr/models/classification/vip/layers/__init__.py +4 -0
- doctr/models/classification/vip/layers/pytorch.py +615 -0
- doctr/models/classification/vip/pytorch.py +505 -0
- doctr/models/classification/vit/__init__.py +3 -3
- doctr/models/classification/vit/pytorch.py +18 -15
- doctr/models/classification/vit/tensorflow.py +15 -12
- doctr/models/classification/zoo.py +23 -14
- doctr/models/core.py +3 -3
- doctr/models/detection/_utils/__init__.py +4 -4
- doctr/models/detection/_utils/base.py +4 -7
- doctr/models/detection/_utils/pytorch.py +1 -5
- doctr/models/detection/_utils/tensorflow.py +1 -5
- doctr/models/detection/core.py +2 -8
- doctr/models/detection/differentiable_binarization/__init__.py +4 -4
- doctr/models/detection/differentiable_binarization/base.py +10 -21
- doctr/models/detection/differentiable_binarization/pytorch.py +37 -31
- doctr/models/detection/differentiable_binarization/tensorflow.py +26 -29
- doctr/models/detection/fast/__init__.py +4 -4
- doctr/models/detection/fast/base.py +8 -17
- doctr/models/detection/fast/pytorch.py +37 -35
- doctr/models/detection/fast/tensorflow.py +24 -28
- doctr/models/detection/linknet/__init__.py +4 -4
- doctr/models/detection/linknet/base.py +8 -18
- doctr/models/detection/linknet/pytorch.py +34 -28
- doctr/models/detection/linknet/tensorflow.py +24 -25
- doctr/models/detection/predictor/__init__.py +5 -5
- doctr/models/detection/predictor/pytorch.py +6 -7
- doctr/models/detection/predictor/tensorflow.py +5 -6
- doctr/models/detection/zoo.py +27 -7
- doctr/models/factory/hub.py +6 -10
- doctr/models/kie_predictor/__init__.py +5 -5
- doctr/models/kie_predictor/base.py +4 -5
- doctr/models/kie_predictor/pytorch.py +19 -20
- doctr/models/kie_predictor/tensorflow.py +14 -15
- doctr/models/modules/layers/__init__.py +3 -3
- doctr/models/modules/layers/pytorch.py +55 -10
- doctr/models/modules/layers/tensorflow.py +5 -7
- doctr/models/modules/transformer/__init__.py +3 -3
- doctr/models/modules/transformer/pytorch.py +12 -13
- doctr/models/modules/transformer/tensorflow.py +9 -10
- doctr/models/modules/vision_transformer/__init__.py +3 -3
- doctr/models/modules/vision_transformer/pytorch.py +2 -3
- doctr/models/modules/vision_transformer/tensorflow.py +3 -3
- doctr/models/predictor/__init__.py +5 -5
- doctr/models/predictor/base.py +28 -29
- doctr/models/predictor/pytorch.py +13 -14
- doctr/models/predictor/tensorflow.py +9 -10
- doctr/models/preprocessor/__init__.py +4 -4
- doctr/models/preprocessor/pytorch.py +13 -17
- doctr/models/preprocessor/tensorflow.py +10 -14
- doctr/models/recognition/__init__.py +1 -0
- doctr/models/recognition/core.py +3 -7
- doctr/models/recognition/crnn/__init__.py +4 -4
- doctr/models/recognition/crnn/pytorch.py +30 -29
- doctr/models/recognition/crnn/tensorflow.py +21 -24
- doctr/models/recognition/master/__init__.py +3 -3
- doctr/models/recognition/master/base.py +3 -7
- doctr/models/recognition/master/pytorch.py +32 -25
- doctr/models/recognition/master/tensorflow.py +22 -25
- doctr/models/recognition/parseq/__init__.py +3 -3
- doctr/models/recognition/parseq/base.py +3 -7
- doctr/models/recognition/parseq/pytorch.py +47 -29
- doctr/models/recognition/parseq/tensorflow.py +29 -27
- doctr/models/recognition/predictor/__init__.py +5 -5
- doctr/models/recognition/predictor/_utils.py +111 -52
- doctr/models/recognition/predictor/pytorch.py +9 -9
- doctr/models/recognition/predictor/tensorflow.py +8 -9
- doctr/models/recognition/sar/__init__.py +4 -4
- doctr/models/recognition/sar/pytorch.py +30 -22
- doctr/models/recognition/sar/tensorflow.py +22 -24
- doctr/models/recognition/utils.py +57 -53
- doctr/models/recognition/viptr/__init__.py +4 -0
- doctr/models/recognition/viptr/pytorch.py +277 -0
- doctr/models/recognition/vitstr/__init__.py +4 -4
- doctr/models/recognition/vitstr/base.py +3 -7
- doctr/models/recognition/vitstr/pytorch.py +28 -21
- doctr/models/recognition/vitstr/tensorflow.py +22 -23
- doctr/models/recognition/zoo.py +27 -11
- doctr/models/utils/__init__.py +4 -4
- doctr/models/utils/pytorch.py +41 -34
- doctr/models/utils/tensorflow.py +31 -23
- doctr/models/zoo.py +1 -5
- doctr/transforms/functional/__init__.py +3 -3
- doctr/transforms/functional/base.py +4 -11
- doctr/transforms/functional/pytorch.py +20 -28
- doctr/transforms/functional/tensorflow.py +10 -22
- doctr/transforms/modules/__init__.py +4 -4
- doctr/transforms/modules/base.py +48 -55
- doctr/transforms/modules/pytorch.py +58 -22
- doctr/transforms/modules/tensorflow.py +18 -32
- doctr/utils/common_types.py +8 -9
- doctr/utils/data.py +9 -13
- doctr/utils/fonts.py +2 -7
- doctr/utils/geometry.py +17 -48
- doctr/utils/metrics.py +17 -37
- doctr/utils/multithreading.py +4 -6
- doctr/utils/reconstitution.py +9 -13
- doctr/utils/repr.py +2 -3
- doctr/utils/visualization.py +16 -29
- doctr/version.py +1 -1
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/METADATA +70 -52
- python_doctr-0.12.0.dist-info/RECORD +180 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/WHEEL +1 -1
- python_doctr-0.10.0.dist-info/RECORD +0 -173
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info/licenses}/LICENSE +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/top_level.txt +0 -0
- {python_doctr-0.10.0.dist-info → python_doctr-0.12.0.dist-info}/zip-safe +0 -0
doctr/datasets/wildreceipt.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -6,9 +6,10 @@
|
|
|
6
6
|
import json
|
|
7
7
|
import os
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
9
|
+
from typing import Any
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
|
+
from tqdm import tqdm
|
|
12
13
|
|
|
13
14
|
from .datasets import AbstractDataset
|
|
14
15
|
from .utils import convert_target_to_relative, crop_bboxes_from_image
|
|
@@ -17,9 +18,10 @@ __all__ = ["WILDRECEIPT"]
|
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
class WILDRECEIPT(AbstractDataset):
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
"""
|
|
22
|
+
WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
|
|
23
|
+
<https://arxiv.org/abs/2103.14470v1>`_ |
|
|
24
|
+
`"repository" <https://download.openmmlab.com/mmocr/data/wildreceipt.tar>`_.
|
|
23
25
|
|
|
24
26
|
.. image:: https://doctr-static.mindee.com/models?id=v0.7.0/wildreceipt-dataset.jpg&src=0
|
|
25
27
|
:align: center
|
|
@@ -34,7 +36,6 @@ class WILDRECEIPT(AbstractDataset):
|
|
|
34
36
|
>>> img, target = test_set[0]
|
|
35
37
|
|
|
36
38
|
Args:
|
|
37
|
-
----
|
|
38
39
|
img_folder: folder with all the images of the dataset
|
|
39
40
|
label_path: path to the annotations file of the dataset
|
|
40
41
|
train: whether the subset should be the training one
|
|
@@ -71,15 +72,18 @@ class WILDRECEIPT(AbstractDataset):
|
|
|
71
72
|
tmp_root = img_folder
|
|
72
73
|
self.train = train
|
|
73
74
|
np_dtype = np.float32
|
|
74
|
-
self.data:
|
|
75
|
+
self.data: list[tuple[str | Path | np.ndarray, str | dict[str, Any] | np.ndarray]] = []
|
|
75
76
|
|
|
76
77
|
with open(label_path, "r") as file:
|
|
77
78
|
data = file.read()
|
|
78
79
|
# Split the text file into separate JSON strings
|
|
79
80
|
json_strings = data.strip().split("\n")
|
|
80
|
-
box:
|
|
81
|
-
|
|
82
|
-
for json_string in
|
|
81
|
+
box: list[float] | np.ndarray
|
|
82
|
+
|
|
83
|
+
for json_string in tqdm(
|
|
84
|
+
iterable=json_strings, desc="Preparing and Loading WILDRECEIPT", total=len(json_strings)
|
|
85
|
+
):
|
|
86
|
+
_targets = []
|
|
83
87
|
json_data = json.loads(json_string)
|
|
84
88
|
img_path = json_data["file_name"]
|
|
85
89
|
annotations = json_data["annotations"]
|
doctr/file_utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -9,7 +9,6 @@ import importlib.metadata
|
|
|
9
9
|
import importlib.util
|
|
10
10
|
import logging
|
|
11
11
|
import os
|
|
12
|
-
from typing import Optional
|
|
13
12
|
|
|
14
13
|
CLASS_NAME: str = "words"
|
|
15
14
|
|
|
@@ -80,10 +79,16 @@ if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VA
|
|
|
80
79
|
else:
|
|
81
80
|
logging.info(f"TensorFlow version {_tf_version} available.")
|
|
82
81
|
ensure_keras_v2()
|
|
83
|
-
import tensorflow as tf
|
|
84
82
|
|
|
85
|
-
|
|
86
|
-
|
|
83
|
+
import warnings
|
|
84
|
+
|
|
85
|
+
warnings.simplefilter("always", DeprecationWarning)
|
|
86
|
+
warnings.warn(
|
|
87
|
+
"Support for TensorFlow in DocTR is deprecated and will be removed in the next major release (v1.0.0). "
|
|
88
|
+
"Please switch to the PyTorch backend.",
|
|
89
|
+
DeprecationWarning,
|
|
90
|
+
)
|
|
91
|
+
|
|
87
92
|
else: # pragma: no cover
|
|
88
93
|
logging.info("Disabling Tensorflow because USE_TORCH is set")
|
|
89
94
|
_tf_available = False
|
|
@@ -96,12 +101,11 @@ if not _torch_available and not _tf_available: # pragma: no cover
|
|
|
96
101
|
)
|
|
97
102
|
|
|
98
103
|
|
|
99
|
-
def requires_package(name: str, extra_message:
|
|
104
|
+
def requires_package(name: str, extra_message: str | None = None) -> None: # pragma: no cover
|
|
100
105
|
"""
|
|
101
106
|
package requirement helper
|
|
102
107
|
|
|
103
108
|
Args:
|
|
104
|
-
----
|
|
105
109
|
name: name of the package
|
|
106
110
|
extra_message: additional message to display if the package is not found
|
|
107
111
|
"""
|
doctr/io/elements.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any
|
|
7
7
|
|
|
8
8
|
from defusedxml import defuse_stdlib
|
|
9
9
|
|
|
@@ -32,8 +32,8 @@ __all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page",
|
|
|
32
32
|
class Element(NestedObject):
|
|
33
33
|
"""Implements an abstract document element with exporting and text rendering capabilities"""
|
|
34
34
|
|
|
35
|
-
_children_names:
|
|
36
|
-
_exported_keys:
|
|
35
|
+
_children_names: list[str] = []
|
|
36
|
+
_exported_keys: list[str] = []
|
|
37
37
|
|
|
38
38
|
def __init__(self, **kwargs: Any) -> None:
|
|
39
39
|
for k, v in kwargs.items():
|
|
@@ -42,7 +42,7 @@ class Element(NestedObject):
|
|
|
42
42
|
else:
|
|
43
43
|
raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{k}'")
|
|
44
44
|
|
|
45
|
-
def export(self) ->
|
|
45
|
+
def export(self) -> dict[str, Any]:
|
|
46
46
|
"""Exports the object into a nested dict format"""
|
|
47
47
|
export_dict = {k: getattr(self, k) for k in self._exported_keys}
|
|
48
48
|
for children_name in self._children_names:
|
|
@@ -56,7 +56,7 @@ class Element(NestedObject):
|
|
|
56
56
|
return export_dict
|
|
57
57
|
|
|
58
58
|
@classmethod
|
|
59
|
-
def from_dict(cls, save_dict:
|
|
59
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
60
60
|
raise NotImplementedError
|
|
61
61
|
|
|
62
62
|
def render(self) -> str:
|
|
@@ -67,7 +67,6 @@ class Word(Element):
|
|
|
67
67
|
"""Implements a word element
|
|
68
68
|
|
|
69
69
|
Args:
|
|
70
|
-
----
|
|
71
70
|
value: the text string of the word
|
|
72
71
|
confidence: the confidence associated with the text prediction
|
|
73
72
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -76,16 +75,16 @@ class Word(Element):
|
|
|
76
75
|
crop_orientation: the general orientation of the crop in degrees and its confidence
|
|
77
76
|
"""
|
|
78
77
|
|
|
79
|
-
_exported_keys:
|
|
80
|
-
_children_names:
|
|
78
|
+
_exported_keys: list[str] = ["value", "confidence", "geometry", "objectness_score", "crop_orientation"]
|
|
79
|
+
_children_names: list[str] = []
|
|
81
80
|
|
|
82
81
|
def __init__(
|
|
83
82
|
self,
|
|
84
83
|
value: str,
|
|
85
84
|
confidence: float,
|
|
86
|
-
geometry:
|
|
85
|
+
geometry: BoundingBox | np.ndarray,
|
|
87
86
|
objectness_score: float,
|
|
88
|
-
crop_orientation:
|
|
87
|
+
crop_orientation: dict[str, Any],
|
|
89
88
|
) -> None:
|
|
90
89
|
super().__init__()
|
|
91
90
|
self.value = value
|
|
@@ -102,7 +101,7 @@ class Word(Element):
|
|
|
102
101
|
return f"value='{self.value}', confidence={self.confidence:.2}"
|
|
103
102
|
|
|
104
103
|
@classmethod
|
|
105
|
-
def from_dict(cls, save_dict:
|
|
104
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
106
105
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
107
106
|
return cls(**kwargs)
|
|
108
107
|
|
|
@@ -111,15 +110,14 @@ class Artefact(Element):
|
|
|
111
110
|
"""Implements a non-textual element
|
|
112
111
|
|
|
113
112
|
Args:
|
|
114
|
-
----
|
|
115
113
|
artefact_type: the type of artefact
|
|
116
114
|
confidence: the confidence of the type prediction
|
|
117
115
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
118
116
|
the page's size.
|
|
119
117
|
"""
|
|
120
118
|
|
|
121
|
-
_exported_keys:
|
|
122
|
-
_children_names:
|
|
119
|
+
_exported_keys: list[str] = ["geometry", "type", "confidence"]
|
|
120
|
+
_children_names: list[str] = []
|
|
123
121
|
|
|
124
122
|
def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
|
|
125
123
|
super().__init__()
|
|
@@ -135,7 +133,7 @@ class Artefact(Element):
|
|
|
135
133
|
return f"type='{self.type}', confidence={self.confidence:.2}"
|
|
136
134
|
|
|
137
135
|
@classmethod
|
|
138
|
-
def from_dict(cls, save_dict:
|
|
136
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
139
137
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
140
138
|
return cls(**kwargs)
|
|
141
139
|
|
|
@@ -144,22 +142,21 @@ class Line(Element):
|
|
|
144
142
|
"""Implements a line element as a collection of words
|
|
145
143
|
|
|
146
144
|
Args:
|
|
147
|
-
----
|
|
148
145
|
words: list of word elements
|
|
149
146
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
150
147
|
the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
|
|
151
148
|
all words in it.
|
|
152
149
|
"""
|
|
153
150
|
|
|
154
|
-
_exported_keys:
|
|
155
|
-
_children_names:
|
|
156
|
-
words:
|
|
151
|
+
_exported_keys: list[str] = ["geometry", "objectness_score"]
|
|
152
|
+
_children_names: list[str] = ["words"]
|
|
153
|
+
words: list[Word] = []
|
|
157
154
|
|
|
158
155
|
def __init__(
|
|
159
156
|
self,
|
|
160
|
-
words:
|
|
161
|
-
geometry:
|
|
162
|
-
objectness_score:
|
|
157
|
+
words: list[Word],
|
|
158
|
+
geometry: BoundingBox | np.ndarray | None = None,
|
|
159
|
+
objectness_score: float | None = None,
|
|
163
160
|
) -> None:
|
|
164
161
|
# Compute the objectness score of the line
|
|
165
162
|
if objectness_score is None:
|
|
@@ -179,7 +176,7 @@ class Line(Element):
|
|
|
179
176
|
return " ".join(w.render() for w in self.words)
|
|
180
177
|
|
|
181
178
|
@classmethod
|
|
182
|
-
def from_dict(cls, save_dict:
|
|
179
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
183
180
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
184
181
|
kwargs.update({
|
|
185
182
|
"words": [Word.from_dict(_dict) for _dict in save_dict["words"]],
|
|
@@ -202,7 +199,6 @@ class Block(Element):
|
|
|
202
199
|
"""Implements a block element as a collection of lines and artefacts
|
|
203
200
|
|
|
204
201
|
Args:
|
|
205
|
-
----
|
|
206
202
|
lines: list of line elements
|
|
207
203
|
artefacts: list of artefacts
|
|
208
204
|
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
|
|
@@ -210,17 +206,17 @@ class Block(Element):
|
|
|
210
206
|
all lines and artefacts in it.
|
|
211
207
|
"""
|
|
212
208
|
|
|
213
|
-
_exported_keys:
|
|
214
|
-
_children_names:
|
|
215
|
-
lines:
|
|
216
|
-
artefacts:
|
|
209
|
+
_exported_keys: list[str] = ["geometry", "objectness_score"]
|
|
210
|
+
_children_names: list[str] = ["lines", "artefacts"]
|
|
211
|
+
lines: list[Line] = []
|
|
212
|
+
artefacts: list[Artefact] = []
|
|
217
213
|
|
|
218
214
|
def __init__(
|
|
219
215
|
self,
|
|
220
|
-
lines:
|
|
221
|
-
artefacts:
|
|
222
|
-
geometry:
|
|
223
|
-
objectness_score:
|
|
216
|
+
lines: list[Line] = [],
|
|
217
|
+
artefacts: list[Artefact] = [],
|
|
218
|
+
geometry: BoundingBox | np.ndarray | None = None,
|
|
219
|
+
objectness_score: float | None = None,
|
|
224
220
|
) -> None:
|
|
225
221
|
# Compute the objectness score of the line
|
|
226
222
|
if objectness_score is None:
|
|
@@ -243,7 +239,7 @@ class Block(Element):
|
|
|
243
239
|
return line_break.join(line.render() for line in self.lines)
|
|
244
240
|
|
|
245
241
|
@classmethod
|
|
246
|
-
def from_dict(cls, save_dict:
|
|
242
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
247
243
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
248
244
|
kwargs.update({
|
|
249
245
|
"lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
|
|
@@ -256,7 +252,6 @@ class Page(Element):
|
|
|
256
252
|
"""Implements a page element as a collection of blocks
|
|
257
253
|
|
|
258
254
|
Args:
|
|
259
|
-
----
|
|
260
255
|
page: image encoded as a numpy array in uint8
|
|
261
256
|
blocks: list of block elements
|
|
262
257
|
page_idx: the index of the page in the input raw document
|
|
@@ -265,18 +260,18 @@ class Page(Element):
|
|
|
265
260
|
language: a dictionary with the language value and confidence of the prediction
|
|
266
261
|
"""
|
|
267
262
|
|
|
268
|
-
_exported_keys:
|
|
269
|
-
_children_names:
|
|
270
|
-
blocks:
|
|
263
|
+
_exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
|
|
264
|
+
_children_names: list[str] = ["blocks"]
|
|
265
|
+
blocks: list[Block] = []
|
|
271
266
|
|
|
272
267
|
def __init__(
|
|
273
268
|
self,
|
|
274
269
|
page: np.ndarray,
|
|
275
|
-
blocks:
|
|
270
|
+
blocks: list[Block],
|
|
276
271
|
page_idx: int,
|
|
277
|
-
dimensions:
|
|
278
|
-
orientation:
|
|
279
|
-
language:
|
|
272
|
+
dimensions: tuple[int, int],
|
|
273
|
+
orientation: dict[str, Any] | None = None,
|
|
274
|
+
language: dict[str, Any] | None = None,
|
|
280
275
|
) -> None:
|
|
281
276
|
super().__init__(blocks=blocks)
|
|
282
277
|
self.page = page
|
|
@@ -311,25 +306,21 @@ class Page(Element):
|
|
|
311
306
|
"""Synthesize the page from the predictions
|
|
312
307
|
|
|
313
308
|
Args:
|
|
314
|
-
----
|
|
315
309
|
**kwargs: keyword arguments passed to the `synthesize_page` method
|
|
316
310
|
|
|
317
|
-
Returns
|
|
318
|
-
-------
|
|
311
|
+
Returns:
|
|
319
312
|
synthesized page
|
|
320
313
|
"""
|
|
321
314
|
return synthesize_page(self.export(), **kwargs)
|
|
322
315
|
|
|
323
|
-
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
|
|
316
|
+
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
|
|
324
317
|
"""Export the page as XML (hOCR-format)
|
|
325
318
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
326
319
|
|
|
327
320
|
Args:
|
|
328
|
-
----
|
|
329
321
|
file_title: the title of the XML file
|
|
330
322
|
|
|
331
323
|
Returns:
|
|
332
|
-
-------
|
|
333
324
|
a tuple of the XML byte string, and its ElementTree
|
|
334
325
|
"""
|
|
335
326
|
p_idx = self.page_idx
|
|
@@ -356,7 +347,7 @@ class Page(Element):
|
|
|
356
347
|
)
|
|
357
348
|
# Create the body
|
|
358
349
|
body = SubElement(page_hocr, "body")
|
|
359
|
-
SubElement(
|
|
350
|
+
page_div = SubElement(
|
|
360
351
|
body,
|
|
361
352
|
"div",
|
|
362
353
|
attrib={
|
|
@@ -371,7 +362,7 @@ class Page(Element):
|
|
|
371
362
|
raise TypeError("XML export is only available for straight bounding boxes for now.")
|
|
372
363
|
(xmin, ymin), (xmax, ymax) = block.geometry
|
|
373
364
|
block_div = SubElement(
|
|
374
|
-
|
|
365
|
+
page_div,
|
|
375
366
|
"div",
|
|
376
367
|
attrib={
|
|
377
368
|
"class": "ocr_carea",
|
|
@@ -427,7 +418,7 @@ class Page(Element):
|
|
|
427
418
|
return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
|
|
428
419
|
|
|
429
420
|
@classmethod
|
|
430
|
-
def from_dict(cls, save_dict:
|
|
421
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
431
422
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
432
423
|
kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
|
|
433
424
|
return cls(**kwargs)
|
|
@@ -437,7 +428,6 @@ class KIEPage(Element):
|
|
|
437
428
|
"""Implements a KIE page element as a collection of predictions
|
|
438
429
|
|
|
439
430
|
Args:
|
|
440
|
-
----
|
|
441
431
|
predictions: Dictionary with list of block elements for each detection class
|
|
442
432
|
page: image encoded as a numpy array in uint8
|
|
443
433
|
page_idx: the index of the page in the input raw document
|
|
@@ -446,18 +436,18 @@ class KIEPage(Element):
|
|
|
446
436
|
language: a dictionary with the language value and confidence of the prediction
|
|
447
437
|
"""
|
|
448
438
|
|
|
449
|
-
_exported_keys:
|
|
450
|
-
_children_names:
|
|
451
|
-
predictions:
|
|
439
|
+
_exported_keys: list[str] = ["page_idx", "dimensions", "orientation", "language"]
|
|
440
|
+
_children_names: list[str] = ["predictions"]
|
|
441
|
+
predictions: dict[str, list[Prediction]] = {}
|
|
452
442
|
|
|
453
443
|
def __init__(
|
|
454
444
|
self,
|
|
455
445
|
page: np.ndarray,
|
|
456
|
-
predictions:
|
|
446
|
+
predictions: dict[str, list[Prediction]],
|
|
457
447
|
page_idx: int,
|
|
458
|
-
dimensions:
|
|
459
|
-
orientation:
|
|
460
|
-
language:
|
|
448
|
+
dimensions: tuple[int, int],
|
|
449
|
+
orientation: dict[str, Any] | None = None,
|
|
450
|
+
language: dict[str, Any] | None = None,
|
|
461
451
|
) -> None:
|
|
462
452
|
super().__init__(predictions=predictions)
|
|
463
453
|
self.page = page
|
|
@@ -496,25 +486,21 @@ class KIEPage(Element):
|
|
|
496
486
|
"""Synthesize the page from the predictions
|
|
497
487
|
|
|
498
488
|
Args:
|
|
499
|
-
----
|
|
500
489
|
**kwargs: keyword arguments passed to the `synthesize_kie_page` method
|
|
501
490
|
|
|
502
491
|
Returns:
|
|
503
|
-
-------
|
|
504
492
|
synthesized page
|
|
505
493
|
"""
|
|
506
494
|
return synthesize_kie_page(self.export(), **kwargs)
|
|
507
495
|
|
|
508
|
-
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") ->
|
|
496
|
+
def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> tuple[bytes, ET.ElementTree]:
|
|
509
497
|
"""Export the page as XML (hOCR-format)
|
|
510
498
|
convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
|
|
511
499
|
|
|
512
500
|
Args:
|
|
513
|
-
----
|
|
514
501
|
file_title: the title of the XML file
|
|
515
502
|
|
|
516
503
|
Returns:
|
|
517
|
-
-------
|
|
518
504
|
a tuple of the XML byte string, and its ElementTree
|
|
519
505
|
"""
|
|
520
506
|
p_idx = self.page_idx
|
|
@@ -564,13 +550,47 @@ class KIEPage(Element):
|
|
|
564
550
|
{int(round(xmax * width))} {int(round(ymax * height))}",
|
|
565
551
|
},
|
|
566
552
|
)
|
|
567
|
-
|
|
553
|
+
# NOTE: ocr_par, ocr_line and ocrx_word are the same because the KIE predictions contain only words
|
|
554
|
+
# This is a workaround to make it PDF/A compatible
|
|
555
|
+
par_div = SubElement(
|
|
556
|
+
prediction_div,
|
|
557
|
+
"p",
|
|
558
|
+
attrib={
|
|
559
|
+
"class": "ocr_par",
|
|
560
|
+
"id": f"{class_name}_par_{prediction_count}",
|
|
561
|
+
"title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
|
|
562
|
+
{int(round(xmax * width))} {int(round(ymax * height))}",
|
|
563
|
+
},
|
|
564
|
+
)
|
|
565
|
+
line_span = SubElement(
|
|
566
|
+
par_div,
|
|
567
|
+
"span",
|
|
568
|
+
attrib={
|
|
569
|
+
"class": "ocr_line",
|
|
570
|
+
"id": f"{class_name}_line_{prediction_count}",
|
|
571
|
+
"title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
|
|
572
|
+
{int(round(xmax * width))} {int(round(ymax * height))}; \
|
|
573
|
+
baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
|
|
574
|
+
},
|
|
575
|
+
)
|
|
576
|
+
word_div = SubElement(
|
|
577
|
+
line_span,
|
|
578
|
+
"span",
|
|
579
|
+
attrib={
|
|
580
|
+
"class": "ocrx_word",
|
|
581
|
+
"id": f"{class_name}_word_{prediction_count}",
|
|
582
|
+
"title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
|
|
583
|
+
{int(round(xmax * width))} {int(round(ymax * height))}; \
|
|
584
|
+
x_wconf {int(round(prediction.confidence * 100))}",
|
|
585
|
+
},
|
|
586
|
+
)
|
|
587
|
+
word_div.text = prediction.value
|
|
568
588
|
prediction_count += 1
|
|
569
589
|
|
|
570
590
|
return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
|
|
571
591
|
|
|
572
592
|
@classmethod
|
|
573
|
-
def from_dict(cls, save_dict:
|
|
593
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
574
594
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
575
595
|
kwargs.update({
|
|
576
596
|
"predictions": [Prediction.from_dict(predictions_dict) for predictions_dict in save_dict["predictions"]]
|
|
@@ -582,16 +602,15 @@ class Document(Element):
|
|
|
582
602
|
"""Implements a document element as a collection of pages
|
|
583
603
|
|
|
584
604
|
Args:
|
|
585
|
-
----
|
|
586
605
|
pages: list of page elements
|
|
587
606
|
"""
|
|
588
607
|
|
|
589
|
-
_children_names:
|
|
590
|
-
pages:
|
|
608
|
+
_children_names: list[str] = ["pages"]
|
|
609
|
+
pages: list[Page] = []
|
|
591
610
|
|
|
592
611
|
def __init__(
|
|
593
612
|
self,
|
|
594
|
-
pages:
|
|
613
|
+
pages: list[Page],
|
|
595
614
|
) -> None:
|
|
596
615
|
super().__init__(pages=pages)
|
|
597
616
|
|
|
@@ -604,34 +623,30 @@ class Document(Element):
|
|
|
604
623
|
for result in self.pages:
|
|
605
624
|
result.show(**kwargs)
|
|
606
625
|
|
|
607
|
-
def synthesize(self, **kwargs) ->
|
|
626
|
+
def synthesize(self, **kwargs) -> list[np.ndarray]:
|
|
608
627
|
"""Synthesize all pages from their predictions
|
|
609
628
|
|
|
610
629
|
Args:
|
|
611
|
-
----
|
|
612
630
|
**kwargs: keyword arguments passed to the `Page.synthesize` method
|
|
613
631
|
|
|
614
|
-
Returns
|
|
615
|
-
-------
|
|
632
|
+
Returns:
|
|
616
633
|
list of synthesized pages
|
|
617
634
|
"""
|
|
618
635
|
return [page.synthesize(**kwargs) for page in self.pages]
|
|
619
636
|
|
|
620
|
-
def export_as_xml(self, **kwargs) ->
|
|
637
|
+
def export_as_xml(self, **kwargs) -> list[tuple[bytes, ET.ElementTree]]:
|
|
621
638
|
"""Export the document as XML (hOCR-format)
|
|
622
639
|
|
|
623
640
|
Args:
|
|
624
|
-
----
|
|
625
641
|
**kwargs: additional keyword arguments passed to the Page.export_as_xml method
|
|
626
642
|
|
|
627
643
|
Returns:
|
|
628
|
-
-------
|
|
629
644
|
list of tuple of (bytes, ElementTree)
|
|
630
645
|
"""
|
|
631
646
|
return [page.export_as_xml(**kwargs) for page in self.pages]
|
|
632
647
|
|
|
633
648
|
@classmethod
|
|
634
|
-
def from_dict(cls, save_dict:
|
|
649
|
+
def from_dict(cls, save_dict: dict[str, Any], **kwargs):
|
|
635
650
|
kwargs = {k: save_dict[k] for k in cls._exported_keys}
|
|
636
651
|
kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
|
|
637
652
|
return cls(**kwargs)
|
|
@@ -641,15 +656,14 @@ class KIEDocument(Document):
|
|
|
641
656
|
"""Implements a document element as a collection of pages
|
|
642
657
|
|
|
643
658
|
Args:
|
|
644
|
-
----
|
|
645
659
|
pages: list of page elements
|
|
646
660
|
"""
|
|
647
661
|
|
|
648
|
-
_children_names:
|
|
649
|
-
pages:
|
|
662
|
+
_children_names: list[str] = ["pages"]
|
|
663
|
+
pages: list[KIEPage] = [] # type: ignore[assignment]
|
|
650
664
|
|
|
651
665
|
def __init__(
|
|
652
666
|
self,
|
|
653
|
-
pages:
|
|
667
|
+
pages: list[KIEPage],
|
|
654
668
|
) -> None:
|
|
655
669
|
super().__init__(pages=pages) # type: ignore[arg-type]
|
doctr/io/html.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
@@ -15,12 +15,10 @@ def read_html(url: str, **kwargs: Any) -> bytes:
|
|
|
15
15
|
>>> doc = read_html("https://www.yoursite.com")
|
|
16
16
|
|
|
17
17
|
Args:
|
|
18
|
-
----
|
|
19
18
|
url: URL of the target web page
|
|
20
19
|
**kwargs: keyword arguments from `weasyprint.HTML`
|
|
21
20
|
|
|
22
21
|
Returns:
|
|
23
|
-
-------
|
|
24
22
|
decoded PDF file as a bytes stream
|
|
25
23
|
"""
|
|
26
24
|
from weasyprint import HTML
|
doctr/io/image/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ from doctr.file_utils import is_tf_available, is_torch_available
|
|
|
2
2
|
|
|
3
3
|
from .base import *
|
|
4
4
|
|
|
5
|
-
if
|
|
6
|
-
from .tensorflow import *
|
|
7
|
-
elif is_torch_available():
|
|
5
|
+
if is_torch_available():
|
|
8
6
|
from .pytorch import *
|
|
7
|
+
elif is_tf_available():
|
|
8
|
+
from .tensorflow import *
|
doctr/io/image/base.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
# Copyright (C) 2021-
|
|
1
|
+
# Copyright (C) 2021-2025, Mindee.
|
|
2
2
|
|
|
3
3
|
# This program is licensed under the Apache License 2.0.
|
|
4
4
|
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
5
|
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
from typing import Optional, Tuple
|
|
8
7
|
|
|
9
8
|
import cv2
|
|
10
9
|
import numpy as np
|
|
@@ -16,7 +15,7 @@ __all__ = ["read_img_as_numpy"]
|
|
|
16
15
|
|
|
17
16
|
def read_img_as_numpy(
|
|
18
17
|
file: AbstractFile,
|
|
19
|
-
output_size:
|
|
18
|
+
output_size: tuple[int, int] | None = None,
|
|
20
19
|
rgb_output: bool = True,
|
|
21
20
|
) -> np.ndarray:
|
|
22
21
|
"""Read an image file into numpy format
|
|
@@ -25,13 +24,11 @@ def read_img_as_numpy(
|
|
|
25
24
|
>>> page = read_img_as_numpy("path/to/your/doc.jpg")
|
|
26
25
|
|
|
27
26
|
Args:
|
|
28
|
-
----
|
|
29
27
|
file: the path to the image file
|
|
30
28
|
output_size: the expected output size of each page in format H x W
|
|
31
29
|
rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
|
|
32
30
|
|
|
33
31
|
Returns:
|
|
34
|
-
-------
|
|
35
32
|
the page decoded as numpy ndarray of shape H x W x 3
|
|
36
33
|
"""
|
|
37
34
|
if isinstance(file, (str, Path)):
|