onnxtr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxtr/__init__.py +2 -0
- onnxtr/contrib/__init__.py +0 -0
- onnxtr/contrib/artefacts.py +131 -0
- onnxtr/contrib/base.py +105 -0
- onnxtr/file_utils.py +33 -0
- onnxtr/io/__init__.py +5 -0
- onnxtr/io/elements.py +455 -0
- onnxtr/io/html.py +28 -0
- onnxtr/io/image.py +56 -0
- onnxtr/io/pdf.py +42 -0
- onnxtr/io/reader.py +85 -0
- onnxtr/models/__init__.py +4 -0
- onnxtr/models/_utils.py +141 -0
- onnxtr/models/builder.py +355 -0
- onnxtr/models/classification/__init__.py +2 -0
- onnxtr/models/classification/models/__init__.py +1 -0
- onnxtr/models/classification/models/mobilenet.py +120 -0
- onnxtr/models/classification/predictor/__init__.py +1 -0
- onnxtr/models/classification/predictor/base.py +57 -0
- onnxtr/models/classification/zoo.py +76 -0
- onnxtr/models/detection/__init__.py +2 -0
- onnxtr/models/detection/core.py +101 -0
- onnxtr/models/detection/models/__init__.py +3 -0
- onnxtr/models/detection/models/differentiable_binarization.py +159 -0
- onnxtr/models/detection/models/fast.py +160 -0
- onnxtr/models/detection/models/linknet.py +160 -0
- onnxtr/models/detection/postprocessor/__init__.py +0 -0
- onnxtr/models/detection/postprocessor/base.py +144 -0
- onnxtr/models/detection/predictor/__init__.py +1 -0
- onnxtr/models/detection/predictor/base.py +54 -0
- onnxtr/models/detection/zoo.py +73 -0
- onnxtr/models/engine.py +50 -0
- onnxtr/models/predictor/__init__.py +1 -0
- onnxtr/models/predictor/base.py +175 -0
- onnxtr/models/predictor/predictor.py +145 -0
- onnxtr/models/preprocessor/__init__.py +1 -0
- onnxtr/models/preprocessor/base.py +118 -0
- onnxtr/models/recognition/__init__.py +2 -0
- onnxtr/models/recognition/core.py +28 -0
- onnxtr/models/recognition/models/__init__.py +5 -0
- onnxtr/models/recognition/models/crnn.py +226 -0
- onnxtr/models/recognition/models/master.py +145 -0
- onnxtr/models/recognition/models/parseq.py +134 -0
- onnxtr/models/recognition/models/sar.py +134 -0
- onnxtr/models/recognition/models/vitstr.py +166 -0
- onnxtr/models/recognition/predictor/__init__.py +1 -0
- onnxtr/models/recognition/predictor/_utils.py +86 -0
- onnxtr/models/recognition/predictor/base.py +79 -0
- onnxtr/models/recognition/utils.py +89 -0
- onnxtr/models/recognition/zoo.py +69 -0
- onnxtr/models/zoo.py +114 -0
- onnxtr/transforms/__init__.py +1 -0
- onnxtr/transforms/base.py +112 -0
- onnxtr/utils/__init__.py +4 -0
- onnxtr/utils/common_types.py +18 -0
- onnxtr/utils/data.py +126 -0
- onnxtr/utils/fonts.py +41 -0
- onnxtr/utils/geometry.py +498 -0
- onnxtr/utils/multithreading.py +50 -0
- onnxtr/utils/reconstitution.py +70 -0
- onnxtr/utils/repr.py +64 -0
- onnxtr/utils/visualization.py +291 -0
- onnxtr/utils/vocabs.py +71 -0
- onnxtr/version.py +1 -0
- onnxtr-0.1.0.dist-info/LICENSE +201 -0
- onnxtr-0.1.0.dist-info/METADATA +481 -0
- onnxtr-0.1.0.dist-info/RECORD +70 -0
- onnxtr-0.1.0.dist-info/WHEEL +5 -0
- onnxtr-0.1.0.dist-info/top_level.txt +2 -0
- onnxtr-0.1.0.dist-info/zip-safe +1 -0
onnxtr/io/elements.py
ADDED
|
@@ -0,0 +1,455 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee | Felix Dittrich.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
|
|
8
|
+
from defusedxml import defuse_stdlib
|
|
9
|
+
|
|
10
|
+
defuse_stdlib()
|
|
11
|
+
from xml.etree import ElementTree as ET
|
|
12
|
+
from xml.etree.ElementTree import Element as ETElement
|
|
13
|
+
from xml.etree.ElementTree import SubElement
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
import onnxtr
|
|
18
|
+
from onnxtr.file_utils import requires_package
|
|
19
|
+
from onnxtr.utils.common_types import BoundingBox
|
|
20
|
+
from onnxtr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
|
|
21
|
+
from onnxtr.utils.reconstitution import synthesize_page
|
|
22
|
+
from onnxtr.utils.repr import NestedObject
|
|
23
|
+
|
|
24
|
+
try: # optional dependency for visualization
|
|
25
|
+
from onnxtr.utils.visualization import visualize_page
|
|
26
|
+
except ModuleNotFoundError: # pragma: no cover
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
__all__ = ["Element", "Word", "Artefact", "Line", "Block", "Page", "Document"]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class Element(NestedObject):
    """Abstract document element that can be exported to a nested dict and rendered as text

    Subclasses declare the names of their child collections in `_children_names` and the names of the plain
    attributes to serialize in `_exported_keys`.
    """

    _children_names: List[str] = []
    _exported_keys: List[str] = []

    def __init__(self, **kwargs: Any) -> None:
        """Attach child collections passed by keyword

        Args:
        ----
            **kwargs: child collections, keyed by a name listed in `_children_names`

        Raises:
        ------
            KeyError: if a keyword is not a declared child name
        """
        for name, children in kwargs.items():
            # Guard clause: anything that is not a declared child collection is rejected
            if name not in self._children_names:
                raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{name}'")
            setattr(self, name, children)

    def export(self) -> Dict[str, Any]:
        """Exports the object into a nested dict format

        Returns
        -------
            a dict holding the exported attributes first, then one list of exported children per child name
        """
        export_dict: Dict[str, Any] = {}
        for key in self._exported_keys:
            export_dict[key] = getattr(self, key)

        # Children are exported recursively through their own `export` method
        for children_name in self._children_names:
            children = getattr(self, children_name)
            export_dict[children_name] = [child.export() for child in children]

        return export_dict

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild an element from its exported dict (must be implemented by subclasses)"""
        raise NotImplementedError

    def render(self) -> str:
        """Renders the full text of the element (must be implemented by subclasses)"""
        raise NotImplementedError
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Word(Element):
    """Implements a word element

    Args:
    ----
        value: the text string of the word
        confidence: the confidence associated with the text prediction
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size (a numpy array is accepted as well)
        crop_orientation: the general orientation of the crop in degrees and its confidence
    """

    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
    _children_names: List[str] = []

    def __init__(
        self,
        value: str,
        confidence: float,
        geometry: Union[BoundingBox, np.ndarray],
        crop_orientation: Dict[str, Any],
    ) -> None:
        # A word is a leaf element: no child collection is forwarded to the base class
        super().__init__()
        self.value, self.confidence = value, confidence
        self.geometry, self.crop_orientation = geometry, crop_orientation

    def render(self) -> str:
        """Renders the full text of the element"""
        return self.value

    def extra_repr(self) -> str:
        """Short summary (text and confidence with 2 significant digits) used in the object representation"""
        return f"value='{self.value}', confidence={self.confidence:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a word from its exported dict

        Args:
        ----
            save_dict: dict holding every key listed in `_exported_keys`
            **kwargs: ignored, the constructor arguments are read from `save_dict` only

        Returns:
        -------
            the reconstructed word
        """
        return cls(**{key: save_dict[key] for key in cls._exported_keys})
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class Artefact(Element):
    """Implements a non-textual element

    Args:
    ----
        artefact_type: the type of artefact
        confidence: the confidence of the type prediction
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size.
    """

    _exported_keys: List[str] = ["geometry", "type", "confidence"]
    _children_names: List[str] = []

    def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
        super().__init__()
        self.geometry = geometry
        # Stored (and exported) as `type`, although the constructor argument is named `artefact_type`
        self.type = artefact_type
        self.confidence = confidence

    def render(self) -> str:
        """Renders the full text of the element"""
        return f"[{self.type.upper()}]"

    def extra_repr(self) -> str:
        """Short summary (type and confidence with 2 significant digits) used in the object representation"""
        return f"type='{self.type}', confidence={self.confidence:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild an artefact from its exported dict

        Args:
        ----
            save_dict: dict holding the "geometry", "type" and "confidence" keys
            **kwargs: ignored, the constructor arguments are read from `save_dict` only

        Returns:
        -------
            the reconstructed artefact

        Raises:
        ------
            KeyError: if one of the expected keys is missing from `save_dict`
        """
        # The exported key is "type" while the constructor expects `artefact_type`: forwarding the exported
        # keys verbatim would fail with an unexpected keyword argument, so the mapping is made explicit.
        return cls(
            artefact_type=save_dict["type"],
            confidence=save_dict["confidence"],
            geometry=save_dict["geometry"],
        )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class Line(Element):
    """Implements a line element as a collection of words

    Args:
    ----
        words: list of word elements
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
            all words in it.
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["words"]
    words: List[Word] = []

    def __init__(
        self,
        words: List[Word],
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        if geometry is None:
            # No geometry given: enclose all the words. The first word decides whether boxes are rotated
            # (geometry of length 4) or straight.
            if len(words[0].geometry) == 4:
                box_resolution_fn = resolve_enclosing_rbbox
            else:
                box_resolution_fn = resolve_enclosing_bbox
            word_boxes = [word.geometry for word in words]
            geometry = box_resolution_fn(word_boxes)  # type: ignore[operator]

        super().__init__(words=words)
        self.geometry = geometry

    def render(self) -> str:
        """Renders the full text of the element"""
        rendered_words = [word.render() for word in self.words]
        return " ".join(rendered_words)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a line (and its words) from its exported dict

        Args:
        ----
            save_dict: dict holding every key listed in `_exported_keys` plus a "words" list of exported words
            **kwargs: ignored, the constructor arguments are read from `save_dict` only

        Returns:
        -------
            the reconstructed line
        """
        init_kwargs = {key: save_dict[key] for key in cls._exported_keys}
        init_kwargs["words"] = [Word.from_dict(word_dict) for word_dict in save_dict["words"]]
        return cls(**init_kwargs)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class Block(Element):
    """Implements a block element as a collection of lines and artefacts

    Args:
    ----
        lines: list of line elements (defaults to an empty list)
        artefacts: list of artefacts (defaults to an empty list)
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
            all lines and artefacts in it.

    Raises:
    ------
        IndexError: if `geometry` is not specified and the block has neither lines nor artefacts
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["lines", "artefacts"]
    lines: List[Line] = []
    artefacts: List[Artefact] = []

    def __init__(
        self,
        lines: Optional[List[Line]] = None,
        artefacts: Optional[List[Artefact]] = None,
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        # The lists end up stored on the instance: a fresh list per call avoids sharing one mutable default
        # object between every block created without explicit lines / artefacts.
        lines = [] if lines is None else lines
        artefacts = [] if artefacts is None else artefacts

        # Resolve the geometry using the smallest enclosing bounding box
        if geometry is None:
            line_boxes = [word.geometry for line in lines for word in line.words]
            artefact_boxes = [artefact.geometry for artefact in artefacts]
            # The first line decides whether boxes are rotated (numpy array) or straight; a block made of
            # artefacts only falls back to its first artefact. With no element at all, the lookup raises
            # IndexError since there is nothing to enclose.
            reference_geometry = (lines or artefacts)[0].geometry
            box_resolution_fn = (
                resolve_enclosing_rbbox if isinstance(reference_geometry, np.ndarray) else resolve_enclosing_bbox
            )
            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]

        super().__init__(lines=lines, artefacts=artefacts)
        self.geometry = geometry

    def render(self, line_break: str = "\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            line_break: separator inserted between the rendered lines

        Returns:
        -------
            the text of all lines (artefacts are not rendered)
        """
        return line_break.join(line.render() for line in self.lines)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a block (with its lines and artefacts) from its exported dict

        Args:
        ----
            save_dict: dict holding every key listed in `_exported_keys` plus the "lines" and "artefacts" lists
            **kwargs: ignored, the constructor arguments are read from `save_dict` only

        Returns:
        -------
            the reconstructed block
        """
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({
            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
        })
        return cls(**kwargs)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
class Page(Element):
    """Implements a page element as a collection of blocks

    Args:
    ----
        page: image encoded as a numpy array in uint8
        blocks: list of block elements
        page_idx: the index of the page in the input raw document
        dimensions: the page size in pixels in format (height, width)
        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
        language: a dictionary with the language value and confidence of the prediction
    """

    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
    _children_names: List[str] = ["blocks"]
    blocks: List[Block] = []

    def __init__(
        self,
        page: np.ndarray,
        blocks: List[Block],
        page_idx: int,
        dimensions: Tuple[int, int],
        orientation: Optional[Dict[str, Any]] = None,
        language: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__(blocks=blocks)
        self.page = page
        self.page_idx = page_idx
        self.dimensions = dimensions
        # Anything that is not a dict is replaced by an "unknown" placeholder with the same keys
        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)

    def render(self, block_break: str = "\n\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            block_break: separator inserted between the rendered blocks
        """
        return block_break.join(b.render() for b in self.blocks)

    def extra_repr(self) -> str:
        """Short summary (page dimensions) used in the object representation"""
        return f"dimensions={self.dimensions}"

    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
            interactive: whether the display should be interactive
            preserve_aspect_ratio: pass True if you passed True to the predictor
            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
        """
        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
        # Imported lazily: matplotlib is an optional dependency, checked just above
        import matplotlib.pyplot as plt

        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
        plt.show(**kwargs)

    def synthesize(self, **kwargs) -> np.ndarray:
        """Synthesize the page from the predictions

        Args:
        ----
            **kwargs: additional keyword arguments passed to `synthesize_page`

        Returns
        -------
            synthesized page
        """
        return synthesize_page(self.export(), **kwargs)

    @staticmethod
    def _bbox_title(geometry: Any, width: int, height: int) -> str:
        """Format a relative straight box as the hOCR `bbox` property, in absolute pixel coordinates"""
        (xmin, ymin), (xmax, ymax) = geometry
        # Adjacent f-strings instead of a backslash continuation inside one literal: a continuation would
        # embed the indentation of the following source line in the attribute value.
        return (
            f"bbox {int(round(xmin * width))} {int(round(ymin * height))} "
            f"{int(round(xmax * width))} {int(round(ymax * height))}"
        )

    def export_as_xml(self, file_title: str = "OnnxTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
        """Export the page as XML (hOCR-format)
        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

        Args:
        ----
            file_title: the title of the XML file

        Returns:
        -------
            a tuple of the XML byte string, and its ElementTree

        Raises:
        ------
            TypeError: if a block uses a rotated bounding box
        """
        p_idx = self.page_idx
        block_count: int = 1
        line_count: int = 1
        word_count: int = 1
        height, width = self.dimensions
        # `self.language` is a dict with "value" / "confidence" keys: use the predicted value when there is
        # one, and fall back to English otherwise
        language = self.language.get("value") or "en"
        # Create the XML root element
        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
        # Create the header / SubElements of the root element
        head = SubElement(page_hocr, "head")
        SubElement(head, "title").text = file_title
        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
        SubElement(
            head,
            "meta",
            # "ocr-system" identifies the producing system by name and version
            attrib={"name": "ocr-system", "content": f"onnxtr {onnxtr.__version__}"},  # type: ignore[attr-defined]
        )
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
        )
        # Create the body
        body = SubElement(page_hocr, "body")
        # NOTE(review): `ppageno` is hard-coded to 0 and the blocks below are attached to <body>, next to
        # (not inside) this ocr_page div — confirm whether that is intended.
        SubElement(
            body,
            "div",
            attrib={
                "class": "ocr_page",
                "id": f"page_{p_idx + 1}",
                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
            },
        )
        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
        for block in self.blocks:
            if len(block.geometry) != 2:
                raise TypeError("XML export is only available for straight bounding boxes for now.")
            block_title = self._bbox_title(block.geometry, width, height)
            block_div = SubElement(
                body,
                "div",
                attrib={
                    "class": "ocr_carea",
                    "id": f"block_{block_count}",
                    "title": block_title,
                },
            )
            # Each block holds a single paragraph covering the same area
            paragraph = SubElement(
                block_div,
                "p",
                attrib={
                    "class": "ocr_par",
                    "id": f"par_{block_count}",
                    "title": block_title,
                },
            )
            block_count += 1
            for line in block.lines:
                # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initialized to 0
                line_span = SubElement(
                    paragraph,
                    "span",
                    attrib={
                        "class": "ocr_line",
                        "id": f"line_{line_count}",
                        "title": f"{self._bbox_title(line.geometry, width, height)}; "
                        "baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
                    },
                )
                line_count += 1
                for word in line.words:
                    conf = word.confidence
                    word_div = SubElement(
                        line_span,
                        "span",
                        attrib={
                            "class": "ocrx_word",
                            "id": f"word_{word_count}",
                            # The confidence is exported as an integer percentage
                            "title": f"{self._bbox_title(word.geometry, width, height)}; "
                            f"x_wconf {int(round(conf * 100))}",
                        },
                    )
                    # set the text
                    word_div.text = word.value
                    word_count += 1

        return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a page (and its blocks) from its exported dict

        Args:
        ----
            save_dict: dict holding every key listed in `_exported_keys` plus a "blocks" list of exported blocks
            **kwargs: extra constructor arguments, typically `page` (the page image), which is not exported

        Returns:
        -------
            the reconstructed page; its `page` attribute is None when no image is provided
        """
        # Caller-provided arguments come first so that the values read from `save_dict` take precedence
        init_kwargs: Dict[str, Any] = dict(kwargs)
        init_kwargs.update({k: save_dict[k] for k in cls._exported_keys})
        init_kwargs["blocks"] = [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]
        # The image is a required constructor argument but is not exported: without this fallback the
        # reconstruction could never succeed
        init_kwargs.setdefault("page", save_dict.get("page"))
        return cls(**init_kwargs)
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
class Document(Element):
    """Implements a document element as a collection of pages

    Args:
    ----
        pages: list of page elements
    """

    _children_names: List[str] = ["pages"]
    pages: List[Page] = []

    def __init__(
        self,
        pages: List[Page],
    ) -> None:
        super().__init__(pages=pages)

    def render(self, page_break: str = "\n\n\n\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            page_break: separator inserted between the rendered pages
        """
        return page_break.join(p.render() for p in self.pages)

    def show(self, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
        ----
            **kwargs: keyword arguments passed to the `show` method of every page
        """
        for result in self.pages:
            result.show(**kwargs)

    def synthesize(self, **kwargs) -> List[np.ndarray]:
        """Synthesize all pages from their predictions

        Args:
        ----
            **kwargs: keyword arguments passed to the `synthesize` method of every page

        Returns
        -------
            list of synthesized pages
        """
        # The keyword arguments are forwarded to each page instead of being silently dropped
        return [page.synthesize(**kwargs) for page in self.pages]

    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
        """Export the document as XML (hOCR-format)

        Args:
        ----
            **kwargs: additional keyword arguments passed to the Page.export_as_xml method

        Returns:
        -------
            list of tuple of (bytes, ElementTree)
        """
        return [page.export_as_xml(**kwargs) for page in self.pages]

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a document (and its pages) from its exported dict

        Args:
        ----
            save_dict: dict holding every key listed in `_exported_keys` plus a "pages" list of exported pages
            **kwargs: ignored, the constructor arguments are read from `save_dict` only

        Returns:
        -------
            the reconstructed document
        """
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
        return cls(**kwargs)
|
onnxtr/io/html.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee | Felix Dittrich.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
__all__ = ["read_html"]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def read_html(url: str, **kwargs: Any) -> bytes:
    """Render a web page as a PDF document

    >>> from onnxtr.io import read_html
    >>> doc = read_html("https://www.yoursite.com")

    Args:
    ----
        url: URL of the target web page
        **kwargs: keyword arguments from `weasyprint.HTML`

    Returns:
    -------
        the PDF rendering of the page as a bytes stream
    """
    # Imported lazily so that weasyprint is only needed when this function is actually called
    from weasyprint import HTML

    document = HTML(url, **kwargs)
    return document.write_pdf()
|
onnxtr/io/image.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee | Felix Dittrich.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional, Tuple
|
|
8
|
+
|
|
9
|
+
import cv2
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from onnxtr.utils.common_types import AbstractFile
|
|
13
|
+
|
|
14
|
+
__all__ = ["read_img_as_numpy"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def read_img_as_numpy(
    file: AbstractFile,
    output_size: Optional[Tuple[int, int]] = None,
    rgb_output: bool = True,
) -> np.ndarray:
    """Decode an image, given as a path or as raw bytes, into a numpy array

    >>> from onnxtr.io import read_img_as_numpy
    >>> page = read_img_as_numpy("path/to/your/doc.jpg")

    Args:
    ----
        file: the path to the image file, or the encoded image as bytes
        output_size: the expected output size of each page in format H x W
        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.

    Returns:
    -------
        the page decoded as numpy ndarray of shape H x W x 3

    Raises:
    ------
        FileNotFoundError: if `file` is a path that does not point to an existing file
        TypeError: if `file` is neither a path nor bytes
        ValueError: if the image could not be decoded
    """
    if isinstance(file, bytes):
        # In-memory input: decode straight from the byte buffer
        buffer: np.ndarray = np.frombuffer(file, np.uint8)
        img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    elif isinstance(file, (str, Path)):
        if not Path(file).is_file():
            raise FileNotFoundError(f"unable to access {file}")
        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
    else:
        raise TypeError("unsupported object type for argument 'file'")

    # OpenCV signals an undecodable input by returning None
    if img is None:
        raise ValueError("unable to read file.")

    if isinstance(output_size, tuple):
        # `output_size` is (H, W) while cv2.resize expects (W, H)
        target_size = tuple(reversed(output_size))
        img = cv2.resize(img, target_size, interpolation=cv2.INTER_LINEAR)

    # Decoding yields BGR: switch the channel order on request
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) if rgb_output else img
|
onnxtr/io/pdf.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee | Felix Dittrich.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
from typing import Any, List, Optional
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import pypdfium2 as pdfium
|
|
10
|
+
|
|
11
|
+
from onnxtr.utils.common_types import AbstractFile
|
|
12
|
+
|
|
13
|
+
__all__ = ["read_pdf"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def read_pdf(
    file: AbstractFile,
    scale: float = 2,
    rgb_mode: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
) -> List[np.ndarray]:
    """Rasterise every page of a PDF document into a numpy array

    >>> from onnxtr.io import read_pdf
    >>> doc = read_pdf("path/to/your/doc.pdf")

    Args:
    ----
        file: the path to the PDF file
        scale: rendering scale (1 corresponds to 72dpi)
        rgb_mode: if True, the output will be RGB, otherwise BGR
        password: a password to unlock the document, if encrypted
        **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

    Returns:
    -------
        the list of pages decoded as numpy ndarray of shape H x W x C
    """
    document = pdfium.PdfDocument(file, password=password, autoclose=True)

    # Render the pages one by one, in document order
    rendered_pages: List[np.ndarray] = []
    for page in document:
        bitmap = page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs)
        rendered_pages.append(bitmap.to_numpy())
    return rendered_pages
|
onnxtr/io/reader.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright (C) 2021-2024, Mindee | Felix Dittrich.
|
|
2
|
+
|
|
3
|
+
# This program is licensed under the Apache License 2.0.
|
|
4
|
+
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Sequence, Union
|
|
8
|
+
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
from onnxtr.file_utils import requires_package
|
|
12
|
+
from onnxtr.utils.common_types import AbstractFile
|
|
13
|
+
|
|
14
|
+
from .html import read_html
|
|
15
|
+
from .image import read_img_as_numpy
|
|
16
|
+
from .pdf import read_pdf
|
|
17
|
+
|
|
18
|
+
__all__ = ["DocumentFile"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DocumentFile:
    """Entry point to load a document from a PDF file, a web page or images"""

    @classmethod
    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
        """Read a PDF file

        >>> from onnxtr.io import DocumentFile
        >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

        Args:
        ----
            file: the path to the PDF file or a binary stream
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        pages = read_pdf(file, **kwargs)
        return pages

    @classmethod
    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
        """Interpret a web page as a PDF document

        >>> from onnxtr.io import DocumentFile
        >>> doc = DocumentFile.from_url("https://www.yoursite.com")

        Args:
        ----
            url: the URL of the target web page
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # weasyprint is an optional dependency: fail early with installation hints when it is missing
        missing_dependency_msg = (
            "`.from_url` requires weasyprint installed.\n"
            + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation"
        )
        requires_package("weasyprint", missing_dependency_msg)

        # The page is first rendered to PDF, then read like any other PDF document
        return cls.from_pdf(read_html(url), **kwargs)

    @classmethod
    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
        """Read an image file (or a collection of image files) and convert it into an image in numpy format

        >>> from onnxtr.io import DocumentFile
        >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])

        Args:
        ----
            files: the path to the image file or a binary stream, or a collection of those
            **kwargs: additional parameters to :meth:`onnxtr.io.image.read_img_as_numpy`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # A single path / byte string is handled as a one-element collection
        sources = [files] if isinstance(files, (str, Path, bytes)) else files

        pages: List[np.ndarray] = []
        for source in sources:
            pages.append(read_img_as_numpy(source, **kwargs))
        return pages
|