onnxtr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. onnxtr/__init__.py +2 -0
  2. onnxtr/contrib/__init__.py +0 -0
  3. onnxtr/contrib/artefacts.py +131 -0
  4. onnxtr/contrib/base.py +105 -0
  5. onnxtr/file_utils.py +33 -0
  6. onnxtr/io/__init__.py +5 -0
  7. onnxtr/io/elements.py +455 -0
  8. onnxtr/io/html.py +28 -0
  9. onnxtr/io/image.py +56 -0
  10. onnxtr/io/pdf.py +42 -0
  11. onnxtr/io/reader.py +85 -0
  12. onnxtr/models/__init__.py +4 -0
  13. onnxtr/models/_utils.py +141 -0
  14. onnxtr/models/builder.py +355 -0
  15. onnxtr/models/classification/__init__.py +2 -0
  16. onnxtr/models/classification/models/__init__.py +1 -0
  17. onnxtr/models/classification/models/mobilenet.py +120 -0
  18. onnxtr/models/classification/predictor/__init__.py +1 -0
  19. onnxtr/models/classification/predictor/base.py +57 -0
  20. onnxtr/models/classification/zoo.py +76 -0
  21. onnxtr/models/detection/__init__.py +2 -0
  22. onnxtr/models/detection/core.py +101 -0
  23. onnxtr/models/detection/models/__init__.py +3 -0
  24. onnxtr/models/detection/models/differentiable_binarization.py +159 -0
  25. onnxtr/models/detection/models/fast.py +160 -0
  26. onnxtr/models/detection/models/linknet.py +160 -0
  27. onnxtr/models/detection/postprocessor/__init__.py +0 -0
  28. onnxtr/models/detection/postprocessor/base.py +144 -0
  29. onnxtr/models/detection/predictor/__init__.py +1 -0
  30. onnxtr/models/detection/predictor/base.py +54 -0
  31. onnxtr/models/detection/zoo.py +73 -0
  32. onnxtr/models/engine.py +50 -0
  33. onnxtr/models/predictor/__init__.py +1 -0
  34. onnxtr/models/predictor/base.py +175 -0
  35. onnxtr/models/predictor/predictor.py +145 -0
  36. onnxtr/models/preprocessor/__init__.py +1 -0
  37. onnxtr/models/preprocessor/base.py +118 -0
  38. onnxtr/models/recognition/__init__.py +2 -0
  39. onnxtr/models/recognition/core.py +28 -0
  40. onnxtr/models/recognition/models/__init__.py +5 -0
  41. onnxtr/models/recognition/models/crnn.py +226 -0
  42. onnxtr/models/recognition/models/master.py +145 -0
  43. onnxtr/models/recognition/models/parseq.py +134 -0
  44. onnxtr/models/recognition/models/sar.py +134 -0
  45. onnxtr/models/recognition/models/vitstr.py +166 -0
  46. onnxtr/models/recognition/predictor/__init__.py +1 -0
  47. onnxtr/models/recognition/predictor/_utils.py +86 -0
  48. onnxtr/models/recognition/predictor/base.py +79 -0
  49. onnxtr/models/recognition/utils.py +89 -0
  50. onnxtr/models/recognition/zoo.py +69 -0
  51. onnxtr/models/zoo.py +114 -0
  52. onnxtr/transforms/__init__.py +1 -0
  53. onnxtr/transforms/base.py +112 -0
  54. onnxtr/utils/__init__.py +4 -0
  55. onnxtr/utils/common_types.py +18 -0
  56. onnxtr/utils/data.py +126 -0
  57. onnxtr/utils/fonts.py +41 -0
  58. onnxtr/utils/geometry.py +498 -0
  59. onnxtr/utils/multithreading.py +50 -0
  60. onnxtr/utils/reconstitution.py +70 -0
  61. onnxtr/utils/repr.py +64 -0
  62. onnxtr/utils/visualization.py +291 -0
  63. onnxtr/utils/vocabs.py +71 -0
  64. onnxtr/version.py +1 -0
  65. onnxtr-0.1.0.dist-info/LICENSE +201 -0
  66. onnxtr-0.1.0.dist-info/METADATA +481 -0
  67. onnxtr-0.1.0.dist-info/RECORD +70 -0
  68. onnxtr-0.1.0.dist-info/WHEEL +5 -0
  69. onnxtr-0.1.0.dist-info/top_level.txt +2 -0
  70. onnxtr-0.1.0.dist-info/zip-safe +1 -0
onnxtr/io/elements.py ADDED
@@ -0,0 +1,455 @@
1
+ # Copyright (C) 2021-2024, Mindee | Felix Dittrich.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ from defusedxml import defuse_stdlib
9
+
10
+ defuse_stdlib()
11
+ from xml.etree import ElementTree as ET
12
+ from xml.etree.ElementTree import Element as ETElement
13
+ from xml.etree.ElementTree import SubElement
14
+
15
+ import numpy as np
16
+
17
+ import onnxtr
18
+ from onnxtr.file_utils import requires_package
19
+ from onnxtr.utils.common_types import BoundingBox
20
+ from onnxtr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
21
+ from onnxtr.utils.reconstitution import synthesize_page
22
+ from onnxtr.utils.repr import NestedObject
23
+
24
+ try: # optional dependency for visualization
25
+ from onnxtr.utils.visualization import visualize_page
26
+ except ModuleNotFoundError: # pragma: no cover
27
+ pass
28
+
29
+ __all__ = ["Element", "Word", "Artefact", "Line", "Block", "Page", "Document"]
30
+
31
+
32
class Element(NestedObject):
    """Abstract base for document elements that can be exported and rendered as text.

    Subclasses list the names of their child collections in `_children_names` and the plain
    attributes that should be serialized in `_exported_keys`.
    """

    _children_names: List[str] = []
    _exported_keys: List[str] = []

    def __init__(self, **kwargs: Any) -> None:
        for name, children in kwargs.items():
            # Only declared child collections may be attached through the constructor
            if name not in self._children_names:
                raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{name}'")
            setattr(self, name, children)

    def export(self) -> Dict[str, Any]:
        """Exports the object into a nested dict format

        Returns
        -------
            a dict holding the exported attributes followed by the exported children, keyed by name
        """
        exported: Dict[str, Any] = {}
        for key in self._exported_keys:
            exported[key] = getattr(self, key)
        # Children are exported recursively, one list per declared collection
        for name in self._children_names:
            exported[name] = [child.export() for child in getattr(self, name)]
        return exported

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds an element from its exported dict (to be implemented by subclasses)"""
        raise NotImplementedError

    def render(self) -> str:
        """Renders the full text of the element (to be implemented by subclasses)"""
        raise NotImplementedError
59
+
60
+
61
class Word(Element):
    """Implements a word element

    Args:
    ----
        value: the text string of the word
        confidence: the confidence associated with the text prediction
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size
        crop_orientation: the general orientation of the crop in degrees and its confidence
    """

    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
    _children_names: List[str] = []

    def __init__(
        self,
        value: str,
        confidence: float,
        geometry: Union[BoundingBox, np.ndarray],
        crop_orientation: Dict[str, Any],
    ) -> None:
        # A word is a leaf element: there are no children to register
        super().__init__()
        self.value = value
        self.confidence = confidence
        self.geometry = geometry
        self.crop_orientation = crop_orientation

    def render(self) -> str:
        """Renders the full text of the element"""
        return self.value

    def extra_repr(self) -> str:
        """Summarizes the word text and its confidence for the object representation"""
        text, score = self.value, self.confidence
        return f"value='{text}', confidence={score:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds a word from its exported dict; extra keyword arguments are ignored"""
        return cls(**{key: save_dict[key] for key in cls._exported_keys})
100
+
101
+
102
class Artefact(Element):
    """Implements a non-textual element

    Args:
    ----
        artefact_type: the type of artefact
        confidence: the confidence of the type prediction
        geometry: bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are
            relative to the page's size.
    """

    _exported_keys: List[str] = ["geometry", "type", "confidence"]
    _children_names: List[str] = []

    def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
        super().__init__()
        self.geometry = geometry
        # Stored (and exported) as `type`, while the constructor argument is named `artefact_type`
        self.type = artefact_type
        self.confidence = confidence

    def render(self) -> str:
        """Renders the full text of the element"""
        return f"[{self.type.upper()}]"

    def extra_repr(self) -> str:
        """Summarizes the artefact type and its confidence for the object representation"""
        return f"type='{self.type}', confidence={self.confidence:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds an artefact from its exported dict

        Args:
        ----
            save_dict: a dict as produced by `export`, holding the keys listed in `_exported_keys`
            **kwargs: ignored

        Returns:
        -------
            the reconstructed artefact
        """
        init_kwargs = {key: save_dict[key] for key in cls._exported_keys}
        # The exported key is `type` but the constructor expects `artefact_type`:
        # forwarding `type=` unchanged would raise a TypeError.
        init_kwargs["artefact_type"] = init_kwargs.pop("type")
        return cls(**init_kwargs)
133
+
134
+
135
class Line(Element):
    """Implements a line element as a collection of words

    Args:
    ----
        words: list of word elements
        geometry: bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
            all words in it.
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["words"]
    words: List[Word] = []

    def __init__(
        self,
        words: List[Word],
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        if geometry is None:
            # A geometry of length 4 is treated as a rotated box, anything else as a straight box
            if len(words[0].geometry) == 4:
                enclose = resolve_enclosing_rbbox
            else:
                enclose = resolve_enclosing_bbox
            # Resolve the geometry using the smallest enclosing bounding box
            word_boxes = [word.geometry for word in words]
            geometry = enclose(word_boxes)  # type: ignore[operator]

        super().__init__(words=words)
        self.geometry = geometry

    def render(self) -> str:
        """Renders the full text of the element"""
        rendered_words = [word.render() for word in self.words]
        return " ".join(rendered_words)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds a line (and its words) from its exported dict; extra keyword arguments are ignored"""
        init_kwargs = {key: save_dict[key] for key in cls._exported_keys}
        init_kwargs["words"] = [Word.from_dict(word_dict) for word_dict in save_dict["words"]]
        return cls(**init_kwargs)
175
+
176
+
177
class Block(Element):
    """Implements a block element as a collection of lines and artefacts

    Args:
    ----
        lines: list of line elements (defaults to an empty list)
        artefacts: list of artefacts (defaults to an empty list)
        geometry: bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative
            to the page's size. If not specified, it will be resolved by default to the smallest bounding box
            enclosing all lines and artefacts in it.

    Raises:
    ------
        IndexError: if `geometry` is not specified and the block has neither lines nor artefacts
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["lines", "artefacts"]
    lines: List[Line] = []
    artefacts: List[Artefact] = []

    def __init__(
        self,
        lines: Optional[List[Line]] = None,
        artefacts: Optional[List[Artefact]] = None,
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        # Use fresh lists per instance: mutable default arguments would be shared between all blocks
        lines = [] if lines is None else lines
        artefacts = [] if artefacts is None else artefacts

        # Resolve the geometry using the smallest enclosing bounding box
        if geometry is None:
            line_boxes = [word.geometry for line in lines for word in line.words]
            artefact_boxes = [artefact.geometry for artefact in artefacts]
            # Pick a reference geometry to tell rotated boxes (ndarray) from straight ones;
            # fall back to the artefacts so that artefact-only blocks can be resolved too.
            if lines:
                reference = lines[0].geometry
            elif artefacts:
                reference = artefacts[0].geometry
            else:
                raise IndexError(
                    "unable to resolve the geometry of a Block without lines or artefacts, "
                    "please pass `geometry` explicitly"
                )
            box_resolution_fn = (
                resolve_enclosing_rbbox if isinstance(reference, np.ndarray) else resolve_enclosing_bbox
            )
            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]

        super().__init__(lines=lines, artefacts=artefacts)
        self.geometry = geometry

    def render(self, line_break: str = "\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            line_break: the separator inserted between consecutive lines

        Returns:
        -------
            the text of all lines joined by `line_break` (artefacts are not rendered)
        """
        return line_break.join(line.render() for line in self.lines)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds a block (with its lines and artefacts) from its exported dict

        Args:
        ----
            save_dict: a dict as produced by `export`
            **kwargs: ignored

        Returns:
        -------
            the reconstructed block
        """
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({
            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
        })
        return cls(**kwargs)
224
+
225
+
226
class Page(Element):
    """Implements a page element as a collection of blocks

    Args:
    ----
        page: image encoded as a numpy array in uint8
        blocks: list of block elements
        page_idx: the index of the page in the input raw document
        dimensions: the page size in pixels in format (height, width)
        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
        language: a dictionary with the language value and confidence of the prediction
    """

    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
    _children_names: List[str] = ["blocks"]
    blocks: List[Block] = []

    def __init__(
        self,
        page: np.ndarray,
        blocks: List[Block],
        page_idx: int,
        dimensions: Tuple[int, int],
        orientation: Optional[Dict[str, Any]] = None,
        language: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__(blocks=blocks)
        self.page = page
        self.page_idx = page_idx
        self.dimensions = dimensions
        # Anything that is not a dict is replaced by an "unknown" placeholder
        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)

    def render(self, block_break: str = "\n\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            block_break: the separator inserted between consecutive blocks

        Returns:
        -------
            the text of all blocks joined by `block_break`
        """
        return block_break.join(b.render() for b in self.blocks)

    def extra_repr(self) -> str:
        """Summarizes the page dimensions for the object representation"""
        return f"dimensions={self.dimensions}"

    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
        ----
            interactive: whether the display should be interactive
            preserve_aspect_ratio: pass True if you passed True to the predictor
            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
        """
        requires_package("matplotlib", "`.show()` requires matplotlib & mplcursors installed")
        requires_package("mplcursors", "`.show()` requires matplotlib & mplcursors installed")
        # Imported lazily since matplotlib is an optional dependency
        import matplotlib.pyplot as plt

        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
        plt.show(**kwargs)

    def synthesize(self, **kwargs) -> np.ndarray:
        """Synthesize the page from the predictions

        Args:
        ----
            **kwargs: additional keyword arguments passed to `synthesize_page`

        Returns:
        -------
            synthesized page
        """
        return synthesize_page(self.export(), **kwargs)

    def export_as_xml(self, file_title: str = "OnnxTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
        """Export the page as XML (hOCR-format)
        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

        Args:
        ----
            file_title: the title of the XML file

        Returns:
        -------
            a tuple of the XML byte string, and its ElementTree

        Raises:
        ------
            TypeError: if a block geometry is not a straight bounding box
        """
        p_idx = self.page_idx
        block_count: int = 1
        line_count: int = 1
        word_count: int = 1
        height, width = self.dimensions
        # The language prediction is stored under the "value" key; fall back to English when it is unknown
        detected_language = self.language.get("value") if isinstance(self.language, dict) else None
        language = detected_language if detected_language else "en"

        def _bbox(geometry: Any) -> str:
            # Convert a relative straight box into an absolute hOCR `bbox` property, separated by single spaces
            (xmin, ymin), (xmax, ymax) = geometry
            return (
                f"bbox {int(round(xmin * width))} {int(round(ymin * height))} "
                f"{int(round(xmax * width))} {int(round(ymax * height))}"
            )

        # Create the XML root element
        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
        # Create the header / SubElements of the root element
        head = SubElement(page_hocr, "head")
        SubElement(head, "title").text = file_title
        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-system", "content": f"onnxtr {onnxtr.__version__}"},  # type: ignore[attr-defined]
        )
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
        )
        # Create the body
        body = SubElement(page_hocr, "body")
        SubElement(
            body,
            "div",
            attrib={
                "class": "ocr_page",
                "id": f"page_{p_idx + 1}",
                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
            },
        )
        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
        for block in self.blocks:
            if len(block.geometry) != 2:
                raise TypeError("XML export is only available for straight bounding boxes for now.")
            block_bbox = _bbox(block.geometry)
            block_div = SubElement(
                body,
                "div",
                attrib={"class": "ocr_carea", "id": f"block_{block_count}", "title": block_bbox},
            )
            # Each block holds exactly one paragraph sharing the block's bounding box
            paragraph = SubElement(
                block_div,
                "p",
                attrib={"class": "ocr_par", "id": f"par_{block_count}", "title": block_bbox},
            )
            block_count += 1
            for line in block.lines:
                # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initialized to 0
                line_span = SubElement(
                    paragraph,
                    "span",
                    attrib={
                        "class": "ocr_line",
                        "id": f"line_{line_count}",
                        "title": f"{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
                    },
                )
                line_count += 1
                for word in line.words:
                    conf = word.confidence
                    word_div = SubElement(
                        line_span,
                        "span",
                        attrib={
                            "class": "ocrx_word",
                            "id": f"word_{word_count}",
                            "title": f"{_bbox(word.geometry)}; x_wconf {int(round(conf * 100))}",
                        },
                    )
                    # set the text
                    word_div.text = word.value
                    word_count += 1

        return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds a page (and its blocks) from its exported dict

        The page image is not part of the exported dict. It is taken from the `page` keyword argument if
        provided, otherwise from a `page` entry of `save_dict`, and defaults to None when neither is available
        (in which case `show` cannot display the page).

        Args:
        ----
            save_dict: a dict as produced by `export`
            **kwargs: optionally `page`, the page image as a numpy array

        Returns:
        -------
            the reconstructed page
        """
        init_kwargs = {k: save_dict[k] for k in cls._exported_keys}
        init_kwargs["blocks"] = [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]
        # `page` is a required constructor argument that `export` never emits
        init_kwargs["page"] = kwargs.get("page", save_dict.get("page"))
        return cls(**init_kwargs)
401
+
402
+
403
class Document(Element):
    """Implements a document element as a collection of pages

    Args:
    ----
        pages: list of page elements
    """

    _children_names: List[str] = ["pages"]
    pages: List[Page] = []

    def __init__(
        self,
        pages: List[Page],
    ) -> None:
        super().__init__(pages=pages)

    def render(self, page_break: str = "\n\n\n\n") -> str:
        """Renders the full text of the element

        Args:
        ----
            page_break: the separator inserted between consecutive pages

        Returns:
        -------
            the text of all pages joined by `page_break`
        """
        return page_break.join(p.render() for p in self.pages)

    def show(self, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
        ----
            **kwargs: keyword arguments passed to the `show` method of each page
        """
        for result in self.pages:
            result.show(**kwargs)

    def synthesize(self, **kwargs) -> List[np.ndarray]:
        """Synthesize all pages from their predictions

        Args:
        ----
            **kwargs: keyword arguments passed to the `synthesize` method of each page

        Returns:
        -------
            list of synthesized pages
        """
        # Forward the keyword arguments: they were previously accepted but silently dropped
        return [page.synthesize(**kwargs) for page in self.pages]

    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
        """Export the document as XML (hOCR-format)

        Args:
        ----
            **kwargs: additional keyword arguments passed to the Page.export_as_xml method

        Returns:
        -------
            list of tuple of (bytes, ElementTree)
        """
        return [page.export_as_xml(**kwargs) for page in self.pages]

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuilds a document (and its pages) from its exported dict

        Args:
        ----
            save_dict: a dict as produced by `export`
            **kwargs: ignored

        Returns:
        -------
            the reconstructed document
        """
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
        return cls(**kwargs)
onnxtr/io/html.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (C) 2021-2024, Mindee | Felix Dittrich.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any
7
+
8
+ __all__ = ["read_html"]
9
+
10
+
11
def read_html(url: str, **kwargs: Any) -> bytes:
    """Render a web page as a PDF document

    >>> from onnxtr.io import read_html
    >>> doc = read_html("https://www.yoursite.com")

    Args:
    ----
        url: URL of the target web page
        **kwargs: keyword arguments from `weasyprint.HTML`

    Returns:
    -------
        decoded PDF file as a bytes stream
    """
    # weasyprint is an optional dependency, hence the function-scope import
    from weasyprint import HTML

    web_page = HTML(url, **kwargs)
    return web_page.write_pdf()
onnxtr/io/image.py ADDED
@@ -0,0 +1,56 @@
1
+ # Copyright (C) 2021-2024, Mindee | Felix Dittrich.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import cv2
10
+ import numpy as np
11
+
12
+ from onnxtr.utils.common_types import AbstractFile
13
+
14
+ __all__ = ["read_img_as_numpy"]
15
+
16
+
17
def read_img_as_numpy(
    file: AbstractFile,
    output_size: Optional[Tuple[int, int]] = None,
    rgb_output: bool = True,
) -> np.ndarray:
    """Decode an image, given as a path or as raw bytes, into a numpy array

    >>> from onnxtr.io import read_img_as_numpy
    >>> page = read_img_as_numpy("path/to/your/doc.jpg")

    Args:
    ----
        file: the path to the image file, or the encoded image as bytes
        output_size: the expected output size of each page in format H x W
        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.

    Returns:
    -------
        the page decoded as numpy ndarray of shape H x W x 3

    Raises:
    ------
        FileNotFoundError: if `file` is a path that does not point to an existing file
        TypeError: if `file` is neither a path nor bytes
        ValueError: if the image could not be decoded
    """
    if isinstance(file, bytes):
        # In-memory input: decode straight from the byte buffer
        raw_buffer: np.ndarray = np.frombuffer(file, np.uint8)
        img = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)
    elif isinstance(file, (str, Path)):
        if not Path(file).is_file():
            raise FileNotFoundError(f"unable to access {file}")
        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
    else:
        raise TypeError("unsupported object type for argument 'file'")

    # OpenCV signals a decoding failure by returning None instead of raising
    if img is None:
        raise ValueError("unable to read file.")

    # cv2.resize expects (W, H) whereas output_size is given as (H, W)
    if isinstance(output_size, tuple):
        img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR)

    # OpenCV decodes to BGR; switch the channel order only on request
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) if rgb_output else img
onnxtr/io/pdf.py ADDED
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2021-2024, Mindee | Felix Dittrich.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any, List, Optional
7
+
8
+ import numpy as np
9
+ import pypdfium2 as pdfium
10
+
11
+ from onnxtr.utils.common_types import AbstractFile
12
+
13
+ __all__ = ["read_pdf"]
14
+
15
+
16
def read_pdf(
    file: AbstractFile,
    scale: float = 2,
    rgb_mode: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
) -> List[np.ndarray]:
    """Read a PDF file and convert each of its pages into an image in numpy format

    >>> from onnxtr.io import read_pdf
    >>> doc = read_pdf("path/to/your/doc.pdf")

    Args:
    ----
        file: the path to the PDF file
        scale: rendering scale (1 corresponds to 72dpi)
        rgb_mode: if True, the output will be RGB, otherwise BGR
        password: a password to unlock the document, if encrypted
        **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

    Returns:
    -------
        the list of pages decoded as numpy ndarray of shape H x W x C
    """
    document = pdfium.PdfDocument(file, password=password, autoclose=True)

    # Rasterise pages to numpy ndarrays with pypdfium2, one page at a time
    rendered_pages: List[np.ndarray] = []
    for pdf_page in document:
        bitmap = pdf_page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs)
        rendered_pages.append(bitmap.to_numpy())
    return rendered_pages
onnxtr/io/reader.py ADDED
@@ -0,0 +1,85 @@
1
+ # Copyright (C) 2021-2024, Mindee | Felix Dittrich.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Sequence, Union
8
+
9
+ import numpy as np
10
+
11
+ from onnxtr.file_utils import requires_package
12
+ from onnxtr.utils.common_types import AbstractFile
13
+
14
+ from .html import read_html
15
+ from .image import read_img_as_numpy
16
+ from .pdf import read_pdf
17
+
18
+ __all__ = ["DocumentFile"]
19
+
20
+
21
class DocumentFile:
    """Entry point to load a document from a PDF file, a web page or image files"""

    @classmethod
    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
        """Read a PDF file

        >>> from onnxtr.io import DocumentFile
        >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

        Args:
        ----
            file: the path to the PDF file or a binary stream
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        pages = read_pdf(file, **kwargs)
        return pages

    @classmethod
    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
        """Interpret a web page as a PDF document

        >>> from onnxtr.io import DocumentFile
        >>> doc = DocumentFile.from_url("https://www.yoursite.com")

        Args:
        ----
            url: the URL of the target web page
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # weasyprint is optional: check for it before attempting the conversion
        requires_package(
            "weasyprint",
            "`.from_url` requires weasyprint installed.\n"
            + "Installation instructions: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#installation",
        )
        # The web page is first converted to PDF bytes, then read like any other PDF
        return cls.from_pdf(read_html(url), **kwargs)

    @classmethod
    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
        """Read an image file (or a collection of image files) and convert it into an image in numpy format

        >>> from onnxtr.io import DocumentFile
        >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])

        Args:
        ----
            files: the path to the image file or a binary stream, or a collection of those
            **kwargs: additional parameters to :meth:`onnxtr.io.image.read_img_as_numpy`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # A single path / bytes object is treated as a one-page document
        single_input_types = (str, Path, bytes)
        sources = [files] if isinstance(files, single_input_types) else files

        pages: List[np.ndarray] = []
        for source in sources:
            pages.append(read_img_as_numpy(source, **kwargs))
        return pages
@@ -0,0 +1,4 @@
1
+ from .classification import *
2
+ from .detection import *
3
+ from .recognition import *
4
+ from .zoo import *