deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

@@ -1,3 +1,4 @@
+ USE_ROTATOR: False
  USE_LAYOUT: True
  USE_TABLE_SEGMENTATION: True
  TF:

@@ -97,3 +98,7 @@ TEXT_ORDERING:
  BROKEN_LINE_TOLERANCE: 0.003
  HEIGHT_TOLERANCE: 2.0
  PARAGRAPH_BREAK: 0.035
+ USE_LAYOUT_LINK: False
+ LAYOUT_LINK:
+ PARENTAL_CATEGORIES:
+ CHILD_CATEGORIES:
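
The two config hunks above introduce a rotation step and the new layout-link post-processing. A minimal sketch of how the flags could be toggled at analyzer construction time, assuming the `config_overwrite` mechanism of `get_dd_analyzer`; the category values for LAYOUT_LINK are illustrative assumptions, not defaults:

    import deepdoctection as dd

    # Hedged sketch: flag names come from the config diff above.
    analyzer = dd.get_dd_analyzer(
        config_overwrite=[
            "USE_ROTATOR=True",
            "USE_LAYOUT_LINK=True",
            "LAYOUT_LINK.PARENTAL_CATEGORIES=['figure']",   # illustrative categories
            "LAYOUT_LINK.CHILD_CATEGORIES=['caption']",
        ]
    )
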
@@ -527,5 +527,5 @@ class ContainerAnnotation(CategoryAnnotation):
  def from_dict(cls, **kwargs: AnnotationDict) -> ContainerAnnotation:
  container_ann = ann_from_dict(cls, **kwargs)
  value = kwargs.get("value", "")
- container_ann.value = value if isinstance(value, str) else list(value)
+ container_ann.value = value if isinstance(value, (int, float, str)) else list(value)
  return container_ann

@@ -143,11 +143,13 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
  return np_array.astype(uint8)


- def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
+ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200) -> PixelValues:
  """
- Converts a pdf passed as bytes into a numpy array. Note, that this method expects poppler to be installed. This
- function, however does not rely on the wrapper pdf2image but uses a function of this lib which calls poppler
- directly.
+ Converts a pdf passed as bytes into a numpy array. We use poppler or pdfmium to convert the pdf to an image.
+ If both is available you can steer the selection of the render engine with environment variables:
+
+ USE_DD_POPPLER: Set to 1, "TRUE", "True" to use poppler
+ USE_DD_PDFIUM: Set to 1, "TRUE", "True" to use pdfium

  :param pdf_bytes: A pdf as bytes object. A byte representation can from a pdf file can be generated e.g. with
  `utils.fs.load_bytes_from_pdf_file`
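
A minimal sketch of steering the render engine via the new environment variables; only the variable names are taken from the docstring above, and the module path is an assumption:

    import os

    # Assumption: convert_pdf_bytes_to_np_array_v2 lives in deepdoctection.utils.pdf_utils
    os.environ["USE_DD_PDFIUM"] = "True"    # prefer pdfium when both engines are installed
    os.environ["USE_DD_POPPLER"] = "False"

    from deepdoctection.utils.pdf_utils import convert_pdf_bytes_to_np_array_v2

    with open("page.pdf", "rb") as f:        # a single-page pdf, path is illustrative
        np_image = convert_pdf_bytes_to_np_array_v2(f.read(), dpi=200)  # dpi now defaults to 200
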
@@ -23,7 +23,7 @@ from __future__ import annotations
  import json
  from collections import defaultdict
  from dataclasses import dataclass, field
- from os import environ
+ from os import environ, fspath
  from pathlib import Path
  from typing import Any, Optional, Sequence, Union, no_type_check


@@ -412,13 +412,22 @@ class Image:
  img_dict["_image"] = None
  return img_dict

+ def as_json(self) -> str:
+ """
+ Returns the full image dataclass as json string.
+
+ :return: A json string.
+ """
+
+ return json.dumps(self.as_dict(), indent=4)
+
  @staticmethod
  def remove_keys() -> list[str]:
  """
  A list of attributes to suspend from as_dict creation.
  """

- return ["_image", "_annotation_ids"]
+ return ["_image", "_annotation_ids", "_category_name"]

  def define_annotation_id(self, annotation: Annotation) -> str:
  """
@@ -443,7 +452,8 @@ class Image:

  Calls `List.remove`. Make sure, the element is in the list for otherwise a ValueError will be raised.

- :param annotation: The annotation to remove
+ :param annotation_ids: The annotation to remove
+ :param service_ids: The service id to remove
  """
  ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()


@@ -703,13 +713,13 @@ class Image:
  path = path / self.image_id
  suffix = path.suffix
  if suffix:
- path_json = path.as_posix().replace(suffix, ".json")
+ path_json = fspath(path).replace(suffix, ".json")
  else:
- path_json = path.as_posix() + ".json"
+ path_json = fspath(path) + ".json"
  if highest_hierarchy_only:
  self.remove_image_from_lower_hierachy()
  export_dict = self.as_dict()
- export_dict["location"] = str(export_dict["location"])
+ export_dict["location"] = fspath(export_dict["location"])
  if not image_to_json:
  export_dict["_image"] = None
  if dry:

@@ -25,6 +25,7 @@ from copy import copy
  from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union, no_type_check

  import numpy as np
+ from typing_extensions import LiteralString

  from ..utils.error import AnnotationError, ImageError
  from ..utils.logger import LoggingRecord, logger

@@ -40,10 +41,12 @@ from ..utils.settings import (
  WordType,
  get_type,
  )
+ from ..utils.transform import ResizeTransform
  from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
  from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
  from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
  from .box import BoundingBox, crop_box_from_image
+ from .convert import box_to_point4, point4_to_box
  from .image import Image



@@ -101,7 +104,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
  return np_image
  raise AnnotationError(f"base_page.image is None for {self.annotation_id}")

- def __getattr__(self, item: str) -> Optional[Union[str, int, list[str]]]:
+ def __getattr__(self, item: str) -> Optional[Union[str, int, list[str], list[ImageAnnotationBaseView]]]:
  """
  Get attributes defined by registered `self.get_attribute_names()` in a multi step process:


@@ -126,6 +129,9 @@ class ImageAnnotationBaseView(ImageAnnotation):
  if isinstance(sub_cat, ContainerAnnotation):
  return sub_cat.value
  return sub_cat.category_id
+ if item in self.relationships:
+ relationship_ids = self.get_relationship(get_type(item))
+ return self.base_page.get_annotation(annotation_ids=relationship_ids)
  if self.image is not None:
  if item in self.image.summary.sub_categories:
  sub_cat = self.get_summary(get_type(item))

@@ -165,7 +171,11 @@ class Word(ImageAnnotationBaseView):
  """

  def get_attribute_names(self) -> set[str]:
- return set(WordType).union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+ return (
+ set(WordType)
+ .union(super().get_attribute_names())
+ .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+ )


  class Layout(ImageAnnotationBaseView):

@@ -246,7 +256,11 @@ class Layout(ImageAnnotationBaseView):
  }

  def get_attribute_names(self) -> set[str]:
- return {"words", "text"}.union(super().get_attribute_names()).union({Relationships.READING_ORDER})
+ return (
+ {"words", "text"}
+ .union(super().get_attribute_names())
+ .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
+ )

  def __len__(self) -> int:
  """len of text counted by number of characters"""
@@ -433,8 +447,8 @@ class ImageDefaults(TypedDict):
  """ImageDefaults"""

  text_container: LayoutType
- floating_text_block_categories: tuple[LayoutType, ...]
- text_block_categories: tuple[LayoutType, ...]
+ floating_text_block_categories: tuple[Union[LayoutType, CellType], ...]
+ text_block_categories: tuple[Union[LayoutType, CellType], ...]


  IMAGE_DEFAULTS: ImageDefaults = {

@@ -448,9 +462,13 @@ IMAGE_DEFAULTS: ImageDefaults = {
  "text_block_categories": (
  LayoutType.TEXT,
  LayoutType.TITLE,
- LayoutType.FIGURE,
  LayoutType.LIST,
  LayoutType.CELL,
+ LayoutType.FIGURE,
+ CellType.COLUMN_HEADER,
+ CellType.PROJECTED_ROW_HEADER,
+ CellType.SPANNING,
+ CellType.ROW_HEADER,
  ),
  }


@@ -509,6 +527,9 @@ class Page(Image):
  "location",
  "document_id",
  "page_number",
+ "angle",
+ "figures",
+ "residual_layouts",
  }
  include_residual_text_container: bool = True


@@ -607,6 +628,41 @@ class Page(Image):
  """
  return self.get_annotation(category_names=LayoutType.TABLE)

+ @property
+ def figures(self) -> list[ImageAnnotationBaseView]:
+ """
+ A list of a figures.
+ """
+ return self.get_annotation(category_names=LayoutType.FIGURE)
+
+ @property
+ def residual_layouts(self) -> list[ImageAnnotationBaseView]:
+ """
+ A list of all residual layouts. Residual layouts are all layouts that are
+ - not floating text blocks,
+ - not text containers,
+ - not tables,
+ - not figures
+ - not cells
+ - not rows
+ - not columns
+ """
+ return self.get_annotation(category_names=self._get_residual_layout())
+
+ def _get_residual_layout(self) -> list[LiteralString]:
+ layouts = copy(list(self.floating_text_block_categories))
+ layouts.extend(
+ [
+ LayoutType.TABLE,
+ LayoutType.FIGURE,
+ self.text_container,
+ LayoutType.CELL,
+ LayoutType.ROW,
+ LayoutType.COLUMN,
+ ]
+ )
+ return [layout for layout in LayoutType if layout not in layouts]
+
  @classmethod
  def from_image(
  cls,
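
A short sketch of the two new `Page` properties; `page` stands for any parsed `Page` instance:

    for figure in page.figures:                # all LayoutType.FIGURE annotations
        print(figure.annotation_id, figure.bbox)

    for residual in page.residual_layouts:     # layouts that are neither floating text blocks, text containers,
        print(residual.category_name)          # tables, figures, cells, rows nor columns
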
@@ -800,12 +856,15 @@ class Page(Image):
  self,
  show_tables: bool = True,
  show_layouts: bool = True,
+ show_figures: bool = False,
+ show_residual_layouts: bool = False,
  show_cells: bool = True,
  show_table_structure: bool = True,
  show_words: bool = False,
  show_token_class: bool = True,
  ignore_default_token_class: bool = False,
  interactive: bool = False,
+ scaled_width: int = 600,
  **debug_kwargs: str,
  ) -> Optional[PixelValues]:
  """

@@ -826,12 +885,14 @@ class Page(Image):

  :param show_tables: Will display all tables boxes as well as cells, rows and columns
  :param show_layouts: Will display all other layout components.
+ :param show_figures: Will display all figures
  :param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
  :param show_table_structure: Will display rows and columns
  :param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
  :param show_token_class: Will display token class instead of token tags (i.e. token classes with tags)
  :param interactive: If set to True will open an interactive image, otherwise it will return a numpy array that
  can be displayed differently.
+ :param scaled_width: Width of the image to display
  :param ignore_default_token_class: Will ignore displaying word bounding boxes with default or None token class
  label
  :return: If `interactive=False` will return a numpy array.

@@ -857,6 +918,11 @@ class Page(Image):
  box_stack.append(item.bbox)
  category_names_list.append(item.category_name.value)

+ if show_figures and not debug_kwargs:
+ for item in self.figures:
+ box_stack.append(item.bbox)
+ category_names_list.append(item.category_name.value)
+
  if show_tables and not debug_kwargs:
  for table in self.tables:
  box_stack.append(table.bbox)

@@ -913,24 +979,34 @@ class Page(Image):
  else:
  category_names_list.append(word.token_tag.value if word.token_tag is not None else None)

+ if show_residual_layouts and not debug_kwargs:
+ for item in self.residual_layouts:
+ box_stack.append(item.bbox)
+ category_names_list.append(item.category_name.value)
+
  if self.image is not None:
+ scale_fx = scaled_width / self.width
+ scaled_height = int(self.height * scale_fx)
+ img = viz_handler.resize(self.image, scaled_width, scaled_height, "VIZ")
+
  if box_stack:
  boxes = np.vstack(box_stack)
+ boxes = box_to_point4(boxes)
+ resizer = ResizeTransform(self.height, self.width, scaled_height, scaled_width, "VIZ")
+ boxes = resizer.apply_coords(boxes)
+ boxes = point4_to_box(boxes)
  if show_words:
  img = draw_boxes(
- self.image,
- boxes,
- category_names_list,
+ np_image=img,
+ boxes=boxes,
+ category_names_list=category_names_list,
  font_scale=1.0,
  rectangle_thickness=4,
  )
  else:
- img = draw_boxes(self.image, boxes, category_names_list)
- scale_fx, scale_fy = 1.3, 1.3
- scaled_width, scaled_height = int(self.width * scale_fx), int(self.height * scale_fy)
- img = viz_handler.resize(img, scaled_width, scaled_height, "VIZ")
- else:
- img = self.image
+ img = draw_boxes(
+ np_image=img, boxes=boxes, category_names_list=category_names_list, show_palette=False
+ )

  if interactive:
  interactive_imshow(img)
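
A hedged sketch of the reworked `viz()`: the output image is now rescaled to `scaled_width` (default 600) with the bounding boxes transformed alongside it, and figures as well as residual layouts can be overlaid on demand:

    img = page.viz(
        show_figures=True,
        show_residual_layouts=True,
        scaled_width=1200,        # boxes are resized via ResizeTransform together with the image
        interactive=False,        # returns a numpy array instead of opening a window
    )
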
@@ -71,8 +71,8 @@ https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeva


  def _summarize( # type: ignore
- self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100
- ) -> float:
+ self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100, per_category: bool = False
+ ) -> Union[float, list[float]]:
  # pylint: disable=C0103
  p = self.params
  iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"

@@ -86,6 +86,36 @@ def _summarize( # type: ignore

  aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
  mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
+ if per_category:
+ if ap == 1:
+ s = self.eval["precision"]
+ num_classes = s.shape[2]
+ results_per_class = []
+ for idx in range(num_classes):
+ if iouThr is not None:
+ s = self.eval["precision"]
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+ precision = s[:, :, idx, aind, mind]
+ precision = precision[precision > -1]
+ res = np.mean(precision) if precision.size else float("nan")
+ results_per_class.append(float(res))
+ print(f"Precision for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+ else:
+ s = self.eval["recall"]
+ num_classes = s.shape[1]
+ results_per_class = []
+ for idx in range(num_classes):
+ if iouThr is not None:
+ s = self.eval["recall"]
+ t = np.where(iouThr == p.iouThrs)[0]
+ s = s[t]
+ recall = s[:, idx, aind, mind]
+ recall = recall[recall > -1]
+ res = np.mean(recall) if recall.size else float("nan")
+ results_per_class.append(float(res))
+ print(f"Recall for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
+ return results_per_class
  if ap == 1:
  # dimension of precision: [TxRxKxAxM]
  s = self.eval["precision"]

@@ -124,6 +154,7 @@ class CocoMetric(MetricBase):
  mapper = image_to_coco
  _f1_score = None
  _f1_iou = None
+ _per_category = False
  _params: dict[str, Union[list[int], list[list[int]]]] = {}

  @classmethod

@@ -176,18 +207,28 @@ class CocoMetric(MetricBase):

  if cls._f1_score:
  summary_bbox = [
- metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2]),
- metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2]),
+ metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
+ metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
  ]
  else:
  metric.summarize()
  summary_bbox = metric.stats

  results = []
- for params, value in zip(cls.get_summary_default_parameters(), summary_bbox):
+
+ default_parameters = cls.get_summary_default_parameters()
+ if cls._per_category:
+ default_parameters = default_parameters * len(summary_bbox[0])
+ summary_bbox = [item for pair in zip(*summary_bbox) for item in pair]
+ val = 0
+ for idx, (params, value) in enumerate(zip(default_parameters, summary_bbox)):
  params = copy(params)
  params["mode"] = "bbox"
  params["val"] = value
+ if cls._per_category:
+ if idx % 2 == 0:
+ val += 1
+ params["category_id"] = val
  results.append(params)

  return results

@@ -201,15 +242,16 @@ class CocoMetric(MetricBase):
  area range and maximum detections.
  """
  if cls._f1_score:
+ for el, idx in zip(_F1_DEFAULTS, [2, 2]):
+ if cls._params:
+ if cls._params.get("maxDets") is not None:
+ el["maxDets"] = cls._params["maxDets"][idx]
+ el["iouThr"] = cls._f1_iou
+ return _F1_DEFAULTS
+
+ for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
  if cls._params:
  if cls._params.get("maxDets") is not None:
- for el, idx in zip(_F1_DEFAULTS, [2, 2]):
- el["maxDets"] = cls._params["maxDets"][idx]
- el["iouThr"] = cls._f1_iou
- return _F1_DEFAULTS
- if cls._params:
- if cls._params.get("maxDets") is not None:
- for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
  el["maxDets"] = cls._params["maxDets"][idx]
  return _COCOEVAL_DEFAULTS

@@ -220,13 +262,16 @@ class CocoMetric(MetricBase):
  area_range: Optional[list[list[int]]] = None,
  f1_score: bool = False,
  f1_iou: float = 0.9,
+ per_category: bool = False,
  ) -> None:
  """
  Setting params for different coco metric modes.

  :param max_detections: The maximum number of detections to consider
  :param area_range: The area range to classify objects as "all", "small", "medium" and "large"
- :param f1_score: Will use f1 score setting with default iouThr 0.9
+ :param f1_score: Will use f1 score setting with default iouThr 0.9. To be more precise it does not calculate
+ the f1 score but the precision and recall for a given iou threshold. Use the harmonic mean to
+ get the ultimate f1 score.
  :param f1_iou: Use with f1_score True and reset the f1 iou threshold
  """
  if max_detections is not None:

@@ -238,6 +283,7 @@ class CocoMetric(MetricBase):

  cls._f1_score = f1_score
  cls._f1_iou = f1_iou
+ cls._per_category = per_category

  @classmethod
  def get_requirements(cls) -> list[Requirement]:
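
A hedged sketch of the new per-category switch; the import path is an assumption and the evaluator wiring is omitted. As the docstring notes, `f1_score=True` reports precision and recall at the chosen IoU threshold, so the actual F1 is the harmonic mean computed afterwards:

    from deepdoctection.eval import CocoMetric   # assumed import path

    CocoMetric.set_params(f1_score=True, f1_iou=0.9, per_category=True)
    # ... run the evaluation as usual; each result dict now also carries a "category_id" ...

    def f1(precision: float, recall: float) -> float:
        return 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
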
@@ -24,21 +24,25 @@ from typing import Optional
  from lazy_imports import try_import

  from ..utils.context import save_tmp_file
- from ..utils.file_utils import get_pdfplumber_requirement
+ from ..utils.file_utils import get_pdfplumber_requirement, get_pypdfium2_requirement
  from ..utils.settings import LayoutType, ObjectTypes
  from ..utils.types import Requirement
  from .base import DetectionResult, ModelCategories, PdfMiner

- with try_import() as import_guard:
+ with try_import() as pdfplumber_import_guard:
  from pdfplumber.pdf import PDF, Page

+ with try_import() as pypdfmium_import_guard:
+ import pypdfium2.raw as pypdfium_c
+ from pypdfium2 import PdfDocument

- def _to_detect_result(word: dict[str, str]) -> DetectionResult:
+
+ def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> DetectionResult:
  return DetectionResult(
  box=[float(word["x0"]), float(word["top"]), float(word["x1"]), float(word["bottom"])],
  class_id=1,
  text=word["text"],
- class_name=LayoutType.WORD,
+ class_name=class_name,
  )


@@ -49,6 +53,7 @@ class PdfPlumberTextDetector(PdfMiner):

  pdf_plumber = PdfPlumberTextDetector()
  df = SerializerPdfDoc.load("path/to/document.pdf")
+ df.reset_state()

  for dp in df:
  detection_results = pdf_plumber.predict(dp["pdf_bytes"])

@@ -61,6 +66,8 @@ class PdfPlumberTextDetector(PdfMiner):
  pipe = DoctectionPipe([text_extract])

  df = pipe.analyze(path="path/to/document.pdf")
+ df.reset_state()
+
  for dp in df:
  ...


@@ -87,7 +94,7 @@ class PdfPlumberTextDetector(PdfMiner):
  self._page = PDF(fin).pages[0]
  self._pdf_bytes = pdf_bytes
  words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
- detect_results = list(map(_to_detect_result, words))
+ detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
  return detect_results

  @classmethod

@@ -113,3 +120,87 @@ class PdfPlumberTextDetector(PdfMiner):
  def get_category_names(self) -> tuple[ObjectTypes, ...]:
  return self.categories.get_categories(as_dict=False)
+
+
+ class Pdfmium2TextDetector(PdfMiner):
+ """
+ Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level
+
+ pdfmium2 = Pdfmium2TextDetector()
+ df = SerializerPdfDoc.load("path/to/document.pdf")
+ df.reset_state()
+
+ for dp in df:
+ detection_results = pdfmium2.predict(dp["pdf_bytes"])
+
+ To use it in a more integrated way:
+
+ pdfmium2 = Pdfmium2TextDetector()
+ text_extract = TextExtractionService(pdfmium2)
+
+ pipe = DoctectionPipe([text_extract])
+
+ df = pipe.analyze(path="path/to/document.pdf")
+ df.reset_state()
+ for dp in df:
+ ...
+
+ """
+
+ def __init__(self) -> None:
+ self.name = "Pdfmium"
+ self.model_id = self.get_model_id()
+ self.categories = ModelCategories(init_categories={1: LayoutType.LINE})
+ self._page: Optional[Page] = None
+
+ def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
+ """
+ Call pypdfium2 and returns detected text as detection results
+
+ :param pdf_bytes: bytes of a single pdf page
+ :return: A list of DetectionResult
+ """
+
+ pdf = PdfDocument(pdf_bytes)
+ page = pdf.get_page(0)
+ text = page.get_textpage()
+ words = []
+ height = page.get_height()
+ for obj in page.get_objects((pypdfium_c.FPDF_PAGEOBJ_TEXT,)):
+ box = obj.get_pos()
+ if all(x > 0 for x in box):
+ words.append(
+ {
+ "text": text.get_text_bounded(*box),
+ "x0": box[0],
+ "x1": box[2],
+ "top": height - box[3],
+ "bottom": height - box[1],
+ }
+ )
+ detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
+ return detect_results
+
+ @classmethod
+ def get_requirements(cls) -> list[Requirement]:
+ return [get_pypdfium2_requirement()]
+
+ def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
+ """
+ Get the width and height of the full page
+ :param pdf_bytes: pdf_bytes generating the pdf
+ :return: width and height
+ """
+
+ if self._pdf_bytes == pdf_bytes and self._page is not None:
+ return self._page.bbox[2], self._page.bbox[3] # pylint: disable=E1101
+ # if the pdf bytes is not equal to the cached pdf, will recalculate values
+ pdf = PdfDocument(pdf_bytes)
+ self._page = pdf.get_page(0)
+ self._pdf_bytes = pdf_bytes
+ if self._page is not None:
+ return self._page.get_width(), self._page.get_height() # type: ignore
+ raise ValueError("Page not found")
+
+ def get_category_names(self) -> tuple[ObjectTypes, ...]:
+ return self.categories.get_categories(as_dict=False)
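
Unlike `PdfPlumberTextDetector`, the new miner returns `LayoutType.LINE` results, and `predict()` flips pypdfium2's bottom-left page coordinates (`top = height - y_top`) into the top-left image coordinates that `DetectionResult` expects. A hedged usage sketch; the import paths are assumptions:

    from deepdoctection.extern.pdftext import Pdfmium2TextDetector   # assumed module path
    from deepdoctection.utils.fs import load_bytes_from_pdf_file     # helper referenced earlier in this diff

    miner = Pdfmium2TextDetector()
    pdf_bytes = load_bytes_from_pdf_file("path/to/page.pdf")          # a single-page pdf
    for det in miner.predict(pdf_bytes):
        print(det.class_name, det.box, det.text)                      # boxes already in top-left image coordinates
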
@@ -421,6 +421,7 @@ class TesseractRotationTransformer(ImageTransformer):
  def __init__(self) -> None:
  self.name = fspath(_TESS_PATH) + "-rotation"
  self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
+ self.model_id = self.get_model_id()

  def transform(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
  """

@@ -193,5 +193,7 @@ def match_anns_by_distance(
  child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
  child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
  parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
- child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
- return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+ if child_centers and parent_centers:
+ child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
+ return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
+ return []