deepdoctection 0.33__py3-none-any.whl → 0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (40) hide show
  1. deepdoctection/__init__.py +11 -12
  2. deepdoctection/analyzer/__init__.py +1 -0
  3. deepdoctection/analyzer/_config.py +150 -0
  4. deepdoctection/analyzer/dd.py +42 -358
  5. deepdoctection/analyzer/factory.py +522 -0
  6. deepdoctection/configs/conf_dd_one.yaml +1 -0
  7. deepdoctection/datapoint/annotation.py +41 -3
  8. deepdoctection/datapoint/convert.py +6 -4
  9. deepdoctection/datapoint/image.py +132 -46
  10. deepdoctection/datapoint/view.py +2 -1
  11. deepdoctection/datasets/base.py +1 -1
  12. deepdoctection/datasets/instances/fintabnet.py +1 -1
  13. deepdoctection/datasets/instances/xfund.py +29 -7
  14. deepdoctection/eval/eval.py +7 -1
  15. deepdoctection/extern/model.py +2 -1
  16. deepdoctection/extern/pdftext.py +96 -5
  17. deepdoctection/extern/tessocr.py +1 -0
  18. deepdoctection/mapper/cats.py +11 -13
  19. deepdoctection/mapper/cocostruct.py +6 -2
  20. deepdoctection/mapper/d2struct.py +2 -1
  21. deepdoctection/mapper/laylmstruct.py +1 -1
  22. deepdoctection/mapper/match.py +31 -0
  23. deepdoctection/mapper/misc.py +1 -1
  24. deepdoctection/mapper/prodigystruct.py +1 -1
  25. deepdoctection/pipe/anngen.py +27 -0
  26. deepdoctection/pipe/base.py +23 -0
  27. deepdoctection/pipe/common.py +123 -38
  28. deepdoctection/pipe/segment.py +1 -1
  29. deepdoctection/pipe/sub_layout.py +1 -1
  30. deepdoctection/utils/env_info.py +31 -2
  31. deepdoctection/utils/file_utils.py +19 -0
  32. deepdoctection/utils/fs.py +27 -4
  33. deepdoctection/utils/metacfg.py +12 -0
  34. deepdoctection/utils/pdf_utils.py +114 -6
  35. deepdoctection/utils/settings.py +3 -0
  36. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/METADATA +20 -11
  37. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/RECORD +40 -38
  38. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
  39. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
  40. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
@@ -21,10 +21,11 @@ Dataclass Image
21
21
  from __future__ import annotations
22
22
 
23
23
  import json
24
+ from collections import defaultdict
24
25
  from dataclasses import dataclass, field
25
- from os import environ
26
+ from os import environ, fspath
26
27
  from pathlib import Path
27
- from typing import Any, Iterable, Optional, Sequence, Union, no_type_check
28
+ from typing import Any, Optional, Sequence, Union, no_type_check
28
29
 
29
30
  import numpy as np
30
31
  from numpy import uint8
@@ -33,7 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
33
34
  from ..utils.identifier import get_uuid, is_uuid_like
34
35
  from ..utils.settings import ObjectTypes, SummaryType, get_type
35
36
  from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
36
- from .annotation import Annotation, BoundingBox, CategoryAnnotation, ImageAnnotation
37
+ from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
37
38
  from .box import crop_box_from_image, global_to_local_coords, intersection_box
38
39
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
39
40
 
@@ -303,6 +304,15 @@ class Image:
303
304
 
304
305
  return self.embeddings[image_id]
305
306
 
307
def remove_embedding(self, image_id: str) -> None:
    """
    Drop the embedding stored under `image_id`. Nothing happens if no embedding is registered for this id.

    :param image_id: uuid string of the embedding image
    """
    # pop with a default never raises, so no prior membership test is required
    self.embeddings.pop(image_id, None)
315
+
306
316
  def _self_embedding(self) -> None:
307
317
  if self._bbox is not None:
308
318
  self.set_embedding(self.image_id, self._bbox)
@@ -387,39 +397,6 @@ class Image:
387
397
 
388
398
  return list(anns)
389
399
 
390
- def get_annotation_iter(
391
- self,
392
- category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
393
- annotation_ids: Optional[Union[str, Sequence[str]]] = None,
394
- service_id: Optional[Union[str, Sequence[str]]] = None,
395
- model_id: Optional[Union[str, Sequence[str]]] = None,
396
- session_ids: Optional[Union[str, Sequence[str]]] = None,
397
- ignore_inactive: bool = True,
398
- ) -> Iterable[ImageAnnotation]:
399
- """
400
- Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
401
-
402
- :param category_names: A single name or list of names
403
- :param annotation_ids: A single id or list of ids
404
- :param service_id: A single service name or list of service names
405
- :param model_id: A single model name or list of model names
406
- :param session_ids: A single session id or list of session ids
407
- :param ignore_inactive: If set to `True` only active annotations are returned.
408
-
409
- :return: A (possibly empty) list of annotations
410
- """
411
-
412
- return iter(
413
- self.get_annotation(
414
- category_names=category_names,
415
- annotation_ids=annotation_ids,
416
- service_id=service_id,
417
- model_id=model_id,
418
- session_ids=session_ids,
419
- ignore_inactive=ignore_inactive,
420
- )
421
- )
422
-
423
400
  def as_dict(self) -> dict[str, Any]:
424
401
  """
425
402
  Returns the full image dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes
@@ -435,13 +412,22 @@ class Image:
435
412
  img_dict["_image"] = None
436
413
  return img_dict
437
414
 
415
def as_json(self) -> str:
    """
    Serializes the image to a json string. The content is the dictionary returned by `as_dict`, dumped with an
    indentation of four spaces.

    :return: A json string.
    """

    image_dict = self.as_dict()
    return json.dumps(image_dict, indent=4)
423
+
438
424
  @staticmethod
439
425
  def remove_keys() -> list[str]:
440
426
  """
441
427
  A list of attributes to suspend from as_dict creation.
442
428
  """
443
429
 
444
- return ["_image"]
430
+ return ["_image", "_annotation_ids", "_category_name"]
445
431
 
446
432
  def define_annotation_id(self, annotation: Annotation) -> str:
447
433
  """
@@ -456,17 +442,79 @@ class Image:
456
442
  attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
457
443
  return get_uuid(*attributes_values, str(self.image_id))
458
444
 
459
- def remove(self, annotation: ImageAnnotation) -> None:
445
def remove(
    self,
    annotation_ids: Optional[Union[str, list[str]]] = None,
    service_ids: Optional[Union[str, list[str]]] = None,
) -> None:
    """
    Instead of removing consider deactivating annotations.

    Removes annotations selected by annotation id, by service id or by both. Every annotation id is looked up in
    the mapping returned by `get_annotation_id_to_annotation_maps` and each location recorded for it is passed to
    `_remove_by_annotation_id`. Service ids are resolved to annotation ids with `get_service_id_to_annotation_id`.
    Ids passed as `annotation_ids` are processed before `service_ids`.

    :param annotation_ids: A single annotation id or a list of annotation ids to remove
    :param service_ids: A single service id or a list of service ids. The annotations listed for these services
                        are removed
    :raises ImageError: If an annotation id or a service id cannot be found
    """
    # Snapshot taken once, before anything is removed. It is used for all lookups below.
    ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()

    def _remove_all(ann_ids: list[str]) -> None:
        # Removes every recorded location of each given annotation id
        for ann_id in ann_ids:
            if ann_id not in ann_id_to_annotation_maps:
                raise ImageError(f"Annotation with id {ann_id} not found")
            for annotation_map in ann_id_to_annotation_maps[ann_id]:
                self._remove_by_annotation_id(ann_id, annotation_map)

    if annotation_ids is not None:
        _remove_all([annotation_ids] if isinstance(annotation_ids, str) else annotation_ids)

    if service_ids is not None:
        if isinstance(service_ids, str):
            service_ids = [service_ids]
        # Built after the removals above, so it only lists what is still returned by `get_annotation`
        service_id_to_annotation_id = self.get_service_id_to_annotation_id()

        for service_id in service_ids:
            if service_id not in service_id_to_annotation_id:
                raise ImageError(f"Service id {service_id} not found")
            _remove_all(service_id_to_annotation_id[service_id])
487
+
488
def _remove_by_annotation_id(self, annotation_id: str, location_dict: AnnotationMap) -> None:
    """
    Removes the object with id `annotation_id` from the place described by `location_dict`.

    `location_dict.image_annotation_id` selects the image annotation that holds the object. If none of
    `sub_category_key`, `relationship_key` and `summary_key` is set, the image annotation itself is removed from
    the image. Otherwise the sub category, the relationship entry and/or the summary sub category stored under the
    given keys are removed from that image annotation. Nothing is done if the image annotation is not returned by
    `get_annotation`.

    :param annotation_id: Id of the object to remove
    :param location_dict: Location of the object within the image
    """
    candidates = self.get_annotation(annotation_ids=location_dict.image_annotation_id)
    if not candidates:
        return
    # There can only be one annotation with a given id
    holder = candidates[0]

    sub_category_key = location_dict.sub_category_key
    relationship_key = location_dict.relationship_key
    summary_key = location_dict.summary_key

    if sub_category_key is None and relationship_key is None and summary_key is None:
        # No nested location given: the image annotation itself is the target
        self.annotations.remove(holder)
        self._annotation_ids.remove(holder.annotation_id)
        return

    if sub_category_key is not None:
        holder.remove_sub_category(sub_category_key)

    if relationship_key is not None:
        holder.remove_relationship(relationship_key, annotation_id)

    if summary_key is not None and holder.image is not None:
        holder.image.summary.remove_sub_category(summary_key)
470
518
 
471
519
  def image_ann_to_image(self, annotation_id: str, crop_image: bool = False) -> None:
472
520
  """
@@ -580,6 +628,7 @@ class Image:
580
628
  if summary_dict := kwargs.get("_summary", kwargs.get("summary")):
581
629
  image.summary = CategoryAnnotation.from_dict(**summary_dict)
582
630
  image.summary.category_name = SummaryType.SUMMARY
631
+
583
632
  return image
584
633
 
585
634
  @classmethod
@@ -645,7 +694,7 @@ class Image:
645
694
  highest_hierarchy_only: bool = False,
646
695
  path: Optional[PathLikeOrStr] = None,
647
696
  dry: bool = False,
648
- ) -> Optional[ImageDict]:
697
+ ) -> Optional[Union[ImageDict, str]]:
649
698
  """
650
699
  Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
651
700
  base64 encodings.
@@ -664,21 +713,58 @@ class Image:
664
713
  path = path / self.image_id
665
714
  suffix = path.suffix
666
715
  if suffix:
667
- path_json = path.as_posix().replace(suffix, ".json")
716
+ path_json = fspath(path).replace(suffix, ".json")
668
717
  else:
669
- path_json = path.as_posix() + ".json"
718
+ path_json = fspath(path) + ".json"
670
719
  if highest_hierarchy_only:
671
720
  self.remove_image_from_lower_hierachy()
672
721
  export_dict = self.as_dict()
673
- export_dict["location"] = str(export_dict["location"])
722
+ export_dict["location"] = fspath(export_dict["location"])
674
723
  if not image_to_json:
675
724
  export_dict["_image"] = None
676
725
  if dry:
677
726
  return export_dict
678
727
  with open(path_json, "w", encoding="UTF-8") as file:
679
728
  json.dump(export_dict, file, indent=2)
680
- return None
729
+ return path_json
681
730
 
682
731
  def get_categories_from_current_state(self) -> set[str]:
683
732
  """Returns all active dumped categories"""
684
733
  return {ann.category_name for ann in self.get_annotation()}
734
+
735
def get_service_id_to_annotation_id(self) -> defaultdict[str, list[str]]:
    """
    Groups annotation ids by the service that generated them. The image annotations returned by
    `get_annotation()`, their sub categories and, for image annotations that carry an image, the summary entries
    are taken into account. Objects with an empty service id are skipped.

    :return: defaultdict with service ids as keys and lists of annotation ids as values
    """
    ids_by_service: defaultdict[str, list[str]] = defaultdict(list)

    def register(category: Any) -> None:
        # Only objects that actually carry a service id are recorded
        if category.service_id:
            ids_by_service[category.service_id].append(category.annotation_id)

    for ann in self.get_annotation():
        register(ann)
        for sub_cat_key in ann.sub_categories:
            register(ann.get_sub_category(sub_cat_key))
        if ann.image is None:
            continue
        # NOTE(review): this iterates the summary object itself - presumably it yields the summary keys; confirm
        for summary_cat_key in ann.image.summary:
            register(ann.get_summary(summary_cat_key))

    return ids_by_service
756
+
757
def get_annotation_id_to_annotation_maps(self) -> defaultdict[str, list[AnnotationMap]]:
    """
    Merges the annotation maps of all image annotations returned by `get_annotation()` into one dictionary. The
    range of ids is the union of all ImageAnnotation, CategoryAnnotation and ContainerAnnotation of the image.

    :return: defaultdict with annotation ids as keys and lists of AnnotationMap as values
    """
    merged: defaultdict[str, list[AnnotationMap]] = defaultdict(list)
    for ann in self.get_annotation():
        for ann_id, annotation_maps in ann.get_annotation_map().items():
            # the same id can be reported by several image annotations, hence the lists are concatenated
            merged[ann_id].extend(annotation_maps)

    return merged
@@ -509,6 +509,7 @@ class Page(Image):
509
509
  "location",
510
510
  "document_id",
511
511
  "page_number",
512
+ "angle",
512
513
  }
513
514
  include_residual_text_container: bool = True
514
515
 
@@ -971,7 +972,7 @@ class Page(Image):
971
972
  highest_hierarchy_only: bool = False,
972
973
  path: Optional[PathLikeOrStr] = None,
973
974
  dry: bool = False,
974
- ) -> Optional[ImageDict]:
975
+ ) -> Optional[Union[ImageDict, str]]:
975
976
  """
976
977
  Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
977
978
  base64 encodings.
@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
451
451
  return self.dataflow_builder
452
452
 
453
453
  @staticmethod
454
- def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
454
+ def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
455
455
  """
456
456
  This static method creates a CustomDataset instance from a dataset card.
457
457
 
@@ -264,7 +264,7 @@ class FintabnetBuilder(DataFlowBaseBuilder):
264
264
  add_summary=True,
265
265
  ),
266
266
  )
267
- df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.TABLE)])
267
+ df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
268
268
  df = FlattenData(df)
269
269
  df = MapData(df, lambda dp: dp[0])
270
270
 
@@ -180,13 +180,35 @@ class XfundBuilder(DataFlowBaseBuilder):
180
180
  "answer": TokenClasses.ANSWER,
181
181
  "header": TokenClasses.HEADER,
182
182
  }
183
- ner_token_to_id_mapping = self.categories.get_sub_categories(
184
- categories=LayoutType.WORD,
185
- sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
186
- keys=False,
187
- values_as_dict=True,
188
- name_as_key=True,
189
- )
183
+ if LayoutType.WORD in self.categories.get_categories(filtered=True, name_as_key=True):
184
+ ner_token_to_id_mapping = self.categories.get_sub_categories(
185
+ categories=LayoutType.WORD,
186
+ sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
187
+ keys=False,
188
+ values_as_dict=True,
189
+ name_as_key=True,
190
+ )
191
+ else:
192
+ ner_token_to_id_mapping = {
193
+ LayoutType.WORD: {
194
+ WordType.TAG: {BioTag.BEGIN: 3, BioTag.INSIDE: 1, BioTag.OUTSIDE: 2},
195
+ WordType.TOKEN_CLASS: {
196
+ TokenClasses.ANSWER: 3,
197
+ TokenClasses.HEADER: 4,
198
+ TokenClasses.OTHER: 1,
199
+ TokenClasses.QUESTION: 2,
200
+ },
201
+ WordType.TOKEN_TAG: {
202
+ TokenClassWithTag.B_ANSWER: 1,
203
+ TokenClassWithTag.B_HEADER: 2,
204
+ TokenClassWithTag.B_QUESTION: 3,
205
+ TokenClassWithTag.I_ANSWER: 4,
206
+ TokenClassWithTag.I_HEADER: 5,
207
+ TokenClassWithTag.I_QUESTION: 6,
208
+ BioTag.OUTSIDE: 7,
209
+ },
210
+ }
211
+ }
190
212
  df = MapData(
191
213
  df,
192
214
  xfund_to_image(
@@ -293,6 +293,8 @@ class Evaluator:
293
293
  show_words = kwargs.pop("show_words", False)
294
294
  show_token_class = kwargs.pop("show_token_class", True)
295
295
  ignore_default_token_class = kwargs.pop("ignore_default_token_class", False)
296
+ floating_text_block_categories = kwargs.pop("floating_text_block_categories", None)
297
+ include_residual_text_containers = kwargs.pop("include_residual_Text_containers", True)
296
298
 
297
299
  df_gt = self.dataset.dataflow.build(**kwargs)
298
300
  df_pr = self.dataset.dataflow.build(**kwargs)
@@ -301,7 +303,11 @@ class Evaluator:
301
303
  df_pr = MapData(df_pr, deepcopy)
302
304
  df_pr = self._clean_up_predict_dataflow_annotations(df_pr)
303
305
 
304
- page_parsing_component = PageParsingService(text_container=LayoutType.WORD)
306
+ page_parsing_component = PageParsingService(
307
+ text_container=LayoutType.WORD,
308
+ floating_text_block_categories=floating_text_block_categories, # type: ignore
309
+ include_residual_text_container=bool(include_residual_text_containers),
310
+ )
305
311
  df_gt = page_parsing_component.predict_dataflow(df_gt)
306
312
 
307
313
  if self.pipe_component:
@@ -1051,7 +1051,8 @@ class ModelCatalog:
1051
1051
  with jsonlines.open(path) as reader:
1052
1052
  for obj in reader:
1053
1053
  if not obj["name"] in ModelCatalog.CATALOG:
1054
- obj["categories"] = {int(key): get_type(val) for key, val in obj["categories"].items()}
1054
+ categories = obj.get("categories") or {}
1055
+ obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
1055
1056
  ModelCatalog.register(obj["name"], ModelProfile(**obj))
1056
1057
 
1057
1058
  @staticmethod
@@ -24,21 +24,25 @@ from typing import Optional
24
24
  from lazy_imports import try_import
25
25
 
26
26
  from ..utils.context import save_tmp_file
27
- from ..utils.file_utils import get_pdfplumber_requirement
27
+ from ..utils.file_utils import get_pdfplumber_requirement, get_pypdfium2_requirement
28
28
  from ..utils.settings import LayoutType, ObjectTypes
29
29
  from ..utils.types import Requirement
30
30
  from .base import DetectionResult, ModelCategories, PdfMiner
31
31
 
32
- with try_import() as import_guard:
32
+ with try_import() as pdfplumber_import_guard:
33
33
  from pdfplumber.pdf import PDF, Page
34
34
 
35
+ with try_import() as pypdfmium_import_guard:
36
+ import pypdfium2.raw as pypdfium_c
37
+ from pypdfium2 import PdfDocument
35
38
 
36
- def _to_detect_result(word: dict[str, str]) -> DetectionResult:
39
+
40
def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> DetectionResult:
    """
    Converts a word dictionary into a `DetectionResult`.

    :param word: dict with the entries "x0", "top", "x1", "bottom" and "text"
    :param class_name: category name assigned to the detection result
    :return: DetectionResult with the box `[x0, top, x1, bottom]` converted to floats, class id 1, the text and
             the given class name
    """
    x_min, y_min, x_max, y_max = (float(word[key]) for key in ("x0", "top", "x1", "bottom"))
    return DetectionResult(
        box=[x_min, y_min, x_max, y_max],
        class_id=1,
        text=word["text"],
        class_name=class_name,
    )
43
47
 
44
48
 
@@ -49,6 +53,7 @@ class PdfPlumberTextDetector(PdfMiner):
49
53
 
50
54
  pdf_plumber = PdfPlumberTextDetector()
51
55
  df = SerializerPdfDoc.load("path/to/document.pdf")
56
+ df.reset_state()
52
57
 
53
58
  for dp in df:
54
59
  detection_results = pdf_plumber.predict(dp["pdf_bytes"])
@@ -61,6 +66,8 @@ class PdfPlumberTextDetector(PdfMiner):
61
66
  pipe = DoctectionPipe([text_extract])
62
67
 
63
68
  df = pipe.analyze(path="path/to/document.pdf")
69
+ df.reset_state()
70
+
64
71
  for dp in df:
65
72
  ...
66
73
 
@@ -87,7 +94,7 @@ class PdfPlumberTextDetector(PdfMiner):
87
94
  self._page = PDF(fin).pages[0]
88
95
  self._pdf_bytes = pdf_bytes
89
96
  words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
90
- detect_results = list(map(_to_detect_result, words))
97
+ detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
91
98
  return detect_results
92
99
 
93
100
  @classmethod
@@ -113,3 +120,87 @@ class PdfPlumberTextDetector(PdfMiner):
113
120
 
114
121
  def get_category_names(self) -> tuple[ObjectTypes, ...]:
115
122
  return self.categories.get_categories(as_dict=False)
123
+
124
+
125
class Pdfmium2TextDetector(PdfMiner):
    """
    Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level

        pdfmium2 = Pdfmium2TextDetector()
        df = SerializerPdfDoc.load("path/to/document.pdf")
        df.reset_state()

        for dp in df:
            detection_results = pdfmium2.predict(dp["pdf_bytes"])

    To use it in a more integrated way:

        pdfmium2 = Pdfmium2TextDetector()
        text_extract = TextExtractionService(pdfmium2)

        pipe = DoctectionPipe([text_extract])

        df = pipe.analyze(path="path/to/document.pdf")
        df.reset_state()
        for dp in df:
            ...

    """

    def __init__(self) -> None:
        self.name = "Pdfmium"
        self.model_id = self.get_model_id()
        self.categories = ModelCategories(init_categories={1: LayoutType.LINE})
        # Cache used by `get_width_height`: the last opened page and the bytes it was created from.
        # NOTE(review): the annotation names pdfplumber's `Page`, but the object stored here is a pypdfium2 page.
        self._page: Optional[Page] = None
        self._pdf_bytes: Optional[bytes] = None

    def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
        """
        Call pypdfium2 and returns detected text as detection results

        :param pdf_bytes: bytes of a single pdf page
        :return: A list of DetectionResult, one per text object of the first page
        """

        pdf = PdfDocument(pdf_bytes)
        page = pdf.get_page(0)
        text = page.get_textpage()
        height = page.get_height()
        lines = []
        for obj in page.get_objects((pypdfium_c.FPDF_PAGEOBJ_TEXT,)):
            box = obj.get_pos()
            # text objects having a non-positive coordinate are skipped
            if all(x > 0 for x in box):
                # presumably `box` is (left, bottom, right, top) with the origin at the bottom of the page, which
                # is why the vertical values are subtracted from the page height - confirm against pypdfium2 docs
                lines.append(
                    {
                        "text": text.get_text_bounded(*box),
                        "x0": box[0],
                        "x1": box[2],
                        "top": height - box[3],
                        "bottom": height - box[1],
                    }
                )
        class_name = self.get_category_names()[0]
        return [_to_detect_result(line, class_name) for line in lines]

    @classmethod
    def get_requirements(cls) -> list[Requirement]:
        return [get_pypdfium2_requirement()]

    def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
        """
        Get the width and height of the full page

        :param pdf_bytes: pdf_bytes generating the pdf
        :return: width and height
        :raises ValueError: If no page could be loaded from `pdf_bytes`
        """

        if self._pdf_bytes != pdf_bytes or self._page is None:
            # if the pdf bytes is not equal to the cached pdf, will recalculate values
            pdf = PdfDocument(pdf_bytes)
            self._page = pdf.get_page(0)
            self._pdf_bytes = pdf_bytes
        if self._page is None:
            raise ValueError("Page not found")
        # The cached object is a pypdfium2 page. It has no pdfplumber-style `bbox`, so the size is always read
        # with `get_width` / `get_height`, for a cache hit as well.
        return self._page.get_width(), self._page.get_height()  # type: ignore

    def get_category_names(self) -> tuple[ObjectTypes, ...]:
        return self.categories.get_categories(as_dict=False)
@@ -421,6 +421,7 @@ class TesseractRotationTransformer(ImageTransformer):
421
421
  def __init__(self) -> None:
422
422
  self.name = fspath(_TESS_PATH) + "-rotation"
423
423
  self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
424
+ self.model_id = self.get_model_id()
424
425
 
425
426
  def transform(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
426
427
  """
@@ -23,7 +23,7 @@ builder method of a dataset.
23
23
  from collections import defaultdict
24
24
  from typing import Any, Literal, Mapping, Optional, Sequence, Union
25
25
 
26
- from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation, ImageAnnotation
26
+ from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation
27
27
  from ..datapoint.image import Image
28
28
  from ..utils.settings import ObjectTypes, SummaryType, TypeOrStr, get_type
29
29
  from .maputils import LabelSummarizer, curry
@@ -49,7 +49,7 @@ def cat_to_sub_cat(
49
49
  if cat_to_sub_cat_dict is None:
50
50
  return dp
51
51
  cat_to_sub_cat_dict_obj_type = {get_type(key): get_type(value) for key, value in cat_to_sub_cat_dict.items()}
52
- for ann in dp.get_annotation_iter(category_names=list(cat_to_sub_cat_dict_obj_type.keys())):
52
+ for ann in dp.get_annotation(category_names=list(cat_to_sub_cat_dict_obj_type.keys())):
53
53
  sub_cat_type = cat_to_sub_cat_dict_obj_type[get_type(ann.category_name)]
54
54
  sub_cat = ann.get_sub_category(sub_cat_type)
55
55
  if sub_cat:
@@ -88,13 +88,13 @@ def re_assign_cat_ids(
88
88
  :return: Image
89
89
  """
90
90
 
91
- anns_to_remove: list[ImageAnnotation] = []
92
- for ann in dp.get_annotation_iter():
91
+ ann_ids_to_remove: list[str] = []
92
+ for ann in dp.get_annotation():
93
93
  if categories_dict_name_as_key is not None:
94
94
  if ann.category_name in categories_dict_name_as_key:
95
95
  ann.category_id = categories_dict_name_as_key[ann.category_name]
96
96
  else:
97
- anns_to_remove.append(ann)
97
+ ann_ids_to_remove.append(ann.annotation_id)
98
98
 
99
99
  if cat_to_sub_cat_mapping:
100
100
  if ann.category_name in cat_to_sub_cat_mapping:
@@ -104,8 +104,7 @@ def re_assign_cat_ids(
104
104
  sub_category = ann.get_sub_category(key)
105
105
  sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
106
106
 
107
- for ann in anns_to_remove:
108
- dp.remove(ann)
107
+ dp.remove(annotation_ids=ann_ids_to_remove)
109
108
 
110
109
  return dp
111
110
 
@@ -249,7 +248,7 @@ def image_to_cat_id(
249
248
  raise ValueError(f"id_name_or_value must be in ('id', 'name', 'value') but is {id_name_or_value}")
250
249
 
251
250
  if category_names or sub_categories:
252
- for ann in dp.get_annotation_iter():
251
+ for ann in dp.get_annotation():
253
252
  if ann.category_name in category_names:
254
253
  cat_container[ann.category_name].append(ann.category_id)
255
254
  if ann.category_name in tmp_sub_category_names:
@@ -321,11 +320,11 @@ def remove_cats(
321
320
  if isinstance(summary_sub_categories, str):
322
321
  summary_sub_categories = [summary_sub_categories]
323
322
 
324
- anns_to_remove = []
323
+ ann_ids_to_remove = []
325
324
 
326
- for ann in dp.get_annotation_iter():
325
+ for ann in dp.get_annotation():
327
326
  if ann.category_name in category_names:
328
- anns_to_remove.append(ann)
327
+ ann_ids_to_remove.append(ann.annotation_id)
329
328
  if ann.category_name in sub_categories.keys():
330
329
  sub_cats_to_remove = sub_categories[ann.category_name]
331
330
  if isinstance(sub_cats_to_remove, str):
@@ -339,8 +338,7 @@ def remove_cats(
339
338
  for relation in relationships_to_remove:
340
339
  ann.remove_relationship(key=get_type(relation))
341
340
 
342
- for ann in anns_to_remove:
343
- dp.remove(ann)
341
+ dp.remove(annotation_ids=ann_ids_to_remove)
344
342
 
345
343
  if summary_sub_categories is not None:
346
344
  for sub_cat in summary_sub_categories:
@@ -129,7 +129,7 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
129
129
  img["height"] = dp.height
130
130
  img["file_name"] = dp.file_name
131
131
 
132
- for img_ann in dp.get_annotation_iter():
132
+ for img_ann in dp.get_annotation():
133
133
  ann: JsonDict = {
134
134
  "id": int("".join([s for s in img_ann.annotation_id if s.isdigit()])),
135
135
  "image_id": img["id"],
@@ -139,7 +139,11 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
139
139
  ann["score"] = img_ann.score
140
140
  ann["iscrowd"] = 0
141
141
  bounding_box = img_ann.get_bounding_box(dp.image_id)
142
- ann["area"] = bounding_box.area
142
+ ann["area"] = (
143
+ bounding_box.area
144
+ if bounding_box.absolute_coords
145
+ else bounding_box.transform(dp.width, dp.height, absolute_coords=True).area
146
+ )
143
147
  ann["bbox"] = bounding_box.to_list(mode="xywh")
144
148
  anns.append(ann)
145
149
 
@@ -41,7 +41,7 @@ with try_import() as d2_import_guard:
41
41
  from detectron2.structures import BoxMode
42
42
 
43
43
  with try_import() as wb_import_guard:
44
- from wandb import Classes
44
+ from wandb import Classes # type: ignore
45
45
  from wandb import Image as Wbimage
46
46
 
47
47
 
@@ -189,6 +189,7 @@ def to_wandb_image(
189
189
  class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
190
190
  else:
191
191
  class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
192
+ class_labels = dict(categories.items())
192
193
 
193
194
  for ann in anns:
194
195
  bounding_box = ann.get_bounding_box(dp.image_id)
@@ -127,7 +127,7 @@ def image_to_raw_layoutlm_features(
127
127
  all_boxes = []
128
128
  all_labels: list[int] = []
129
129
 
130
- anns = dp.get_annotation_iter(category_names=LayoutType.WORD)
130
+ anns = dp.get_annotation(category_names=LayoutType.WORD)
131
131
 
132
132
  word_id_to_segment_box = {}
133
133
  if segment_positions: