deepdoctection 0.33__py3-none-any.whl → 0.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +11 -12
- deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection/analyzer/_config.py +150 -0
- deepdoctection/analyzer/dd.py +42 -358
- deepdoctection/analyzer/factory.py +522 -0
- deepdoctection/configs/conf_dd_one.yaml +1 -0
- deepdoctection/datapoint/annotation.py +41 -3
- deepdoctection/datapoint/convert.py +6 -4
- deepdoctection/datapoint/image.py +132 -46
- deepdoctection/datapoint/view.py +2 -1
- deepdoctection/datasets/base.py +1 -1
- deepdoctection/datasets/instances/fintabnet.py +1 -1
- deepdoctection/datasets/instances/xfund.py +29 -7
- deepdoctection/eval/eval.py +7 -1
- deepdoctection/extern/model.py +2 -1
- deepdoctection/extern/pdftext.py +96 -5
- deepdoctection/extern/tessocr.py +1 -0
- deepdoctection/mapper/cats.py +11 -13
- deepdoctection/mapper/cocostruct.py +6 -2
- deepdoctection/mapper/d2struct.py +2 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/match.py +31 -0
- deepdoctection/mapper/misc.py +1 -1
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/pipe/anngen.py +27 -0
- deepdoctection/pipe/base.py +23 -0
- deepdoctection/pipe/common.py +123 -38
- deepdoctection/pipe/segment.py +1 -1
- deepdoctection/pipe/sub_layout.py +1 -1
- deepdoctection/utils/env_info.py +31 -2
- deepdoctection/utils/file_utils.py +19 -0
- deepdoctection/utils/fs.py +27 -4
- deepdoctection/utils/metacfg.py +12 -0
- deepdoctection/utils/pdf_utils.py +114 -6
- deepdoctection/utils/settings.py +3 -0
- {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/METADATA +20 -11
- {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/RECORD +40 -38
- {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
- {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
- {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
|
@@ -21,10 +21,11 @@ Dataclass Image
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
23
|
import json
|
|
24
|
+
from collections import defaultdict
|
|
24
25
|
from dataclasses import dataclass, field
|
|
25
|
-
from os import environ
|
|
26
|
+
from os import environ, fspath
|
|
26
27
|
from pathlib import Path
|
|
27
|
-
from typing import Any,
|
|
28
|
+
from typing import Any, Optional, Sequence, Union, no_type_check
|
|
28
29
|
|
|
29
30
|
import numpy as np
|
|
30
31
|
from numpy import uint8
|
|
@@ -33,7 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
|
|
|
33
34
|
from ..utils.identifier import get_uuid, is_uuid_like
|
|
34
35
|
from ..utils.settings import ObjectTypes, SummaryType, get_type
|
|
35
36
|
from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
|
|
36
|
-
from .annotation import Annotation, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
37
|
+
from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
|
|
37
38
|
from .box import crop_box_from_image, global_to_local_coords, intersection_box
|
|
38
39
|
from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
|
|
39
40
|
|
|
@@ -303,6 +304,15 @@ class Image:
|
|
|
303
304
|
|
|
304
305
|
return self.embeddings[image_id]
|
|
305
306
|
|
|
307
|
+
def remove_embedding(self, image_id: str) -> None:
|
|
308
|
+
"""
|
|
309
|
+
Remove an embedding from the image.
|
|
310
|
+
|
|
311
|
+
:param image_id: uuid string of the embedding image
|
|
312
|
+
"""
|
|
313
|
+
if image_id in self.embeddings:
|
|
314
|
+
self.embeddings.pop(image_id)
|
|
315
|
+
|
|
306
316
|
def _self_embedding(self) -> None:
|
|
307
317
|
if self._bbox is not None:
|
|
308
318
|
self.set_embedding(self.image_id, self._bbox)
|
|
@@ -387,39 +397,6 @@ class Image:
|
|
|
387
397
|
|
|
388
398
|
return list(anns)
|
|
389
399
|
|
|
390
|
-
def get_annotation_iter(
|
|
391
|
-
self,
|
|
392
|
-
category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
|
|
393
|
-
annotation_ids: Optional[Union[str, Sequence[str]]] = None,
|
|
394
|
-
service_id: Optional[Union[str, Sequence[str]]] = None,
|
|
395
|
-
model_id: Optional[Union[str, Sequence[str]]] = None,
|
|
396
|
-
session_ids: Optional[Union[str, Sequence[str]]] = None,
|
|
397
|
-
ignore_inactive: bool = True,
|
|
398
|
-
) -> Iterable[ImageAnnotation]:
|
|
399
|
-
"""
|
|
400
|
-
Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
|
|
401
|
-
|
|
402
|
-
:param category_names: A single name or list of names
|
|
403
|
-
:param annotation_ids: A single id or list of ids
|
|
404
|
-
:param service_id: A single service name or list of service names
|
|
405
|
-
:param model_id: A single model name or list of model names
|
|
406
|
-
:param session_ids: A single session id or list of session ids
|
|
407
|
-
:param ignore_inactive: If set to `True` only active annotations are returned.
|
|
408
|
-
|
|
409
|
-
:return: A (possibly empty) list of annotations
|
|
410
|
-
"""
|
|
411
|
-
|
|
412
|
-
return iter(
|
|
413
|
-
self.get_annotation(
|
|
414
|
-
category_names=category_names,
|
|
415
|
-
annotation_ids=annotation_ids,
|
|
416
|
-
service_id=service_id,
|
|
417
|
-
model_id=model_id,
|
|
418
|
-
session_ids=session_ids,
|
|
419
|
-
ignore_inactive=ignore_inactive,
|
|
420
|
-
)
|
|
421
|
-
)
|
|
422
|
-
|
|
423
400
|
def as_dict(self) -> dict[str, Any]:
|
|
424
401
|
"""
|
|
425
402
|
Returns the full image dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes
|
|
@@ -435,13 +412,22 @@ class Image:
|
|
|
435
412
|
img_dict["_image"] = None
|
|
436
413
|
return img_dict
|
|
437
414
|
|
|
415
|
+
def as_json(self) -> str:
|
|
416
|
+
"""
|
|
417
|
+
Returns the full image dataclass as json string.
|
|
418
|
+
|
|
419
|
+
:return: A json string.
|
|
420
|
+
"""
|
|
421
|
+
|
|
422
|
+
return json.dumps(self.as_dict(), indent=4)
|
|
423
|
+
|
|
438
424
|
@staticmethod
|
|
439
425
|
def remove_keys() -> list[str]:
|
|
440
426
|
"""
|
|
441
427
|
A list of attributes to suspend from as_dict creation.
|
|
442
428
|
"""
|
|
443
429
|
|
|
444
|
-
return ["_image"]
|
|
430
|
+
return ["_image", "_annotation_ids", "_category_name"]
|
|
445
431
|
|
|
446
432
|
def define_annotation_id(self, annotation: Annotation) -> str:
|
|
447
433
|
"""
|
|
@@ -456,17 +442,79 @@ class Image:
|
|
|
456
442
|
attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
|
|
457
443
|
return get_uuid(*attributes_values, str(self.image_id))
|
|
458
444
|
|
|
459
|
-
def remove(
|
|
445
|
+
def remove(
|
|
446
|
+
self,
|
|
447
|
+
annotation_ids: Optional[Union[str, list[str]]] = None,
|
|
448
|
+
service_ids: Optional[Union[str, list[str]]] = None,
|
|
449
|
+
) -> None:
|
|
460
450
|
"""
|
|
461
451
|
Instead of removing consider deactivating annotations.
|
|
462
452
|
|
|
463
453
|
Calls `List.remove`. Make sure, the element is in the list for otherwise a ValueError will be raised.
|
|
464
454
|
|
|
465
|
-
:param
|
|
455
|
+
:param annotation_ids: The annotation to remove
|
|
456
|
+
:param service_ids: The service id to remove
|
|
466
457
|
"""
|
|
458
|
+
ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()
|
|
459
|
+
|
|
460
|
+
if annotation_ids is not None:
|
|
461
|
+
annotation_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
|
|
462
|
+
|
|
463
|
+
for ann_id in annotation_ids:
|
|
464
|
+
if ann_id not in ann_id_to_annotation_maps:
|
|
465
|
+
raise ImageError(f"Annotation with id {ann_id} not found")
|
|
466
|
+
annotation_maps = ann_id_to_annotation_maps[ann_id]
|
|
467
|
+
|
|
468
|
+
for annotation_map in annotation_maps:
|
|
469
|
+
self._remove_by_annotation_id(ann_id, annotation_map)
|
|
470
|
+
|
|
471
|
+
if service_ids is not None:
|
|
472
|
+
service_ids = [service_ids] if isinstance(service_ids, str) else service_ids
|
|
473
|
+
service_id_to_annotation_id = self.get_service_id_to_annotation_id()
|
|
474
|
+
|
|
475
|
+
for service_id in service_ids:
|
|
476
|
+
if service_id not in service_id_to_annotation_id:
|
|
477
|
+
raise ImageError(f"Service id {service_id} not found")
|
|
478
|
+
annotation_ids = service_id_to_annotation_id[service_id]
|
|
479
|
+
|
|
480
|
+
for ann_id in annotation_ids:
|
|
481
|
+
if ann_id not in ann_id_to_annotation_maps:
|
|
482
|
+
raise ImageError(f"Annotation with id {ann_id} not found")
|
|
483
|
+
annotation_maps = ann_id_to_annotation_maps[ann_id]
|
|
484
|
+
|
|
485
|
+
for annotation_map in annotation_maps:
|
|
486
|
+
self._remove_by_annotation_id(ann_id, annotation_map)
|
|
487
|
+
|
|
488
|
+
def _remove_by_annotation_id(self, annotation_id: str, location_dict: AnnotationMap) -> None:
|
|
489
|
+
image_annotation_id = location_dict.image_annotation_id
|
|
490
|
+
annotations = self.get_annotation(annotation_ids=image_annotation_id)
|
|
491
|
+
if not annotations:
|
|
492
|
+
return
|
|
493
|
+
# There can only be one annotation with a given id
|
|
494
|
+
annotation = annotations[0]
|
|
495
|
+
|
|
496
|
+
if (
|
|
497
|
+
location_dict.sub_category_key is None
|
|
498
|
+
and location_dict.relationship_key is None
|
|
499
|
+
and location_dict.summary_key is None
|
|
500
|
+
):
|
|
501
|
+
self.annotations.remove(annotation)
|
|
502
|
+
self._annotation_ids.remove(annotation.annotation_id)
|
|
503
|
+
|
|
504
|
+
sub_category_key = location_dict.sub_category_key
|
|
505
|
+
|
|
506
|
+
if sub_category_key is not None:
|
|
507
|
+
annotation.remove_sub_category(sub_category_key)
|
|
467
508
|
|
|
468
|
-
|
|
469
|
-
|
|
509
|
+
relationship_key = location_dict.relationship_key
|
|
510
|
+
|
|
511
|
+
if relationship_key is not None:
|
|
512
|
+
annotation.remove_relationship(relationship_key, annotation_id)
|
|
513
|
+
|
|
514
|
+
summary_key = location_dict.summary_key
|
|
515
|
+
if summary_key is not None:
|
|
516
|
+
if annotation.image is not None:
|
|
517
|
+
annotation.image.summary.remove_sub_category(summary_key)
|
|
470
518
|
|
|
471
519
|
def image_ann_to_image(self, annotation_id: str, crop_image: bool = False) -> None:
|
|
472
520
|
"""
|
|
@@ -580,6 +628,7 @@ class Image:
|
|
|
580
628
|
if summary_dict := kwargs.get("_summary", kwargs.get("summary")):
|
|
581
629
|
image.summary = CategoryAnnotation.from_dict(**summary_dict)
|
|
582
630
|
image.summary.category_name = SummaryType.SUMMARY
|
|
631
|
+
|
|
583
632
|
return image
|
|
584
633
|
|
|
585
634
|
@classmethod
|
|
@@ -645,7 +694,7 @@ class Image:
|
|
|
645
694
|
highest_hierarchy_only: bool = False,
|
|
646
695
|
path: Optional[PathLikeOrStr] = None,
|
|
647
696
|
dry: bool = False,
|
|
648
|
-
) -> Optional[ImageDict]:
|
|
697
|
+
) -> Optional[Union[ImageDict, str]]:
|
|
649
698
|
"""
|
|
650
699
|
Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
|
|
651
700
|
base64 encodings.
|
|
@@ -664,21 +713,58 @@ class Image:
|
|
|
664
713
|
path = path / self.image_id
|
|
665
714
|
suffix = path.suffix
|
|
666
715
|
if suffix:
|
|
667
|
-
path_json = path
|
|
716
|
+
path_json = fspath(path).replace(suffix, ".json")
|
|
668
717
|
else:
|
|
669
|
-
path_json = path
|
|
718
|
+
path_json = fspath(path) + ".json"
|
|
670
719
|
if highest_hierarchy_only:
|
|
671
720
|
self.remove_image_from_lower_hierachy()
|
|
672
721
|
export_dict = self.as_dict()
|
|
673
|
-
export_dict["location"] =
|
|
722
|
+
export_dict["location"] = fspath(export_dict["location"])
|
|
674
723
|
if not image_to_json:
|
|
675
724
|
export_dict["_image"] = None
|
|
676
725
|
if dry:
|
|
677
726
|
return export_dict
|
|
678
727
|
with open(path_json, "w", encoding="UTF-8") as file:
|
|
679
728
|
json.dump(export_dict, file, indent=2)
|
|
680
|
-
return
|
|
729
|
+
return path_json
|
|
681
730
|
|
|
682
731
|
def get_categories_from_current_state(self) -> set[str]:
|
|
683
732
|
"""Returns all active dumped categories"""
|
|
684
733
|
return {ann.category_name for ann in self.get_annotation()}
|
|
734
|
+
|
|
735
|
+
def get_service_id_to_annotation_id(self) -> defaultdict[str, list[str]]:
|
|
736
|
+
"""
|
|
737
|
+
Returns a dictionary with service ids as keys and lists of annotation ids that have been generated by the
|
|
738
|
+
service
|
|
739
|
+
:return: default with service ids as keys and lists of annotation ids as values
|
|
740
|
+
"""
|
|
741
|
+
service_id_dict = defaultdict(list)
|
|
742
|
+
for ann in self.get_annotation():
|
|
743
|
+
if ann.service_id:
|
|
744
|
+
service_id_dict[ann.service_id].append(ann.annotation_id)
|
|
745
|
+
for sub_cat_key in ann.sub_categories:
|
|
746
|
+
sub_cat = ann.get_sub_category(sub_cat_key)
|
|
747
|
+
if sub_cat.service_id:
|
|
748
|
+
service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
|
|
749
|
+
if ann.image is not None:
|
|
750
|
+
for summary_cat_key in ann.image.summary:
|
|
751
|
+
summary_cat = ann.get_summary(summary_cat_key)
|
|
752
|
+
if summary_cat.service_id:
|
|
753
|
+
service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
|
|
754
|
+
|
|
755
|
+
return service_id_dict
|
|
756
|
+
|
|
757
|
+
def get_annotation_id_to_annotation_maps(self) -> defaultdict[str, list[AnnotationMap]]:
|
|
758
|
+
"""
|
|
759
|
+
Returns a dictionary with annotation ids as keys and lists of AnnotationMap as values. The range of ids
|
|
760
|
+
is the union of all ImageAnnotation, CategoryAnnotation and ContainerAnnotation of the image.
|
|
761
|
+
|
|
762
|
+
:return: default dict with annotation ids as keys and lists of AnnotationMap as values
|
|
763
|
+
"""
|
|
764
|
+
all_ann_id_dict = defaultdict(list)
|
|
765
|
+
for ann in self.get_annotation():
|
|
766
|
+
ann_id_dict = ann.get_annotation_map()
|
|
767
|
+
for key, val in ann_id_dict.items():
|
|
768
|
+
all_ann_id_dict[key].extend(val)
|
|
769
|
+
|
|
770
|
+
return all_ann_id_dict
|
deepdoctection/datapoint/view.py
CHANGED
|
@@ -509,6 +509,7 @@ class Page(Image):
|
|
|
509
509
|
"location",
|
|
510
510
|
"document_id",
|
|
511
511
|
"page_number",
|
|
512
|
+
"angle",
|
|
512
513
|
}
|
|
513
514
|
include_residual_text_container: bool = True
|
|
514
515
|
|
|
@@ -971,7 +972,7 @@ class Page(Image):
|
|
|
971
972
|
highest_hierarchy_only: bool = False,
|
|
972
973
|
path: Optional[PathLikeOrStr] = None,
|
|
973
974
|
dry: bool = False,
|
|
974
|
-
) -> Optional[ImageDict]:
|
|
975
|
+
) -> Optional[Union[ImageDict, str]]:
|
|
975
976
|
"""
|
|
976
977
|
Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
|
|
977
978
|
base64 encodings.
|
deepdoctection/datasets/base.py
CHANGED
|
@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
|
|
|
451
451
|
return self.dataflow_builder
|
|
452
452
|
|
|
453
453
|
@staticmethod
|
|
454
|
-
def from_dataset_card(file_path:
|
|
454
|
+
def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
|
|
455
455
|
"""
|
|
456
456
|
This static method creates a CustomDataset instance from a dataset card.
|
|
457
457
|
|
|
@@ -264,7 +264,7 @@ class FintabnetBuilder(DataFlowBaseBuilder):
|
|
|
264
264
|
add_summary=True,
|
|
265
265
|
),
|
|
266
266
|
)
|
|
267
|
-
df = MapData(df, lambda dp: [ann.image for ann in dp.
|
|
267
|
+
df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
|
|
268
268
|
df = FlattenData(df)
|
|
269
269
|
df = MapData(df, lambda dp: dp[0])
|
|
270
270
|
|
|
@@ -180,13 +180,35 @@ class XfundBuilder(DataFlowBaseBuilder):
|
|
|
180
180
|
"answer": TokenClasses.ANSWER,
|
|
181
181
|
"header": TokenClasses.HEADER,
|
|
182
182
|
}
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
183
|
+
if LayoutType.WORD in self.categories.get_categories(filtered=True, name_as_key=True):
|
|
184
|
+
ner_token_to_id_mapping = self.categories.get_sub_categories(
|
|
185
|
+
categories=LayoutType.WORD,
|
|
186
|
+
sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
|
|
187
|
+
keys=False,
|
|
188
|
+
values_as_dict=True,
|
|
189
|
+
name_as_key=True,
|
|
190
|
+
)
|
|
191
|
+
else:
|
|
192
|
+
ner_token_to_id_mapping = {
|
|
193
|
+
LayoutType.WORD: {
|
|
194
|
+
WordType.TAG: {BioTag.BEGIN: 3, BioTag.INSIDE: 1, BioTag.OUTSIDE: 2},
|
|
195
|
+
WordType.TOKEN_CLASS: {
|
|
196
|
+
TokenClasses.ANSWER: 3,
|
|
197
|
+
TokenClasses.HEADER: 4,
|
|
198
|
+
TokenClasses.OTHER: 1,
|
|
199
|
+
TokenClasses.QUESTION: 2,
|
|
200
|
+
},
|
|
201
|
+
WordType.TOKEN_TAG: {
|
|
202
|
+
TokenClassWithTag.B_ANSWER: 1,
|
|
203
|
+
TokenClassWithTag.B_HEADER: 2,
|
|
204
|
+
TokenClassWithTag.B_QUESTION: 3,
|
|
205
|
+
TokenClassWithTag.I_ANSWER: 4,
|
|
206
|
+
TokenClassWithTag.I_HEADER: 5,
|
|
207
|
+
TokenClassWithTag.I_QUESTION: 6,
|
|
208
|
+
BioTag.OUTSIDE: 7,
|
|
209
|
+
},
|
|
210
|
+
}
|
|
211
|
+
}
|
|
190
212
|
df = MapData(
|
|
191
213
|
df,
|
|
192
214
|
xfund_to_image(
|
deepdoctection/eval/eval.py
CHANGED
|
@@ -293,6 +293,8 @@ class Evaluator:
|
|
|
293
293
|
show_words = kwargs.pop("show_words", False)
|
|
294
294
|
show_token_class = kwargs.pop("show_token_class", True)
|
|
295
295
|
ignore_default_token_class = kwargs.pop("ignore_default_token_class", False)
|
|
296
|
+
floating_text_block_categories = kwargs.pop("floating_text_block_categories", None)
|
|
297
|
+
include_residual_text_containers = kwargs.pop("include_residual_Text_containers", True)
|
|
296
298
|
|
|
297
299
|
df_gt = self.dataset.dataflow.build(**kwargs)
|
|
298
300
|
df_pr = self.dataset.dataflow.build(**kwargs)
|
|
@@ -301,7 +303,11 @@ class Evaluator:
|
|
|
301
303
|
df_pr = MapData(df_pr, deepcopy)
|
|
302
304
|
df_pr = self._clean_up_predict_dataflow_annotations(df_pr)
|
|
303
305
|
|
|
304
|
-
page_parsing_component = PageParsingService(
|
|
306
|
+
page_parsing_component = PageParsingService(
|
|
307
|
+
text_container=LayoutType.WORD,
|
|
308
|
+
floating_text_block_categories=floating_text_block_categories, # type: ignore
|
|
309
|
+
include_residual_text_container=bool(include_residual_text_containers),
|
|
310
|
+
)
|
|
305
311
|
df_gt = page_parsing_component.predict_dataflow(df_gt)
|
|
306
312
|
|
|
307
313
|
if self.pipe_component:
|
deepdoctection/extern/model.py
CHANGED
|
@@ -1051,7 +1051,8 @@ class ModelCatalog:
|
|
|
1051
1051
|
with jsonlines.open(path) as reader:
|
|
1052
1052
|
for obj in reader:
|
|
1053
1053
|
if not obj["name"] in ModelCatalog.CATALOG:
|
|
1054
|
-
|
|
1054
|
+
categories = obj.get("categories") or {}
|
|
1055
|
+
obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
|
|
1055
1056
|
ModelCatalog.register(obj["name"], ModelProfile(**obj))
|
|
1056
1057
|
|
|
1057
1058
|
@staticmethod
|
deepdoctection/extern/pdftext.py
CHANGED
|
@@ -24,21 +24,25 @@ from typing import Optional
|
|
|
24
24
|
from lazy_imports import try_import
|
|
25
25
|
|
|
26
26
|
from ..utils.context import save_tmp_file
|
|
27
|
-
from ..utils.file_utils import get_pdfplumber_requirement
|
|
27
|
+
from ..utils.file_utils import get_pdfplumber_requirement, get_pypdfium2_requirement
|
|
28
28
|
from ..utils.settings import LayoutType, ObjectTypes
|
|
29
29
|
from ..utils.types import Requirement
|
|
30
30
|
from .base import DetectionResult, ModelCategories, PdfMiner
|
|
31
31
|
|
|
32
|
-
with try_import() as
|
|
32
|
+
with try_import() as pdfplumber_import_guard:
|
|
33
33
|
from pdfplumber.pdf import PDF, Page
|
|
34
34
|
|
|
35
|
+
with try_import() as pypdfmium_import_guard:
|
|
36
|
+
import pypdfium2.raw as pypdfium_c
|
|
37
|
+
from pypdfium2 import PdfDocument
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
|
|
40
|
+
def _to_detect_result(word: dict[str, str], class_name: ObjectTypes) -> DetectionResult:
|
|
37
41
|
return DetectionResult(
|
|
38
42
|
box=[float(word["x0"]), float(word["top"]), float(word["x1"]), float(word["bottom"])],
|
|
39
43
|
class_id=1,
|
|
40
44
|
text=word["text"],
|
|
41
|
-
class_name=
|
|
45
|
+
class_name=class_name,
|
|
42
46
|
)
|
|
43
47
|
|
|
44
48
|
|
|
@@ -49,6 +53,7 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
49
53
|
|
|
50
54
|
pdf_plumber = PdfPlumberTextDetector()
|
|
51
55
|
df = SerializerPdfDoc.load("path/to/document.pdf")
|
|
56
|
+
df.reset_state()
|
|
52
57
|
|
|
53
58
|
for dp in df:
|
|
54
59
|
detection_results = pdf_plumber.predict(dp["pdf_bytes"])
|
|
@@ -61,6 +66,8 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
61
66
|
pipe = DoctectionPipe([text_extract])
|
|
62
67
|
|
|
63
68
|
df = pipe.analyze(path="path/to/document.pdf")
|
|
69
|
+
df.reset_state()
|
|
70
|
+
|
|
64
71
|
for dp in df:
|
|
65
72
|
...
|
|
66
73
|
|
|
@@ -87,7 +94,7 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
87
94
|
self._page = PDF(fin).pages[0]
|
|
88
95
|
self._pdf_bytes = pdf_bytes
|
|
89
96
|
words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
|
|
90
|
-
detect_results =
|
|
97
|
+
detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
|
|
91
98
|
return detect_results
|
|
92
99
|
|
|
93
100
|
@classmethod
|
|
@@ -113,3 +120,87 @@ class PdfPlumberTextDetector(PdfMiner):
|
|
|
113
120
|
|
|
114
121
|
def get_category_names(self) -> tuple[ObjectTypes, ...]:
|
|
115
122
|
return self.categories.get_categories(as_dict=False)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class Pdfmium2TextDetector(PdfMiner):
|
|
126
|
+
"""
|
|
127
|
+
Text miner based on the pypdfium2 engine. It will return text on text line level and not on word level
|
|
128
|
+
|
|
129
|
+
pdfmium2 = Pdfmium2TextDetector()
|
|
130
|
+
df = SerializerPdfDoc.load("path/to/document.pdf")
|
|
131
|
+
df.reset_state()
|
|
132
|
+
|
|
133
|
+
for dp in df:
|
|
134
|
+
detection_results = pdfmium2.predict(dp["pdf_bytes"])
|
|
135
|
+
|
|
136
|
+
To use it in a more integrated way:
|
|
137
|
+
|
|
138
|
+
pdfmium2 = Pdfmium2TextDetector()
|
|
139
|
+
text_extract = TextExtractionService(pdfmium2)
|
|
140
|
+
|
|
141
|
+
pipe = DoctectionPipe([text_extract])
|
|
142
|
+
|
|
143
|
+
df = pipe.analyze(path="path/to/document.pdf")
|
|
144
|
+
df.reset_state()
|
|
145
|
+
for dp in df:
|
|
146
|
+
...
|
|
147
|
+
|
|
148
|
+
"""
|
|
149
|
+
|
|
150
|
+
def __init__(self) -> None:
|
|
151
|
+
self.name = "Pdfmium"
|
|
152
|
+
self.model_id = self.get_model_id()
|
|
153
|
+
self.categories = ModelCategories(init_categories={1: LayoutType.LINE})
|
|
154
|
+
self._page: Optional[Page] = None
|
|
155
|
+
|
|
156
|
+
def predict(self, pdf_bytes: bytes) -> list[DetectionResult]:
|
|
157
|
+
"""
|
|
158
|
+
Call pypdfium2 and returns detected text as detection results
|
|
159
|
+
|
|
160
|
+
:param pdf_bytes: bytes of a single pdf page
|
|
161
|
+
:return: A list of DetectionResult
|
|
162
|
+
"""
|
|
163
|
+
|
|
164
|
+
pdf = PdfDocument(pdf_bytes)
|
|
165
|
+
page = pdf.get_page(0)
|
|
166
|
+
text = page.get_textpage()
|
|
167
|
+
words = []
|
|
168
|
+
height = page.get_height()
|
|
169
|
+
for obj in page.get_objects((pypdfium_c.FPDF_PAGEOBJ_TEXT,)):
|
|
170
|
+
box = obj.get_pos()
|
|
171
|
+
if all(x > 0 for x in box):
|
|
172
|
+
words.append(
|
|
173
|
+
{
|
|
174
|
+
"text": text.get_text_bounded(*box),
|
|
175
|
+
"x0": box[0],
|
|
176
|
+
"x1": box[2],
|
|
177
|
+
"top": height - box[3],
|
|
178
|
+
"bottom": height - box[1],
|
|
179
|
+
}
|
|
180
|
+
)
|
|
181
|
+
detect_results = [_to_detect_result(word, self.get_category_names()[0]) for word in words]
|
|
182
|
+
return detect_results
|
|
183
|
+
|
|
184
|
+
@classmethod
|
|
185
|
+
def get_requirements(cls) -> list[Requirement]:
|
|
186
|
+
return [get_pypdfium2_requirement()]
|
|
187
|
+
|
|
188
|
+
def get_width_height(self, pdf_bytes: bytes) -> tuple[float, float]:
|
|
189
|
+
"""
|
|
190
|
+
Get the width and height of the full page
|
|
191
|
+
:param pdf_bytes: pdf_bytes generating the pdf
|
|
192
|
+
:return: width and height
|
|
193
|
+
"""
|
|
194
|
+
|
|
195
|
+
if self._pdf_bytes == pdf_bytes and self._page is not None:
|
|
196
|
+
return self._page.bbox[2], self._page.bbox[3] # pylint: disable=E1101
|
|
197
|
+
# if the pdf bytes is not equal to the cached pdf, will recalculate values
|
|
198
|
+
pdf = PdfDocument(pdf_bytes)
|
|
199
|
+
self._page = pdf.get_page(0)
|
|
200
|
+
self._pdf_bytes = pdf_bytes
|
|
201
|
+
if self._page is not None:
|
|
202
|
+
return self._page.get_width(), self._page.get_height() # type: ignore
|
|
203
|
+
raise ValueError("Page not found")
|
|
204
|
+
|
|
205
|
+
def get_category_names(self) -> tuple[ObjectTypes, ...]:
|
|
206
|
+
return self.categories.get_categories(as_dict=False)
|
deepdoctection/extern/tessocr.py
CHANGED
|
@@ -421,6 +421,7 @@ class TesseractRotationTransformer(ImageTransformer):
|
|
|
421
421
|
def __init__(self) -> None:
|
|
422
422
|
self.name = fspath(_TESS_PATH) + "-rotation"
|
|
423
423
|
self.categories = ModelCategories(init_categories={1: PageType.ANGLE})
|
|
424
|
+
self.model_id = self.get_model_id()
|
|
424
425
|
|
|
425
426
|
def transform(self, np_img: PixelValues, specification: DetectionResult) -> PixelValues:
|
|
426
427
|
"""
|
deepdoctection/mapper/cats.py
CHANGED
|
@@ -23,7 +23,7 @@ builder method of a dataset.
|
|
|
23
23
|
from collections import defaultdict
|
|
24
24
|
from typing import Any, Literal, Mapping, Optional, Sequence, Union
|
|
25
25
|
|
|
26
|
-
from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation
|
|
26
|
+
from ..datapoint.annotation import DEFAULT_CATEGORY_ID, CategoryAnnotation, ContainerAnnotation
|
|
27
27
|
from ..datapoint.image import Image
|
|
28
28
|
from ..utils.settings import ObjectTypes, SummaryType, TypeOrStr, get_type
|
|
29
29
|
from .maputils import LabelSummarizer, curry
|
|
@@ -49,7 +49,7 @@ def cat_to_sub_cat(
|
|
|
49
49
|
if cat_to_sub_cat_dict is None:
|
|
50
50
|
return dp
|
|
51
51
|
cat_to_sub_cat_dict_obj_type = {get_type(key): get_type(value) for key, value in cat_to_sub_cat_dict.items()}
|
|
52
|
-
for ann in dp.
|
|
52
|
+
for ann in dp.get_annotation(category_names=list(cat_to_sub_cat_dict_obj_type.keys())):
|
|
53
53
|
sub_cat_type = cat_to_sub_cat_dict_obj_type[get_type(ann.category_name)]
|
|
54
54
|
sub_cat = ann.get_sub_category(sub_cat_type)
|
|
55
55
|
if sub_cat:
|
|
@@ -88,13 +88,13 @@ def re_assign_cat_ids(
|
|
|
88
88
|
:return: Image
|
|
89
89
|
"""
|
|
90
90
|
|
|
91
|
-
|
|
92
|
-
for ann in dp.
|
|
91
|
+
ann_ids_to_remove: list[str] = []
|
|
92
|
+
for ann in dp.get_annotation():
|
|
93
93
|
if categories_dict_name_as_key is not None:
|
|
94
94
|
if ann.category_name in categories_dict_name_as_key:
|
|
95
95
|
ann.category_id = categories_dict_name_as_key[ann.category_name]
|
|
96
96
|
else:
|
|
97
|
-
|
|
97
|
+
ann_ids_to_remove.append(ann.annotation_id)
|
|
98
98
|
|
|
99
99
|
if cat_to_sub_cat_mapping:
|
|
100
100
|
if ann.category_name in cat_to_sub_cat_mapping:
|
|
@@ -104,8 +104,7 @@ def re_assign_cat_ids(
|
|
|
104
104
|
sub_category = ann.get_sub_category(key)
|
|
105
105
|
sub_category.category_id = sub_cat_values_dict.get(sub_category.category_name, DEFAULT_CATEGORY_ID)
|
|
106
106
|
|
|
107
|
-
|
|
108
|
-
dp.remove(ann)
|
|
107
|
+
dp.remove(annotation_ids=ann_ids_to_remove)
|
|
109
108
|
|
|
110
109
|
return dp
|
|
111
110
|
|
|
@@ -249,7 +248,7 @@ def image_to_cat_id(
|
|
|
249
248
|
raise ValueError(f"id_name_or_value must be in ('id', 'name', 'value') but is {id_name_or_value}")
|
|
250
249
|
|
|
251
250
|
if category_names or sub_categories:
|
|
252
|
-
for ann in dp.
|
|
251
|
+
for ann in dp.get_annotation():
|
|
253
252
|
if ann.category_name in category_names:
|
|
254
253
|
cat_container[ann.category_name].append(ann.category_id)
|
|
255
254
|
if ann.category_name in tmp_sub_category_names:
|
|
@@ -321,11 +320,11 @@ def remove_cats(
|
|
|
321
320
|
if isinstance(summary_sub_categories, str):
|
|
322
321
|
summary_sub_categories = [summary_sub_categories]
|
|
323
322
|
|
|
324
|
-
|
|
323
|
+
ann_ids_to_remove = []
|
|
325
324
|
|
|
326
|
-
for ann in dp.
|
|
325
|
+
for ann in dp.get_annotation():
|
|
327
326
|
if ann.category_name in category_names:
|
|
328
|
-
|
|
327
|
+
ann_ids_to_remove.append(ann.annotation_id)
|
|
329
328
|
if ann.category_name in sub_categories.keys():
|
|
330
329
|
sub_cats_to_remove = sub_categories[ann.category_name]
|
|
331
330
|
if isinstance(sub_cats_to_remove, str):
|
|
@@ -339,8 +338,7 @@ def remove_cats(
|
|
|
339
338
|
for relation in relationships_to_remove:
|
|
340
339
|
ann.remove_relationship(key=get_type(relation))
|
|
341
340
|
|
|
342
|
-
|
|
343
|
-
dp.remove(ann)
|
|
341
|
+
dp.remove(annotation_ids=ann_ids_to_remove)
|
|
344
342
|
|
|
345
343
|
if summary_sub_categories is not None:
|
|
346
344
|
for sub_cat in summary_sub_categories:
|
|
@@ -129,7 +129,7 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
|
|
|
129
129
|
img["height"] = dp.height
|
|
130
130
|
img["file_name"] = dp.file_name
|
|
131
131
|
|
|
132
|
-
for img_ann in dp.
|
|
132
|
+
for img_ann in dp.get_annotation():
|
|
133
133
|
ann: JsonDict = {
|
|
134
134
|
"id": int("".join([s for s in img_ann.annotation_id if s.isdigit()])),
|
|
135
135
|
"image_id": img["id"],
|
|
@@ -139,7 +139,11 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
|
|
|
139
139
|
ann["score"] = img_ann.score
|
|
140
140
|
ann["iscrowd"] = 0
|
|
141
141
|
bounding_box = img_ann.get_bounding_box(dp.image_id)
|
|
142
|
-
ann["area"] =
|
|
142
|
+
ann["area"] = (
|
|
143
|
+
bounding_box.area
|
|
144
|
+
if bounding_box.absolute_coords
|
|
145
|
+
else bounding_box.transform(dp.width, dp.height, absolute_coords=True).area
|
|
146
|
+
)
|
|
143
147
|
ann["bbox"] = bounding_box.to_list(mode="xywh")
|
|
144
148
|
anns.append(ann)
|
|
145
149
|
|
|
@@ -41,7 +41,7 @@ with try_import() as d2_import_guard:
|
|
|
41
41
|
from detectron2.structures import BoxMode
|
|
42
42
|
|
|
43
43
|
with try_import() as wb_import_guard:
|
|
44
|
-
from wandb import Classes
|
|
44
|
+
from wandb import Classes # type: ignore
|
|
45
45
|
from wandb import Image as Wbimage
|
|
46
46
|
|
|
47
47
|
|
|
@@ -189,6 +189,7 @@ def to_wandb_image(
|
|
|
189
189
|
class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
|
|
190
190
|
else:
|
|
191
191
|
class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
|
|
192
|
+
class_labels = dict(categories.items())
|
|
192
193
|
|
|
193
194
|
for ann in anns:
|
|
194
195
|
bounding_box = ann.get_bounding_box(dp.image_id)
|
|
@@ -127,7 +127,7 @@ def image_to_raw_layoutlm_features(
|
|
|
127
127
|
all_boxes = []
|
|
128
128
|
all_labels: list[int] = []
|
|
129
129
|
|
|
130
|
-
anns = dp.
|
|
130
|
+
anns = dp.get_annotation(category_names=LayoutType.WORD)
|
|
131
131
|
|
|
132
132
|
word_id_to_segment_box = {}
|
|
133
133
|
if segment_positions:
|