deepdoctection 0.35__py3-none-any.whl → 0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +5 -6
- deepdoctection/analyzer/_config.py +10 -18
- deepdoctection/analyzer/factory.py +214 -18
- deepdoctection/configs/conf_dd_one.yaml +4 -0
- deepdoctection/dataflow/custom_serialize.py +1 -1
- deepdoctection/datapoint/convert.py +11 -0
- deepdoctection/datapoint/image.py +2 -2
- deepdoctection/datapoint/view.py +90 -15
- deepdoctection/datasets/save.py +1 -1
- deepdoctection/eval/cocometric.py +59 -13
- deepdoctection/extern/base.py +2 -3
- deepdoctection/mapper/match.py +4 -2
- deepdoctection/mapper/misc.py +5 -1
- deepdoctection/pipe/doctectionpipe.py +77 -10
- deepdoctection/utils/fs.py +8 -7
- deepdoctection/utils/pdf_utils.py +45 -17
- deepdoctection/utils/utils.py +39 -0
- deepdoctection/utils/viz.py +49 -13
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/METADATA +116 -112
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/RECORD +23 -23
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/WHEEL +1 -1
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/LICENSE +0 -0
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/top_level.txt +0 -0
deepdoctection/datapoint/view.py
CHANGED
|
@@ -25,6 +25,7 @@ from copy import copy
|
|
|
25
25
|
from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union, no_type_check
|
|
26
26
|
|
|
27
27
|
import numpy as np
|
|
28
|
+
from typing_extensions import LiteralString
|
|
28
29
|
|
|
29
30
|
from ..utils.error import AnnotationError, ImageError
|
|
30
31
|
from ..utils.logger import LoggingRecord, logger
|
|
@@ -40,10 +41,12 @@ from ..utils.settings import (
|
|
|
40
41
|
WordType,
|
|
41
42
|
get_type,
|
|
42
43
|
)
|
|
44
|
+
from ..utils.transform import ResizeTransform
|
|
43
45
|
from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
|
|
44
46
|
from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
|
|
45
47
|
from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
|
|
46
48
|
from .box import BoundingBox, crop_box_from_image
|
|
49
|
+
from .convert import box_to_point4, point4_to_box
|
|
47
50
|
from .image import Image
|
|
48
51
|
|
|
49
52
|
|
|
@@ -101,7 +104,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
101
104
|
return np_image
|
|
102
105
|
raise AnnotationError(f"base_page.image is None for {self.annotation_id}")
|
|
103
106
|
|
|
104
|
-
def __getattr__(self, item: str) -> Optional[Union[str, int, list[str]]]:
|
|
107
|
+
def __getattr__(self, item: str) -> Optional[Union[str, int, list[str], list[ImageAnnotationBaseView]]]:
|
|
105
108
|
"""
|
|
106
109
|
Get attributes defined by registered `self.get_attribute_names()` in a multi step process:
|
|
107
110
|
|
|
@@ -126,6 +129,9 @@ class ImageAnnotationBaseView(ImageAnnotation):
|
|
|
126
129
|
if isinstance(sub_cat, ContainerAnnotation):
|
|
127
130
|
return sub_cat.value
|
|
128
131
|
return sub_cat.category_id
|
|
132
|
+
if item in self.relationships:
|
|
133
|
+
relationship_ids = self.get_relationship(get_type(item))
|
|
134
|
+
return self.base_page.get_annotation(annotation_ids=relationship_ids)
|
|
129
135
|
if self.image is not None:
|
|
130
136
|
if item in self.image.summary.sub_categories:
|
|
131
137
|
sub_cat = self.get_summary(get_type(item))
|
|
@@ -165,7 +171,11 @@ class Word(ImageAnnotationBaseView):
|
|
|
165
171
|
"""
|
|
166
172
|
|
|
167
173
|
def get_attribute_names(self) -> set[str]:
|
|
168
|
-
return
|
|
174
|
+
return (
|
|
175
|
+
set(WordType)
|
|
176
|
+
.union(super().get_attribute_names())
|
|
177
|
+
.union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
|
|
178
|
+
)
|
|
169
179
|
|
|
170
180
|
|
|
171
181
|
class Layout(ImageAnnotationBaseView):
|
|
@@ -246,7 +256,11 @@ class Layout(ImageAnnotationBaseView):
|
|
|
246
256
|
}
|
|
247
257
|
|
|
248
258
|
def get_attribute_names(self) -> set[str]:
|
|
249
|
-
return
|
|
259
|
+
return (
|
|
260
|
+
{"words", "text"}
|
|
261
|
+
.union(super().get_attribute_names())
|
|
262
|
+
.union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK})
|
|
263
|
+
)
|
|
250
264
|
|
|
251
265
|
def __len__(self) -> int:
|
|
252
266
|
"""len of text counted by number of characters"""
|
|
@@ -433,8 +447,8 @@ class ImageDefaults(TypedDict):
|
|
|
433
447
|
"""ImageDefaults"""
|
|
434
448
|
|
|
435
449
|
text_container: LayoutType
|
|
436
|
-
floating_text_block_categories: tuple[LayoutType, ...]
|
|
437
|
-
text_block_categories: tuple[LayoutType, ...]
|
|
450
|
+
floating_text_block_categories: tuple[Union[LayoutType, CellType], ...]
|
|
451
|
+
text_block_categories: tuple[Union[LayoutType, CellType], ...]
|
|
438
452
|
|
|
439
453
|
|
|
440
454
|
IMAGE_DEFAULTS: ImageDefaults = {
|
|
@@ -448,9 +462,13 @@ IMAGE_DEFAULTS: ImageDefaults = {
|
|
|
448
462
|
"text_block_categories": (
|
|
449
463
|
LayoutType.TEXT,
|
|
450
464
|
LayoutType.TITLE,
|
|
451
|
-
LayoutType.FIGURE,
|
|
452
465
|
LayoutType.LIST,
|
|
453
466
|
LayoutType.CELL,
|
|
467
|
+
LayoutType.FIGURE,
|
|
468
|
+
CellType.COLUMN_HEADER,
|
|
469
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
470
|
+
CellType.SPANNING,
|
|
471
|
+
CellType.ROW_HEADER,
|
|
454
472
|
),
|
|
455
473
|
}
|
|
456
474
|
|
|
@@ -510,6 +528,8 @@ class Page(Image):
|
|
|
510
528
|
"document_id",
|
|
511
529
|
"page_number",
|
|
512
530
|
"angle",
|
|
531
|
+
"figures",
|
|
532
|
+
"residual_layouts",
|
|
513
533
|
}
|
|
514
534
|
include_residual_text_container: bool = True
|
|
515
535
|
|
|
@@ -608,6 +628,41 @@ class Page(Image):
|
|
|
608
628
|
"""
|
|
609
629
|
return self.get_annotation(category_names=LayoutType.TABLE)
|
|
610
630
|
|
|
631
|
+
@property
|
|
632
|
+
def figures(self) -> list[ImageAnnotationBaseView]:
|
|
633
|
+
"""
|
|
634
|
+
A list of a figures.
|
|
635
|
+
"""
|
|
636
|
+
return self.get_annotation(category_names=LayoutType.FIGURE)
|
|
637
|
+
|
|
638
|
+
@property
|
|
639
|
+
def residual_layouts(self) -> list[ImageAnnotationBaseView]:
|
|
640
|
+
"""
|
|
641
|
+
A list of all residual layouts. Residual layouts are all layouts that are
|
|
642
|
+
- not floating text blocks,
|
|
643
|
+
- not text containers,
|
|
644
|
+
- not tables,
|
|
645
|
+
- not figures
|
|
646
|
+
- not cells
|
|
647
|
+
- not rows
|
|
648
|
+
- not columns
|
|
649
|
+
"""
|
|
650
|
+
return self.get_annotation(category_names=self._get_residual_layout())
|
|
651
|
+
|
|
652
|
+
def _get_residual_layout(self) -> list[LiteralString]:
|
|
653
|
+
layouts = copy(list(self.floating_text_block_categories))
|
|
654
|
+
layouts.extend(
|
|
655
|
+
[
|
|
656
|
+
LayoutType.TABLE,
|
|
657
|
+
LayoutType.FIGURE,
|
|
658
|
+
self.text_container,
|
|
659
|
+
LayoutType.CELL,
|
|
660
|
+
LayoutType.ROW,
|
|
661
|
+
LayoutType.COLUMN,
|
|
662
|
+
]
|
|
663
|
+
)
|
|
664
|
+
return [layout for layout in LayoutType if layout not in layouts]
|
|
665
|
+
|
|
611
666
|
@classmethod
|
|
612
667
|
def from_image(
|
|
613
668
|
cls,
|
|
@@ -801,12 +856,15 @@ class Page(Image):
|
|
|
801
856
|
self,
|
|
802
857
|
show_tables: bool = True,
|
|
803
858
|
show_layouts: bool = True,
|
|
859
|
+
show_figures: bool = False,
|
|
860
|
+
show_residual_layouts: bool = False,
|
|
804
861
|
show_cells: bool = True,
|
|
805
862
|
show_table_structure: bool = True,
|
|
806
863
|
show_words: bool = False,
|
|
807
864
|
show_token_class: bool = True,
|
|
808
865
|
ignore_default_token_class: bool = False,
|
|
809
866
|
interactive: bool = False,
|
|
867
|
+
scaled_width: int = 600,
|
|
810
868
|
**debug_kwargs: str,
|
|
811
869
|
) -> Optional[PixelValues]:
|
|
812
870
|
"""
|
|
@@ -827,12 +885,14 @@ class Page(Image):
|
|
|
827
885
|
|
|
828
886
|
:param show_tables: Will display all tables boxes as well as cells, rows and columns
|
|
829
887
|
:param show_layouts: Will display all other layout components.
|
|
888
|
+
:param show_figures: Will display all figures
|
|
830
889
|
:param show_cells: Will display cells within tables. (Only available if `show_tables=True`)
|
|
831
890
|
:param show_table_structure: Will display rows and columns
|
|
832
891
|
:param show_words: Will display bounding boxes around words labeled with token class and bio tag (experimental)
|
|
833
892
|
:param show_token_class: Will display token class instead of token tags (i.e. token classes with tags)
|
|
834
893
|
:param interactive: If set to True will open an interactive image, otherwise it will return a numpy array that
|
|
835
894
|
can be displayed differently.
|
|
895
|
+
:param scaled_width: Width of the image to display
|
|
836
896
|
:param ignore_default_token_class: Will ignore displaying word bounding boxes with default or None token class
|
|
837
897
|
label
|
|
838
898
|
:return: If `interactive=False` will return a numpy array.
|
|
@@ -858,6 +918,11 @@ class Page(Image):
|
|
|
858
918
|
box_stack.append(item.bbox)
|
|
859
919
|
category_names_list.append(item.category_name.value)
|
|
860
920
|
|
|
921
|
+
if show_figures and not debug_kwargs:
|
|
922
|
+
for item in self.figures:
|
|
923
|
+
box_stack.append(item.bbox)
|
|
924
|
+
category_names_list.append(item.category_name.value)
|
|
925
|
+
|
|
861
926
|
if show_tables and not debug_kwargs:
|
|
862
927
|
for table in self.tables:
|
|
863
928
|
box_stack.append(table.bbox)
|
|
@@ -914,24 +979,34 @@ class Page(Image):
|
|
|
914
979
|
else:
|
|
915
980
|
category_names_list.append(word.token_tag.value if word.token_tag is not None else None)
|
|
916
981
|
|
|
982
|
+
if show_residual_layouts and not debug_kwargs:
|
|
983
|
+
for item in self.residual_layouts:
|
|
984
|
+
box_stack.append(item.bbox)
|
|
985
|
+
category_names_list.append(item.category_name.value)
|
|
986
|
+
|
|
917
987
|
if self.image is not None:
|
|
988
|
+
scale_fx = scaled_width / self.width
|
|
989
|
+
scaled_height = int(self.height * scale_fx)
|
|
990
|
+
img = viz_handler.resize(self.image, scaled_width, scaled_height, "VIZ")
|
|
991
|
+
|
|
918
992
|
if box_stack:
|
|
919
993
|
boxes = np.vstack(box_stack)
|
|
994
|
+
boxes = box_to_point4(boxes)
|
|
995
|
+
resizer = ResizeTransform(self.height, self.width, scaled_height, scaled_width, "VIZ")
|
|
996
|
+
boxes = resizer.apply_coords(boxes)
|
|
997
|
+
boxes = point4_to_box(boxes)
|
|
920
998
|
if show_words:
|
|
921
999
|
img = draw_boxes(
|
|
922
|
-
|
|
923
|
-
boxes,
|
|
924
|
-
category_names_list,
|
|
1000
|
+
np_image=img,
|
|
1001
|
+
boxes=boxes,
|
|
1002
|
+
category_names_list=category_names_list,
|
|
925
1003
|
font_scale=1.0,
|
|
926
1004
|
rectangle_thickness=4,
|
|
927
1005
|
)
|
|
928
1006
|
else:
|
|
929
|
-
img = draw_boxes(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
img = viz_handler.resize(img, scaled_width, scaled_height, "VIZ")
|
|
933
|
-
else:
|
|
934
|
-
img = self.image
|
|
1007
|
+
img = draw_boxes(
|
|
1008
|
+
np_image=img, boxes=boxes, category_names_list=category_names_list, show_palette=False
|
|
1009
|
+
)
|
|
935
1010
|
|
|
936
1011
|
if interactive:
|
|
937
1012
|
interactive_imshow(img)
|
deepdoctection/datasets/save.py
CHANGED
deepdoctection/eval/cocometric.py
CHANGED
|
@@ -71,8 +71,8 @@ https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeva
|
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def _summarize( # type: ignore
|
|
74
|
-
self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100
|
|
75
|
-
) -> float:
|
|
74
|
+
self, ap: int = 1, iouThr: float = 0.9, areaRng: str = "all", maxDets: int = 100, per_category: bool = False
|
|
75
|
+
) -> Union[float, list[float]]:
|
|
76
76
|
# pylint: disable=C0103
|
|
77
77
|
p = self.params
|
|
78
78
|
iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}"
|
|
@@ -86,6 +86,36 @@ def _summarize( # type: ignore
|
|
|
86
86
|
|
|
87
87
|
aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
|
|
88
88
|
mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
|
|
89
|
+
if per_category:
|
|
90
|
+
if ap == 1:
|
|
91
|
+
s = self.eval["precision"]
|
|
92
|
+
num_classes = s.shape[2]
|
|
93
|
+
results_per_class = []
|
|
94
|
+
for idx in range(num_classes):
|
|
95
|
+
if iouThr is not None:
|
|
96
|
+
s = self.eval["precision"]
|
|
97
|
+
t = np.where(iouThr == p.iouThrs)[0]
|
|
98
|
+
s = s[t]
|
|
99
|
+
precision = s[:, :, idx, aind, mind]
|
|
100
|
+
precision = precision[precision > -1]
|
|
101
|
+
res = np.mean(precision) if precision.size else float("nan")
|
|
102
|
+
results_per_class.append(float(res))
|
|
103
|
+
print(f"Precision for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
|
|
104
|
+
else:
|
|
105
|
+
s = self.eval["recall"]
|
|
106
|
+
num_classes = s.shape[1]
|
|
107
|
+
results_per_class = []
|
|
108
|
+
for idx in range(num_classes):
|
|
109
|
+
if iouThr is not None:
|
|
110
|
+
s = self.eval["recall"]
|
|
111
|
+
t = np.where(iouThr == p.iouThrs)[0]
|
|
112
|
+
s = s[t]
|
|
113
|
+
recall = s[:, idx, aind, mind]
|
|
114
|
+
recall = recall[recall > -1]
|
|
115
|
+
res = np.mean(recall) if recall.size else float("nan")
|
|
116
|
+
results_per_class.append(float(res))
|
|
117
|
+
print(f"Recall for class {idx+1}: @[ IoU={iouStr} | area={areaRng} | maxDets={maxDets} ] = {res}")
|
|
118
|
+
return results_per_class
|
|
89
119
|
if ap == 1:
|
|
90
120
|
# dimension of precision: [TxRxKxAxM]
|
|
91
121
|
s = self.eval["precision"]
|
|
@@ -124,6 +154,7 @@ class CocoMetric(MetricBase):
|
|
|
124
154
|
mapper = image_to_coco
|
|
125
155
|
_f1_score = None
|
|
126
156
|
_f1_iou = None
|
|
157
|
+
_per_category = False
|
|
127
158
|
_params: dict[str, Union[list[int], list[list[int]]]] = {}
|
|
128
159
|
|
|
129
160
|
@classmethod
|
|
@@ -176,18 +207,28 @@ class CocoMetric(MetricBase):
|
|
|
176
207
|
|
|
177
208
|
if cls._f1_score:
|
|
178
209
|
summary_bbox = [
|
|
179
|
-
metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2]),
|
|
180
|
-
metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2]),
|
|
210
|
+
metric.summarize_f1(1, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
|
|
211
|
+
metric.summarize_f1(0, cls._f1_iou, maxDets=metric.params.maxDets[2], per_category=cls._per_category),
|
|
181
212
|
]
|
|
182
213
|
else:
|
|
183
214
|
metric.summarize()
|
|
184
215
|
summary_bbox = metric.stats
|
|
185
216
|
|
|
186
217
|
results = []
|
|
187
|
-
|
|
218
|
+
|
|
219
|
+
default_parameters = cls.get_summary_default_parameters()
|
|
220
|
+
if cls._per_category:
|
|
221
|
+
default_parameters = default_parameters * len(summary_bbox[0])
|
|
222
|
+
summary_bbox = [item for pair in zip(*summary_bbox) for item in pair]
|
|
223
|
+
val = 0
|
|
224
|
+
for idx, (params, value) in enumerate(zip(default_parameters, summary_bbox)):
|
|
188
225
|
params = copy(params)
|
|
189
226
|
params["mode"] = "bbox"
|
|
190
227
|
params["val"] = value
|
|
228
|
+
if cls._per_category:
|
|
229
|
+
if idx % 2 == 0:
|
|
230
|
+
val += 1
|
|
231
|
+
params["category_id"] = val
|
|
191
232
|
results.append(params)
|
|
192
233
|
|
|
193
234
|
return results
|
|
@@ -201,15 +242,16 @@ class CocoMetric(MetricBase):
|
|
|
201
242
|
area range and maximum detections.
|
|
202
243
|
"""
|
|
203
244
|
if cls._f1_score:
|
|
245
|
+
for el, idx in zip(_F1_DEFAULTS, [2, 2]):
|
|
246
|
+
if cls._params:
|
|
247
|
+
if cls._params.get("maxDets") is not None:
|
|
248
|
+
el["maxDets"] = cls._params["maxDets"][idx]
|
|
249
|
+
el["iouThr"] = cls._f1_iou
|
|
250
|
+
return _F1_DEFAULTS
|
|
251
|
+
|
|
252
|
+
for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
|
|
204
253
|
if cls._params:
|
|
205
254
|
if cls._params.get("maxDets") is not None:
|
|
206
|
-
for el, idx in zip(_F1_DEFAULTS, [2, 2]):
|
|
207
|
-
el["maxDets"] = cls._params["maxDets"][idx]
|
|
208
|
-
el["iouThr"] = cls._f1_iou
|
|
209
|
-
return _F1_DEFAULTS
|
|
210
|
-
if cls._params:
|
|
211
|
-
if cls._params.get("maxDets") is not None:
|
|
212
|
-
for el, idx in zip(_COCOEVAL_DEFAULTS, _MAX_DET_INDEX):
|
|
213
255
|
el["maxDets"] = cls._params["maxDets"][idx]
|
|
214
256
|
return _COCOEVAL_DEFAULTS
|
|
215
257
|
|
|
@@ -220,13 +262,16 @@ class CocoMetric(MetricBase):
|
|
|
220
262
|
area_range: Optional[list[list[int]]] = None,
|
|
221
263
|
f1_score: bool = False,
|
|
222
264
|
f1_iou: float = 0.9,
|
|
265
|
+
per_category: bool = False,
|
|
223
266
|
) -> None:
|
|
224
267
|
"""
|
|
225
268
|
Setting params for different coco metric modes.
|
|
226
269
|
|
|
227
270
|
:param max_detections: The maximum number of detections to consider
|
|
228
271
|
:param area_range: The area range to classify objects as "all", "small", "medium" and "large"
|
|
229
|
-
:param f1_score: Will use f1 score setting with default iouThr 0.9
|
|
272
|
+
:param f1_score: Will use f1 score setting with default iouThr 0.9. To be more precise it does not calculate
|
|
273
|
+
the f1 score but the precision and recall for a given iou threshold. Use the harmonic mean to
|
|
274
|
+
get the ultimate f1 score.
|
|
230
275
|
:param f1_iou: Use with f1_score True and reset the f1 iou threshold
|
|
231
276
|
"""
|
|
232
277
|
if max_detections is not None:
|
|
@@ -238,6 +283,7 @@ class CocoMetric(MetricBase):
|
|
|
238
283
|
|
|
239
284
|
cls._f1_score = f1_score
|
|
240
285
|
cls._f1_iou = f1_iou
|
|
286
|
+
cls._per_category = per_category
|
|
241
287
|
|
|
242
288
|
@classmethod
|
|
243
289
|
def get_requirements(cls) -> list[Requirement]:
|
deepdoctection/extern/base.py
CHANGED
|
@@ -69,8 +69,7 @@ class ModelCategories:
|
|
|
69
69
|
if self.init_categories:
|
|
70
70
|
self._init_categories = MappingProxyType({key: get_type(val) for key, val in self.init_categories.items()})
|
|
71
71
|
else:
|
|
72
|
-
|
|
73
|
-
self._init_categories = MappingProxyType({})
|
|
72
|
+
self._init_categories = MappingProxyType({})
|
|
74
73
|
self.categories = self._init_categories
|
|
75
74
|
|
|
76
75
|
@overload
|
|
@@ -181,7 +180,7 @@ class NerModelCategories(ModelCategories):
|
|
|
181
180
|
self._init_categories = self.merge_bio_semantics_categories(
|
|
182
181
|
self._categories_semantics, self._categories_bio
|
|
183
182
|
)
|
|
184
|
-
|
|
183
|
+
self.categories = self._init_categories
|
|
185
184
|
|
|
186
185
|
@staticmethod
|
|
187
186
|
def merge_bio_semantics_categories(
|
deepdoctection/mapper/match.py
CHANGED
|
@@ -193,5 +193,7 @@ def match_anns_by_distance(
|
|
|
193
193
|
child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
|
|
194
194
|
child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
|
|
195
195
|
parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
|
|
196
|
-
|
|
197
|
-
|
|
196
|
+
if child_centers and parent_centers:
|
|
197
|
+
child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
|
|
198
|
+
return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
|
|
199
|
+
return []
|
deepdoctection/mapper/misc.py
CHANGED
|
@@ -27,7 +27,7 @@ from typing import Mapping, Optional, Sequence, Union
|
|
|
27
27
|
|
|
28
28
|
from lazy_imports import try_import
|
|
29
29
|
|
|
30
|
-
from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
|
|
30
|
+
from ..datapoint.convert import convert_bytes_to_np_array, convert_pdf_bytes_to_np_array_v2
|
|
31
31
|
from ..datapoint.image import Image
|
|
32
32
|
from ..utils.fs import get_load_image_func, load_image_from_file
|
|
33
33
|
from ..utils.types import JsonDict
|
|
@@ -49,6 +49,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
49
49
|
|
|
50
50
|
file_name: Optional[str]
|
|
51
51
|
location: Optional[str]
|
|
52
|
+
image_bytes: Optional[bytes] = None
|
|
52
53
|
|
|
53
54
|
if isinstance(dp, str):
|
|
54
55
|
_, file_name = os.path.split(dp)
|
|
@@ -62,6 +63,7 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
62
63
|
document_id = dp.get("document_id")
|
|
63
64
|
if location == "":
|
|
64
65
|
location = str(dp.get("path", ""))
|
|
66
|
+
image_bytes = dp.get("image_bytes")
|
|
65
67
|
else:
|
|
66
68
|
raise TypeError("datapoint not of expected type for converting to image")
|
|
67
69
|
|
|
@@ -76,6 +78,8 @@ def to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int]
|
|
|
76
78
|
if dp_image.pdf_bytes is not None:
|
|
77
79
|
if isinstance(dp_image.pdf_bytes, bytes):
|
|
78
80
|
dp_image.image = convert_pdf_bytes_to_np_array_v2(dp_image.pdf_bytes, dpi=dpi)
|
|
81
|
+
elif image_bytes is not None:
|
|
82
|
+
dp_image.image = convert_bytes_to_np_array(image_bytes)
|
|
79
83
|
else:
|
|
80
84
|
dp_image.image = load_image_from_file(location)
|
|
81
85
|
|
|
deepdoctection/pipe/doctectionpipe.py
CHANGED
|
@@ -23,31 +23,38 @@ import os
|
|
|
23
23
|
from pathlib import Path
|
|
24
24
|
from typing import List, Mapping, Optional, Sequence, Tuple, Union
|
|
25
25
|
|
|
26
|
-
from ..dataflow import DataFlow, MapData
|
|
26
|
+
from ..dataflow import CustomDataFromIterable, DataFlow, DataFromList, MapData
|
|
27
27
|
from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
|
|
28
28
|
from ..datapoint.image import Image
|
|
29
29
|
from ..datapoint.view import IMAGE_DEFAULTS
|
|
30
30
|
from ..mapper.maputils import curry
|
|
31
31
|
from ..mapper.misc import to_image
|
|
32
32
|
from ..utils.fs import maybe_path_or_pdf
|
|
33
|
+
from ..utils.identifier import get_uuid_from_str
|
|
33
34
|
from ..utils.logger import LoggingRecord, logger
|
|
35
|
+
from ..utils.pdf_utils import PDFStreamer
|
|
34
36
|
from ..utils.types import PathLikeOrStr
|
|
37
|
+
from ..utils.utils import is_file_extension
|
|
35
38
|
from .base import Pipeline, PipelineComponent
|
|
36
39
|
from .common import PageParsingService
|
|
37
40
|
|
|
38
41
|
|
|
39
42
|
def _collect_from_kwargs(
|
|
40
|
-
**kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
41
|
-
) -> Tuple[Optional[str],
|
|
43
|
+
**kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
44
|
+
) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
|
|
45
|
+
b_bytes = kwargs.get("bytes")
|
|
42
46
|
dataset_dataflow = kwargs.get("dataset_dataflow")
|
|
43
47
|
path = kwargs.get("path")
|
|
44
48
|
if path is None and dataset_dataflow is None:
|
|
45
49
|
raise ValueError("Pass either path or dataset_dataflow as argument")
|
|
50
|
+
if path is None and b_bytes:
|
|
51
|
+
raise ValueError("When passing bytes, a path to the source document must be provided")
|
|
46
52
|
|
|
47
53
|
shuffle = kwargs.get("shuffle", False)
|
|
48
54
|
if not isinstance(shuffle, bool):
|
|
49
55
|
raise TypeError(f"shuffle must be of type bool but is of type {type(shuffle)}")
|
|
50
56
|
|
|
57
|
+
file_type = None
|
|
51
58
|
doc_path = None
|
|
52
59
|
if path:
|
|
53
60
|
if not isinstance(path, (str, Path)):
|
|
@@ -56,15 +63,27 @@ def _collect_from_kwargs(
|
|
|
56
63
|
if path_type == 2:
|
|
57
64
|
doc_path = path
|
|
58
65
|
path = None
|
|
66
|
+
file_type = ".pdf"
|
|
67
|
+
elif path_type == 3:
|
|
68
|
+
if is_file_extension(path, ".jpg"):
|
|
69
|
+
file_type = ".jpg"
|
|
70
|
+
if is_file_extension(path, ".png"):
|
|
71
|
+
file_type = ".png"
|
|
72
|
+
if is_file_extension(path, ".jpeg"):
|
|
73
|
+
file_type = ".jpeg"
|
|
74
|
+
if not b_bytes:
|
|
75
|
+
raise ValueError("When passing a path to a single image, bytes of the image must be passed")
|
|
59
76
|
elif not path_type:
|
|
60
77
|
raise ValueError("Pass only a path to a directory or to a pdf file")
|
|
61
78
|
|
|
62
|
-
file_type = kwargs.get(
|
|
79
|
+
file_type = kwargs.get(
|
|
80
|
+
"file_type", [".jpg", ".png", ".jpeg", ".tif"] if file_type is None else file_type # type: ignore
|
|
81
|
+
)
|
|
63
82
|
|
|
64
83
|
max_datapoints = kwargs.get("max_datapoints")
|
|
65
84
|
if not isinstance(max_datapoints, (int, type(None))):
|
|
66
85
|
raise TypeError(f"max_datapoints must be of type int, but is of type {type(max_datapoints)}")
|
|
67
|
-
return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow # type: ignore
|
|
86
|
+
return path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes # type: ignore
|
|
68
87
|
|
|
69
88
|
|
|
70
89
|
@curry
|
|
@@ -142,12 +161,18 @@ class DoctectionPipe(Pipeline):
|
|
|
142
161
|
|
|
143
162
|
super().__init__(pipeline_component_list)
|
|
144
163
|
|
|
145
|
-
def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]])
|
|
146
|
-
|
|
164
|
+
def _entry(self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) \
|
|
165
|
+
-> DataFlow:
|
|
166
|
+
path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow, b_bytes = _collect_from_kwargs(**kwargs)
|
|
147
167
|
|
|
148
168
|
df: DataFlow
|
|
149
169
|
|
|
150
|
-
if isinstance(
|
|
170
|
+
if isinstance(b_bytes, bytes):
|
|
171
|
+
df = DoctectionPipe.bytes_to_dataflow(path=doc_path if path is None else path,
|
|
172
|
+
b_bytes=b_bytes,
|
|
173
|
+
file_type=file_type)
|
|
174
|
+
|
|
175
|
+
elif isinstance(path, (str, Path)):
|
|
151
176
|
if not isinstance(file_type, (str, list)):
|
|
152
177
|
raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
|
|
153
178
|
df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
|
|
@@ -162,7 +187,7 @@ class DoctectionPipe(Pipeline):
|
|
|
162
187
|
|
|
163
188
|
df = MapData(df, _proto_process(path, doc_path))
|
|
164
189
|
if dataset_dataflow is None:
|
|
165
|
-
df = MapData(df, _to_image(dpi=300)) # pylint: disable=E1120
|
|
190
|
+
df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300))) # pylint: disable=E1120
|
|
166
191
|
return df
|
|
167
192
|
|
|
168
193
|
@staticmethod
|
|
@@ -197,6 +222,44 @@ class DoctectionPipe(Pipeline):
|
|
|
197
222
|
"""
|
|
198
223
|
return _doc_to_dataflow(path, max_datapoints)
|
|
199
224
|
|
|
225
|
+
@staticmethod
|
|
226
|
+
def bytes_to_dataflow(
|
|
227
|
+
path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
|
|
228
|
+
) -> DataFlow:
|
|
229
|
+
"""
|
|
230
|
+
Converts a bytes object to a dataflow
|
|
231
|
+
|
|
232
|
+
:param path: path to directory or an image file
|
|
233
|
+
:param b_bytes: bytes object
|
|
234
|
+
:param file_type: e.g. ".pdf", ".jpg" or [".jpg", ".png", ".jpeg", ".tif"]
|
|
235
|
+
:param max_datapoints: max number of datapoints to consider
|
|
236
|
+
:return: DataFlow
|
|
237
|
+
"""
|
|
238
|
+
|
|
239
|
+
file_name = os.path.split(path)[1]
|
|
240
|
+
if isinstance(file_type, str):
|
|
241
|
+
if file_type == ".pdf":
|
|
242
|
+
prefix, suffix = os.path.splitext(file_name)
|
|
243
|
+
df: DataFlow
|
|
244
|
+
df = CustomDataFromIterable(PDFStreamer(path_or_bytes=b_bytes), max_datapoints=max_datapoints)
|
|
245
|
+
df = MapData(
|
|
246
|
+
df,
|
|
247
|
+
lambda dp: {
|
|
248
|
+
"path": path,
|
|
249
|
+
"file_name": prefix + f"_{dp[1]}" + suffix,
|
|
250
|
+
"pdf_bytes": dp[0],
|
|
251
|
+
"page_number": dp[1],
|
|
252
|
+
"document_id": get_uuid_from_str(prefix),
|
|
253
|
+
},
|
|
254
|
+
)
|
|
255
|
+
else:
|
|
256
|
+
df = DataFromList(lst=[{"path": path, "file_name": file_name, "image_bytes": b_bytes}])
|
|
257
|
+
return df
|
|
258
|
+
raise ValueError(
|
|
259
|
+
f"pass: {path}, b_bytes: {b_bytes!r}, file_type: {file_type} and max_datapoints: {max_datapoints} "
|
|
260
|
+
f"not supported"
|
|
261
|
+
)
|
|
262
|
+
|
|
200
263
|
def dataflow_to_page(self, df: DataFlow) -> DataFlow:
|
|
201
264
|
"""
|
|
202
265
|
Converts a dataflow of images to a dataflow of pages
|
|
@@ -206,7 +269,9 @@ class DoctectionPipe(Pipeline):
|
|
|
206
269
|
"""
|
|
207
270
|
return self.page_parser.predict_dataflow(df)
|
|
208
271
|
|
|
209
|
-
def analyze(
|
|
272
|
+
def analyze(
|
|
273
|
+
self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
|
|
274
|
+
) -> DataFlow:
|
|
210
275
|
"""
|
|
211
276
|
`kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
|
|
212
277
|
|
|
@@ -215,6 +280,8 @@ class DoctectionPipe(Pipeline):
|
|
|
215
280
|
only the first page is processed through the pipeline.
|
|
216
281
|
Alternatively, a path to a pdf document with multiple pages.
|
|
217
282
|
|
|
283
|
+
`kwargs key bytes:` A bytes object of an image
|
|
284
|
+
|
|
218
285
|
`kwargs key file_type:` Selection of the file type, if: args:`file_type` is passed
|
|
219
286
|
|
|
220
287
|
`kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
|
deepdoctection/utils/fs.py
CHANGED
|
@@ -227,20 +227,21 @@ def get_load_image_func(
|
|
|
227
227
|
|
|
228
228
|
def maybe_path_or_pdf(path: PathLikeOrStr) -> int:
|
|
229
229
|
"""
|
|
230
|
-
Checks if the path points to a directory or a
|
|
231
|
-
if the path points to a pdf doc or 0
|
|
230
|
+
Checks if the path points to a directory, a pdf document or a single image. Returns 1 if the path points to a
|
|
231
|
+
directory, 2 if the path points to a pdf doc and 3 if path points to either a PNG, JPG or JPEG or 0 if none of the
|
|
232
|
+
previous is true.
|
|
232
233
|
|
|
233
234
|
:param path: A path
|
|
234
|
-
:return: A value of 0,1,2
|
|
235
|
+
:return: A value of 0,1,2,3
|
|
235
236
|
"""
|
|
236
237
|
|
|
237
|
-
|
|
238
|
-
if is_dir:
|
|
238
|
+
if os.path.isdir(path):
|
|
239
239
|
return 1
|
|
240
240
|
file_name = os.path.split(path)[1]
|
|
241
|
-
|
|
242
|
-
if is_pdf:
|
|
241
|
+
if is_file_extension(file_name, ".pdf"):
|
|
243
242
|
return 2
|
|
243
|
+
if is_file_extension(file_name, [".png", ".jpeg", ".jpg", ".tif"]):
|
|
244
|
+
return 3
|
|
244
245
|
return 0
|
|
245
246
|
|
|
246
247
|
|