deepdoctection-0.33-py3-none-any.whl → deepdoctection-0.34-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +6 -3
- deepdoctection/analyzer/dd.py +39 -31
- deepdoctection/datapoint/annotation.py +40 -2
- deepdoctection/datapoint/image.py +117 -41
- deepdoctection/datapoint/view.py +1 -1
- deepdoctection/datasets/base.py +1 -1
- deepdoctection/datasets/instances/fintabnet.py +1 -1
- deepdoctection/datasets/instances/xfund.py +29 -7
- deepdoctection/eval/eval.py +7 -1
- deepdoctection/extern/model.py +2 -1
- deepdoctection/mapper/cats.py +11 -13
- deepdoctection/mapper/cocostruct.py +6 -2
- deepdoctection/mapper/d2struct.py +2 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/match.py +31 -0
- deepdoctection/mapper/misc.py +1 -1
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/pipe/anngen.py +27 -0
- deepdoctection/pipe/base.py +23 -0
- deepdoctection/pipe/common.py +123 -38
- deepdoctection/pipe/segment.py +1 -1
- deepdoctection/pipe/sub_layout.py +1 -1
- deepdoctection/utils/env_info.py +1 -1
- deepdoctection/utils/fs.py +27 -4
- deepdoctection/utils/pdf_utils.py +28 -3
- deepdoctection/utils/settings.py +3 -0
- {deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/METADATA +4 -1
- {deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/RECORD +31 -31
- {deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- {deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/cocostruct.py
CHANGED

@@ -129,7 +129,7 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
     img["height"] = dp.height
     img["file_name"] = dp.file_name

-    for img_ann in dp.
+    for img_ann in dp.get_annotation():
         ann: JsonDict = {
             "id": int("".join([s for s in img_ann.annotation_id if s.isdigit()])),
             "image_id": img["id"],

@@ -139,7 +139,11 @@ def image_to_coco(dp: Image) -> tuple[JsonDict, list[JsonDict]]:
         ann["score"] = img_ann.score
         ann["iscrowd"] = 0
         bounding_box = img_ann.get_bounding_box(dp.image_id)
-        ann["area"] =
+        ann["area"] = (
+            bounding_box.area
+            if bounding_box.absolute_coords
+            else bounding_box.transform(dp.width, dp.height, absolute_coords=True).area
+        )
         ann["bbox"] = bounding_box.to_list(mode="xywh")
         anns.append(ann)
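The second hunk reflects that COCO expects areas in absolute pixels: the box area is used directly only when the box already carries absolute coordinates, otherwise the box is first transformed with the page width and height. A minimal, self-contained sketch of that idea; the coco_area helper below is hypothetical and not the library's BoundingBox API.

def coco_area(box_xywh, absolute_coords, img_width, img_height):
    """Return a COCO-style 'area' for a box given as (x, y, w, h)."""
    x, y, w, h = box_xywh
    if absolute_coords:
        return w * h
    # relative coordinates in [0, 1] must be scaled to pixels first
    return (w * img_width) * (h * img_height)

# a relative 10% x 10% box on a 1000 x 800 page covers 8000 px^2, not 0.01
print(coco_area((0.1, 0.2, 0.1, 0.1), absolute_coords=False, img_width=1000, img_height=800))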
deepdoctection/mapper/d2struct.py
CHANGED

@@ -41,7 +41,7 @@ with try_import() as d2_import_guard:
     from detectron2.structures import BoxMode

 with try_import() as wb_import_guard:
-    from wandb import Classes
+    from wandb import Classes  # type: ignore
     from wandb import Image as Wbimage


@@ -189,6 +189,7 @@ def to_wandb_image(
         class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
     else:
         class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
+    class_labels = dict(categories.items())

     for ann in anns:
         bounding_box = ann.get_bounding_box(dp.image_id)

deepdoctection/mapper/laylmstruct.py
CHANGED

@@ -127,7 +127,7 @@ def image_to_raw_layoutlm_features(
     all_boxes = []
     all_labels: list[int] = []

-    anns = dp.
+    anns = dp.get_annotation(category_names=LayoutType.WORD)

     word_id_to_segment_box = {}
     if segment_positions:
deepdoctection/mapper/match.py
CHANGED

@@ -23,6 +23,7 @@ from typing import Any, Literal, Optional, Sequence, Union

 import numpy as np
 from numpy.typing import NDArray
+from scipy.spatial import distance

 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import iou

@@ -164,3 +165,33 @@ def match_anns_by_intersection(
         return [], [], [], []

     return child_index, parent_index, child_anns, parent_anns
+
+
+def match_anns_by_distance(
+    dp: Image,
+    parent_ann_category_names: Union[TypeOrStr, Sequence[TypeOrStr]],
+    child_ann_category_names: Union[TypeOrStr, Sequence[TypeOrStr]],
+    parent_ann_ids: Optional[Union[Sequence[str], str]] = None,
+    child_ann_ids: Optional[Union[str, Sequence[str]]] = None,
+) -> list[tuple[ImageAnnotation, ImageAnnotation]]:
+    """
+    Generates pairs of parent and child annotations by calculating the euclidean distance between the centers of the
+    parent and child bounding boxes. It will return the closest child for each parent. Note, that a child can be
+    assigned multiple times to different parents.
+
+    :param dp: image datapoint
+    :param parent_ann_category_names: single str or list of category names
+    :param child_ann_category_names: single str or list of category names
+    :param parent_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other parent candi-
+                           dates which are not in the list.
+    :param child_ann_ids: Additional filter condition. If some ids are selected, it will ignore all other children
+                          candidates which are not in the list.
+    :return:
+    """
+
+    parent_anns = dp.get_annotation(annotation_ids=parent_ann_ids, category_names=parent_ann_category_names)
+    child_anns = dp.get_annotation(annotation_ids=child_ann_ids, category_names=child_ann_category_names)
+    child_centers = [block.get_bounding_box(dp.image_id).center for block in child_anns]
+    parent_centers = [block.get_bounding_box(dp.image_id).center for block in parent_anns]
+    child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)
+    return [(parent_anns[i], child_anns[j]) for i, j in enumerate(child_indices)]
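The core of match_anns_by_distance is a plain nearest-neighbour lookup over box centers. A standalone illustration with numpy and scipy only (no deepdoctection objects involved); it also shows why the same child can end up attached to several parents.

import numpy as np
from scipy.spatial import distance

parent_centers = np.array([[100.0, 100.0], [400.0, 120.0]])                 # e.g. figure centers
child_centers = np.array([[110.0, 180.0], [390.0, 200.0], [50.0, 600.0]])   # e.g. caption centers

# cdist builds the (num_parents, num_children) matrix of pairwise euclidean distances;
# argmin(axis=1) picks the closest child for every parent independently, so the same
# child index may be chosen by more than one parent.
child_indices = distance.cdist(parent_centers, child_centers).argmin(axis=1)

print([(parent, int(child)) for parent, child in enumerate(child_indices)])  # [(0, 0), (1, 1)]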
deepdoctection/mapper/misc.py
CHANGED

@@ -145,7 +145,7 @@ def image_ann_to_image(dp: Image, category_names: Union[str, list[str]], crop_image
     :return: Image
     """

-    img_anns = dp.
+    img_anns = dp.get_annotation(category_names=category_names)
     for ann in img_anns:
         dp.image_ann_to_image(annotation_id=ann.annotation_id, crop_image=crop_image)


deepdoctection/mapper/prodigystruct.py
CHANGED

@@ -163,7 +163,7 @@ def image_to_prodigy(dp: Image, category_names: Optional[Sequence[ObjectTypes]]
     output["image_id"] = dp.image_id

     spans = []
-    for ann in dp.
+    for ann in dp.get_annotation(category_names=category_names):
         bounding_box = ann.get_bounding_box(dp.image_id)
         if not bounding_box.absolute_coords:
             bounding_box = bounding_box.transform(dp.width, dp.height, absolute_coords=True)
deepdoctection/pipe/anngen.py
CHANGED

@@ -272,6 +272,33 @@ class DatapointManager:
             return None
         return cont_ann.annotation_id

+    def set_relationship_annotation(
+        self, relationship_name: ObjectTypes, target_annotation_id: str, annotation_id: str
+    ) -> Optional[str]:
+        """
+        Create a relationship annotation and dump it to the target annotation.
+
+        :param relationship_name: The relationship key
+        :param target_annotation_id: Annotation_id of the parent `ImageAnnotation`
+        :param annotation_id: The annotation_id to dump the relationship to
+
+        :return: Annotation_id of the parent `ImageAnnotation` for references if the dumpy has been successful
+        """
+        self.assert_datapoint_passed()
+        with MappingContextManager(
+            dp_name=self.datapoint.file_name,
+            filter_level="annotation",
+            relationship_annotation={
+                "relationship_name": relationship_name.value,
+                "target_annotation_id": target_annotation_id,
+                "annotation_id": annotation_id,
+            },
+        ) as annotation_context:
+            self._cache_anns[target_annotation_id].dump_relationship(relationship_name, annotation_id)
+        if annotation_context.context_error:
+            return None
+        return target_annotation_id
+
     def set_summary_annotation(
         self,
         summary_key: ObjectTypes,
deepdoctection/pipe/base.py
CHANGED

@@ -163,6 +163,29 @@ class PipelineComponent(ABC):
             return True
         return False

+    def _undo(self, dp: Image) -> Image:
+        """
+        Undo the processing of the pipeline component. It will remove `ImageAnnotation`, `CategoryAnnotation` and
+        `ContainerAnnotation` with the service_id of the pipeline component.
+        """
+        if self.timer_on:
+            with timed_operation(self.__class__.__name__):
+                self.dp_manager.datapoint = dp
+                dp.remove(service_ids=self.service_id)
+        else:
+            self.dp_manager.datapoint = dp
+            dp.remove(service_ids=self.service_id)
+        return self.dp_manager.datapoint
+
+    def undo(self, df: DataFlow) -> DataFlow:
+        """
+        Mapping a datapoint via `_undo` within a dataflow pipeline
+
+        :param df: An input dataflow of Images
+        :return: A output dataflow of Images
+        """
+        return MapData(df, self._undo)
+

 class Pipeline(ABC):
     """
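Conceptually, _undo strips everything a component wrote by filtering on its service_id (the real code delegates to Image.remove(service_ids=...)). A toy, self-contained sketch of that filtering idea; the Ann dataclass and the service names below are hypothetical stand-ins for the library's annotation objects.

from dataclasses import dataclass

@dataclass
class Ann:
    annotation_id: str
    service_id: str  # records which pipeline component produced the annotation

anns = [Ann("a1", "layout"), Ann("a2", "matching"), Ann("a3", "layout")]

def undo(annotations, service_id):
    """Keep only annotations that were not produced by the given service."""
    return [a for a in annotations if a.service_id != service_id]

print([a.annotation_id for a in undo(anns, "layout")])  # ['a2']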
deepdoctection/pipe/common.py
CHANGED

@@ -29,8 +29,7 @@ import numpy as np
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS, Page
-from ..mapper.
-from ..mapper.match import match_anns_by_intersection
+from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import MetaAnnotation, PipelineComponent

@@ -76,21 +75,23 @@ class ImageCroppingService(PipelineComponent):
         pass


-
-class MatchingService(PipelineComponent):
+class IntersectionMatcher:
     """
-    Objects of two object classes can be assigned to one another by determining their pairwise
-    a limit, a relation is created between them.
-    The parent object class (based on its category) and the child object class are defined for the service.
-    relation is created in the parent class if the conditions are met.
+    Objects of two object classes can be assigned to one another by determining their pairwise intersection. If this is
+    above a limit, a relation is created between them.
+    The parent object class (based on its category) and the child object class are defined for the service.

     Either `iou` (intersection-over-union) or `ioa` (intersection-over-area) can be selected as the matching rule.

         # the following will assign word annotations to text and title annotation, provided that their ioa-threshold
         # is above 0.7. words below that threshold will not be assigned.

-
-
+        matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
+
+        match_service = MatchingService(parent_categories=["text","title"],
+                                        child_categories="word",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.CHILD)

         # Assigning means that text and title annotation will receive a relationship called "CHILD" which is a list
         of annotation ids of mapped words.

@@ -98,16 +99,12 @@ class MatchingService(PipelineComponent):

     def __init__(
         self,
-        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
-        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
         max_parent_only: bool = False,
     ) -> None:
         """
-        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
-        :param child_categories: list of categories to be used for a child class.
         :param matching_rule: "iou" or "ioa"
         :param threshold: iou/ioa threshold. Value between [0,1]
         :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using

@@ -115,7 +112,105 @@ class MatchingService(PipelineComponent):
               that intersections with more cells will likely decrease the ioa value. By
               multiplying the ioa with the number of all intersection for each child this
               value calibrate the ioa.
-        :param max_parent_only: Will assign to each child at most one parent with maximum ioa
+        :param max_parent_only: Will assign to each child at most one parent with maximum ioa"""
+
+        if matching_rule not in ("iou", "ioa"):
+            raise ValueError("segment rule must be either iou or ioa")
+        self.matching_rule = matching_rule
+        self.threshold = threshold
+        self.use_weighted_intersections = use_weighted_intersections
+        self.max_parent_only = max_parent_only
+
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
+        """
+        The matching algorithm
+
+        :param dp: datapoint image
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
+        """
+        child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
+            dp,
+            parent_ann_category_names=parent_categories,
+            child_ann_category_names=child_categories,
+            matching_rule=self.matching_rule,
+            threshold=self.threshold,
+            use_weighted_intersections=self.use_weighted_intersections,
+            max_parent_only=self.max_parent_only,
+        )
+
+        matched_child_anns = np.take(child_anns, child_index)  # type: ignore
+        matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
+
+        all_parent_child_relations = []
+        for idx, parent in enumerate(matched_parent_anns):
+            all_parent_child_relations.append((parent.annotation_id, matched_child_anns[idx].annotation_id))
+
+        return all_parent_child_relations
+
+
+class NeighbourMatcher:
+    """
+    Objects of two object classes can be assigned to one another by determining their pairwise distance.
+
+        # the following will assign caption annotations to figure annotation
+
+        matcher = NeighbourMatcher()
+
+        match_service = MatchingService(parent_categories=["figure"],
+                                        child_categories="caption",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.LAYOUT_LINK)
+
+    """
+
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
+        """
+        The matching algorithm
+
+        :param dp: datapoint image
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
+        """
+
+        return [
+            (pair[0].annotation_id, pair[1].annotation_id)
+            for pair in match_anns_by_distance(dp, parent_categories, child_categories)
+        ]
+
+
+@pipeline_component_registry.register("MatchingService")
+class MatchingService(PipelineComponent):
+    """
+    A service to match annotations of two categories by intersection or distance. The matched annotations will be
+    assigned a relationship. The parent category will receive a relationship to the child category.
+    """
+
+    def __init__(
+        self,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        matcher: Union[IntersectionMatcher, NeighbourMatcher],
+        relationship_key: Relationships,
+    ) -> None:
+        """
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
         """
         self.parent_categories = (
             (get_type(parent_categories),)

@@ -127,13 +222,8 @@ class MatchingService(PipelineComponent):
             if isinstance(child_categories, str)
             else (tuple(get_type(category_name) for category_name in child_categories))
         )
-
-
-
-        self.matching_rule = matching_rule
-        self.threshold = threshold
-        self.use_weighted_intersections = use_weighted_intersections
-        self.max_parent_only = max_parent_only
+        self.matcher = matcher
+        self.relationship_key = relationship_key
         super().__init__("matching")

     def serve(self, dp: Image) -> None:

@@ -143,24 +233,14 @@ class MatchingService(PipelineComponent):

         :param dp: datapoint image
         """
-        child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
-            dp,
-            parent_ann_category_names=self.parent_categories,
-            child_ann_category_names=self.child_categories,
-            matching_rule=self.matching_rule,
-            threshold=self.threshold,
-            use_weighted_intersections=self.use_weighted_intersections,
-            max_parent_only=self.max_parent_only,
-        )

-
-
-
-
-            parent.dump_relationship(Relationships.CHILD, matched_child_anns[idx].annotation_id)
+        matched_pairs = self.matcher.match(dp, self.parent_categories, self.child_categories)
+
+        for pair in matched_pairs:
+            self.dp_manager.set_relationship_annotation(self.relationship_key, pair[0], pair[1])

     def clone(self) -> PipelineComponent:
-        return self.__class__(self.parent_categories, self.child_categories, self.
+        return self.__class__(self.parent_categories, self.child_categories, self.matcher, self.relationship_key)

     def get_meta_annotation(self) -> MetaAnnotation:
         return MetaAnnotation(

@@ -215,7 +295,12 @@ class PageParsingService(PipelineComponent):
         :param dp: Image
         :return: Page
         """
-        return Page.from_image(
+        return Page.from_image(
+            dp,
+            text_container=self.text_container,
+            floating_text_block_categories=self.floating_text_block_categories,
+            include_residual_text_container=self.include_residual_text_container,
+        )

     def _init_sanity_checks(self) -> None:
         assert self.text_container in (
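Putting the refactor together: the matching logic now lives in a matcher object that is handed to MatchingService along with the relationship to write. The configuration below is assembled from the two docstring examples in this diff; it assumes deepdoctection 0.34 is installed and imports the classes from the module paths shown above.

from deepdoctection.pipe.common import IntersectionMatcher, MatchingService, NeighbourMatcher
from deepdoctection.utils.settings import Relationships

# words are assigned to text/title blocks when their ioa is above 0.7
word_matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
word_service = MatchingService(
    parent_categories=["text", "title"],
    child_categories="word",
    matcher=word_matcher,
    relationship_key=Relationships.CHILD,
)

# captions are linked to their nearest figure by center distance
caption_service = MatchingService(
    parent_categories=["figure"],
    child_categories="caption",
    matcher=NeighbourMatcher(),
    relationship_key=Relationships.LAYOUT_LINK,
)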
deepdoctection/pipe/segment.py
CHANGED

@@ -372,7 +372,7 @@ def stretch_items(
     :param remove_iou_threshold_cols: iou threshold for removing overlapping columns
     :return: An Image
     """
-    table_anns = dp.
+    table_anns = dp.get_annotation(category_names=table_name)

     for table in table_anns:
         dp = stretch_item_per_table(dp, table, row_name, col_name, remove_iou_threshold_rows, remove_iou_threshold_cols)

deepdoctection/pipe/sub_layout.py
CHANGED

@@ -190,7 +190,7 @@ class SubImageLayoutService(PipelineComponent):
            - Optionally invoke the DetectResultGenerator
            - Generate ImageAnnotations and dump to parent image and sub image.
         """
-        sub_image_anns = dp.
+        sub_image_anns = dp.get_annotation(category_names=self.sub_image_name)
         for sub_image_ann in sub_image_anns:
             np_image = self.prepare_np_image(sub_image_ann)
             detect_result_list = self.predictor.predict(np_image)
deepdoctection/utils/env_info.py
CHANGED

@@ -176,7 +176,7 @@ def collect_installed_dependencies(data: KeyValEnvInfos) -> KeyValEnvInfos:
         data.append(("Pycocotools", "None"))

     if scipy_available():
-        import scipy
+        import scipy

         data.append(("Scipy", scipy.__version__))
     else:
deepdoctection/utils/fs.py
CHANGED

@@ -25,6 +25,7 @@ import os
 from base64 import b64encode
 from io import BytesIO
 from pathlib import Path
+from shutil import copyfile
 from typing import Callable, Literal, Optional, Protocol, Union, overload
 from urllib.request import urlretrieve

@@ -50,6 +51,7 @@ __all__ = [
     "get_configs_dir_path",
     "get_weights_dir_path",
     "get_dataset_dir_path",
+    "maybe_copy_config_to_cache",
 ]


@@ -254,34 +256,55 @@ def load_json(path_ann: PathLikeOrStr) -> JsonDict:
     return json_dict


-def get_package_path() ->
+def get_package_path() -> Path:
     """
     :return: full base path of this package
     """
     return PATH


-def get_weights_dir_path() ->
+def get_weights_dir_path() -> Path:
     """
     :return: full base path to the model dir
     """
     return MODEL_DIR


-def get_configs_dir_path() ->
+def get_configs_dir_path() -> Path:
     """
     :return: full base path to the configs dir
     """
     return CONFIGS


-def get_dataset_dir_path() ->
+def get_dataset_dir_path() -> Path:
     """
     :return: full base path to the dataset dir
     """
     return DATASET_DIR


+def maybe_copy_config_to_cache(
+    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
+) -> str:
+    """
+    Initial copying of various files
+    :param package_path: base path to directory of source file `file_name`
+    :param configs_dir_path: base path to target directory
+    :param file_name: file to copy
+    :param force_copy: If file is already in target directory, will re-copy the file
+
+    :return: path to the copied file_name
+    """
+
+    absolute_path_source = os.path.join(package_path, file_name)
+    absolute_path = os.path.join(configs_dir_path, os.path.join(os.path.split(file_name)[1]))
+    mkdir_p(os.path.split(absolute_path)[0])
+    if not os.path.isfile(absolute_path) or force_copy:
+        copyfile(absolute_path_source, absolute_path)
+    return absolute_path
+
+
 @deprecated("Use pathlib operations instead", "2022-06-08")
 def sub_path(anchor_dir: PathLikeOrStr, *paths: PathLikeOrStr) -> PathLikeOrStr:
     """
|
|
|
107
107
|
)
|
|
108
108
|
sys.exit()
|
|
109
109
|
|
|
110
|
-
|
|
111
|
-
return file_reader
|
|
110
|
+
return PdfReader(os.fspath(path))
|
|
112
111
|
|
|
113
112
|
|
|
114
113
|
def get_pdf_file_writer() -> PdfWriter:
|
|
@@ -125,12 +124,24 @@ class PDFStreamer:
|
|
|
125
124
|
|
|
126
125
|
**Example:**
|
|
127
126
|
|
|
128
|
-
|
|
127
|
+
# Building a Dataflow with a PDFStreamer
|
|
128
|
+
df = dataflow.DataFromIterable(PDFStreamer(path=path))
|
|
129
129
|
df.reset_state()
|
|
130
130
|
|
|
131
131
|
for page in df:
|
|
132
132
|
... # do whatever you like
|
|
133
133
|
|
|
134
|
+
# Something else you can do:
|
|
135
|
+
streamer = PDFStreamer(path=path)
|
|
136
|
+
pages = len(streamer) # get the number of pages
|
|
137
|
+
random_int = random.sample(range(0, pages), 2) # select some pages
|
|
138
|
+
for ran in random_int:
|
|
139
|
+
pdf_bytes = streamer[ran] # get the page bytes directly
|
|
140
|
+
|
|
141
|
+
streamer.close() # Do not forget to close the streamer, otherwise the file will never be closed and might
|
|
142
|
+
# cause memory leaks if you open many files.
|
|
143
|
+
|
|
144
|
+
|
|
134
145
|
"""
|
|
135
146
|
|
|
136
147
|
def __init__(self, path: PathLikeOrStr) -> None:
|
|
@@ -150,6 +161,20 @@ class PDFStreamer:
|
|
|
150
161
|
writer.add_page(self.file_reader.pages[k])
|
|
151
162
|
writer.write(buffer)
|
|
152
163
|
yield buffer.getvalue(), k
|
|
164
|
+
self.file_reader.close()
|
|
165
|
+
|
|
166
|
+
def __getitem__(self, index: int) -> bytes:
|
|
167
|
+
buffer = BytesIO()
|
|
168
|
+
writer = get_pdf_file_writer()
|
|
169
|
+
writer.add_page(self.file_reader.pages[index])
|
|
170
|
+
writer.write(buffer)
|
|
171
|
+
return buffer.getvalue()
|
|
172
|
+
|
|
173
|
+
def close(self) -> None:
|
|
174
|
+
"""
|
|
175
|
+
Close the file reader
|
|
176
|
+
"""
|
|
177
|
+
self.file_reader.close()
|
|
153
178
|
|
|
154
179
|
|
|
155
180
|
# The following functions are modified versions from the Python poppler wrapper
|
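The docstring above already demonstrates random access and close(); a small variant of that example makes the clean-up robust by closing in a finally block. Here "sample.pdf" is a placeholder for any PDF on disk.

from deepdoctection.utils.pdf_utils import PDFStreamer

streamer = PDFStreamer(path="sample.pdf")  # placeholder path
try:
    print(len(streamer))       # number of pages
    first_page = streamer[0]   # new random access via __getitem__, returns the page as pdf bytes
    print(len(first_page))
finally:
    streamer.close()           # new explicit close(), releases the underlying file reader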
deepdoctection/utils/settings.py
CHANGED

@@ -101,6 +101,7 @@ class DocumentType(ObjectTypes):
     GOVERNMENT_TENDERS = "government_tenders"
     MANUALS = "manuals"
     PATENTS = "patents"
+    MARK = "mark"


 @object_types_registry.register("LayoutType")

@@ -130,6 +131,7 @@ class LayoutType(ObjectTypes):
     BACKGROUND = "background"
     PAGE_NUMBER = "page_number"
     KEY_VALUE_AREA = "key_value_area"
+    LIST_ITEM = "list_item"


 @object_types_registry.register("TableType")

@@ -221,6 +223,7 @@ class Relationships(ObjectTypes):
     CHILD = "child"
     READING_ORDER = "reading_order"
     SEMANTIC_ENTITY_LINK = "semantic_entity_link"
+    LAYOUT_LINK = "layout_link"


 @object_types_registry.register("Languages")

{deepdoctection-0.33.dist-info → deepdoctection-0.34.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.
+Version: 0.34
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer

@@ -29,6 +29,7 @@ Requires-Dist: Pillow >=10.0.0
 Requires-Dist: pypdf >=3.16.0
 Requires-Dist: pyyaml >=6.0.1
 Requires-Dist: pyzmq >=16
+Requires-Dist: scipy >=1.13.1
 Requires-Dist: termcolor >=1.1
 Requires-Dist: tabulate >=0.7.7
 Requires-Dist: tqdm ==4.64.0

@@ -74,6 +75,7 @@ Requires-Dist: Pillow >=10.0.0 ; extra == 'pt'
 Requires-Dist: pypdf >=3.16.0 ; extra == 'pt'
 Requires-Dist: pyyaml >=6.0.1 ; extra == 'pt'
 Requires-Dist: pyzmq >=16 ; extra == 'pt'
+Requires-Dist: scipy >=1.13.1 ; extra == 'pt'
 Requires-Dist: termcolor >=1.1 ; extra == 'pt'
 Requires-Dist: tabulate >=0.7.7 ; extra == 'pt'
 Requires-Dist: tqdm ==4.64.0 ; extra == 'pt'

@@ -105,6 +107,7 @@ Requires-Dist: Pillow >=10.0.0 ; extra == 'tf'
 Requires-Dist: pypdf >=3.16.0 ; extra == 'tf'
 Requires-Dist: pyyaml >=6.0.1 ; extra == 'tf'
 Requires-Dist: pyzmq >=16 ; extra == 'tf'
+Requires-Dist: scipy >=1.13.1 ; extra == 'tf'
 Requires-Dist: termcolor >=1.1 ; extra == 'tf'
 Requires-Dist: tabulate >=0.7.7 ; extra == 'tf'
 Requires-Dist: tqdm ==4.64.0 ; extra == 'tf'