deepdoctection-0.32-py3-none-any.whl → deepdoctection-0.34-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
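The most far-reaching change in this list is the removal of deepdoctection/utils/detection_types.py (+0 -68) in favour of the new deepdoctection/utils/types.py (+104 -0). The hunks below show the alias `Pathlike` resurfacing as `PathLikeOrStr`, while `QueueType` and `TqdmType` move over unchanged. A minimal migration sketch for downstream code, assuming no further renames:

    # deepdoctection 0.32
    # from deepdoctection.utils.detection_types import Pathlike, QueueType, TqdmType

    # deepdoctection 0.34: the module was replaced and Pathlike was renamed
    from deepdoctection.utils.types import PathLikeOrStr, QueueType, TqdmType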
deepdoctection/pipe/common.py
CHANGED
@@ -21,21 +21,18 @@ Module for common pipeline components
 from __future__ import annotations
 
 import os
-
-from
-from typing import List, Literal, Mapping, Optional, Sequence, Union
+from copy import deepcopy
+from typing import Literal, Mapping, Optional, Sequence, Union
 
 import numpy as np
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..datapoint.view import IMAGE_DEFAULTS, Page
-from ..mapper.
-from ..mapper.match import match_anns_by_intersection
+from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
 from ..mapper.misc import to_image
-from ..utils.detection_types import JsonDict
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
-from .base import PipelineComponent
+from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry
 
 if os.environ.get("DD_USE_TORCH"):
@@ -57,37 +54,44 @@ class ImageCroppingService(PipelineComponent):
         :param category_names: A single name or a list of category names to crop
         """
 
-
-        category_names
-
+        self.category_names = (
+            (category_names,)
+            if isinstance(category_names, str)
+            else tuple(get_type(category_name) for category_name in category_names)
+        )
         super().__init__("image_crop")
 
     def serve(self, dp: Image) -> None:
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) ->
+    def clone(self) -> ImageCroppingService:
         return self.__class__(self.category_names)
 
-    def get_meta_annotation(self) ->
-        return
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
 
+    def clear_predictor(self) -> None:
+        pass
 
-
-class
+
+class IntersectionMatcher:
     """
-    Objects of two object classes can be assigned to one another by determining their pairwise
-    a limit, a relation is created between them.
-    The parent object class (based on its category) and the child object class are defined for the service.
-    relation is created in the parent class if the conditions are met.
+    Objects of two object classes can be assigned to one another by determining their pairwise intersection. If this is
+    above a limit, a relation is created between them.
+    The parent object class (based on its category) and the child object class are defined for the service.
 
     Either `iou` (intersection-over-union) or `ioa` (intersection-over-area) can be selected as the matching rule.
 
        # the following will assign word annotations to text and title annotation, provided that their ioa-threshold
        # is above 0.7. words below that threshold will not be assigned.
 
-
-
+        matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)
+
+        match_service = MatchingService(parent_categories=["text","title"],
+                                        child_categories="word",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.CHILD)
 
        # Assigning means that text and title annotation will receive a relationship called "CHILD" which is a list
        of annotation ids of mapped words.
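Two interface changes recur through the rest of this diff: `get_meta_annotation` now returns a typed `MetaAnnotation` object instead of an untyped dict, and every component must implement `clear_predictor` (a no-op where there is no model to release). A sketch of a custom component under the new contract, assuming `MetaAnnotation` accepts exactly the keyword arguments shown in the hunk above; the component itself is hypothetical:

    from deepdoctection.datapoint.image import Image
    from deepdoctection.pipe.base import MetaAnnotation, PipelineComponent

    class NoOpService(PipelineComponent):
        """Hypothetical component illustrating the 0.34 interface."""

        def __init__(self) -> None:
            super().__init__("no_op")

        def serve(self, dp: Image) -> None:
            pass  # a real component would add or modify annotations here

        def clone(self) -> "NoOpService":
            return self.__class__()

        def get_meta_annotation(self) -> MetaAnnotation:
            # empty containers: this component generates no annotations
            return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())

        def clear_predictor(self) -> None:
            pass  # nothing to release; components wrapping a model would free it here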
@@ -95,16 +99,12 @@ class MatchingService(PipelineComponent):
 
     def __init__(
         self,
-        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
-        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
         max_parent_only: bool = False,
     ) -> None:
         """
-        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
-        :param child_categories: list of categories to be used for a child class.
         :param matching_rule: "iou" or "ioa"
         :param threshold: iou/ioa threshold. Value between [0,1]
         :param use_weighted_intersections: This is currently only implemented for matching_rule 'ioa'. Instead of using
@@ -112,64 +112,150 @@ class MatchingService(PipelineComponent):
                                            that intersections with more cells will likely decrease the ioa value. By
                                            multiplying the ioa with the number of all intersection for each child this
                                            value calibrate the ioa.
-        :param max_parent_only: Will assign to each child at most one parent with maximum ioa
-
-
-
-            if not isinstance(parent_categories, (list, set))
-            else [get_type(parent_category) for parent_category in parent_categories]
-        )
-        self.child_categories = (
-            [get_type(child_categories)]  # type: ignore
-            if not isinstance(child_categories, (list, set))
-            else [get_type(child_category) for child_category in child_categories]
-        )
-        assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
+        :param max_parent_only: Will assign to each child at most one parent with maximum ioa"""
+
+        if matching_rule not in ("iou", "ioa"):
+            raise ValueError("segment rule must be either iou or ioa")
         self.matching_rule = matching_rule
         self.threshold = threshold
         self.use_weighted_intersections = use_weighted_intersections
         self.max_parent_only = max_parent_only
-        super().__init__("matching")
 
-    def
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
         """
-
-        - generates child relationship at parent level
+        The matching algorithm
 
         :param dp: datapoint image
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
         """
         child_index, parent_index, child_anns, parent_anns = match_anns_by_intersection(
             dp,
-            parent_ann_category_names=
-            child_ann_category_names=
+            parent_ann_category_names=parent_categories,
+            child_ann_category_names=child_categories,
             matching_rule=self.matching_rule,
             threshold=self.threshold,
             use_weighted_intersections=self.use_weighted_intersections,
             max_parent_only=self.max_parent_only,
         )
 
-
-
-
-
-
+        matched_child_anns = np.take(child_anns, child_index)  # type: ignore
+        matched_parent_anns = np.take(parent_anns, parent_index)  # type: ignore
+
+        all_parent_child_relations = []
+        for idx, parent in enumerate(matched_parent_anns):
+            all_parent_child_relations.append((parent.annotation_id, matched_child_anns[idx].annotation_id))
+
+        return all_parent_child_relations
+
+
+class NeighbourMatcher:
+    """
+    Objects of two object classes can be assigned to one another by determining their pairwise distance.
+
+        # the following will assign caption annotations to figure annotation
+
+        matcher = NeighbourMatcher()
+
+        match_service = MatchingService(parent_categories=["figure"],
+                                        child_categories="caption",
+                                        matcher=matcher,
+                                        relationship_key=Relationships.LAYOUT_LINK)
+
+    """
+
+    def match(
+        self,
+        dp: Image,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+    ) -> list[tuple[str, str]]:
+        """
+        The matching algorithm
+
+        :param dp: datapoint image
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        :return: A list of tuples with parent and child annotation ids
+        """
+
+        return [
+            (pair[0].annotation_id, pair[1].annotation_id)
+            for pair in match_anns_by_distance(dp, parent_categories, child_categories)
+        ]
+
+
+@pipeline_component_registry.register("MatchingService")
+class MatchingService(PipelineComponent):
+    """
+    A service to match annotations of two categories by intersection or distance. The matched annotations will be
+    assigned a relationship. The parent category will receive a relationship to the child category.
+    """
+
+    def __init__(
+        self,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        matcher: Union[IntersectionMatcher, NeighbourMatcher],
+        relationship_key: Relationships,
+    ) -> None:
+        """
+        :param parent_categories: list of categories to be used a for parent class. Will generate a child-relationship
+        :param child_categories: list of categories to be used for a child class.
+
+        """
+        self.parent_categories = (
+            (get_type(parent_categories),)
+            if isinstance(parent_categories, str)
+            else tuple(get_type(category_name) for category_name in parent_categories)
+        )
+        self.child_categories = (
+            (get_type(child_categories),)
+            if isinstance(child_categories, str)
+            else (tuple(get_type(category_name) for category_name in child_categories))
+        )
+        self.matcher = matcher
+        self.relationship_key = relationship_key
+        super().__init__("matching")
+
+    def serve(self, dp: Image) -> None:
+        """
+        - generates pairwise match-score by intersection
+        - generates child relationship at parent level
+
+        :param dp: datapoint image
+        """
+
+        matched_pairs = self.matcher.match(dp, self.parent_categories, self.child_categories)
+
+        for pair in matched_pairs:
+            self.dp_manager.set_relationship_annotation(self.relationship_key, pair[0], pair[1])
 
     def clone(self) -> PipelineComponent:
-        return self.__class__(self.parent_categories, self.child_categories, self.
-
-    def get_meta_annotation(self) ->
-        return
-
-
-
-
-            ("summaries", []),
-        ]
+        return self.__class__(self.parent_categories, self.child_categories, self.matcher, self.relationship_key)
+
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(
+            image_annotations=(),
+            sub_categories={},
+            relationships={parent: {Relationships.CHILD} for parent in self.parent_categories},
+            summaries=(),
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("PageParsingService")
-class PageParsingService:
+class PageParsingService(PipelineComponent):
     """
     A "pseudo" pipeline component that can be added to a pipeline to convert `Image`s into `Page` formats. It allows a
     custom parsing depending on customizing options of other pipeline components.
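The net effect of this hunk: matching strategies are factored out of `MatchingService` into `IntersectionMatcher` and `NeighbourMatcher`, and the service now receives a strategy object plus a `relationship_key`. Putting the pieces from the docstrings above together (the category names are the documented examples, not defaults):

    from deepdoctection.pipe.common import IntersectionMatcher, MatchingService
    from deepdoctection.utils.settings import Relationships

    # relate boxes whose intersection-over-area exceeds 0.7
    matcher = IntersectionMatcher(matching_rule="ioa", threshold=0.7)

    # "text" and "title" parents receive a CHILD relationship listing matched "word" ids
    match_service = MatchingService(
        parent_categories=["text", "title"],
        child_categories="word",
        matcher=matcher,
        relationship_key=Relationships.CHILD,
    )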
@@ -188,14 +274,20 @@ class PageParsingService:
         """
         self.name = "page_parser"
         if isinstance(floating_text_block_categories, (str, ObjectTypes)):
-            floating_text_block_categories =
+            floating_text_block_categories = (get_type(floating_text_block_categories),)
         if floating_text_block_categories is None:
-            floating_text_block_categories =
+            floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
 
         self.text_container = get_type(text_container)
-        self.floating_text_block_categories =
+        self.floating_text_block_categories = tuple(
+            (get_type(text_block) for text_block in floating_text_block_categories)
+        )
         self.include_residual_text_container = include_residual_text_container
         self._init_sanity_checks()
+        super().__init__(self.name)
+
+    def serve(self, dp: Image) -> None:
+        raise NotImplementedError("PageParsingService is not meant to be used in serve method")
 
     def pass_datapoint(self, dp: Image) -> Page:
         """
@@ -203,29 +295,24 @@ class PageParsingService:
         :param dp: Image
         :return: Page
         """
-        return Page.from_image(
-
-
-
-
-
-        :param df: An input dataflow
-        :return: A output dataflow
-        """
-        return MapData(df, self.pass_datapoint)
+        return Page.from_image(
+            dp,
+            text_container=self.text_container,
+            floating_text_block_categories=self.floating_text_block_categories,
+            include_residual_text_container=self.include_residual_text_container,
+        )
 
     def _init_sanity_checks(self) -> None:
         assert self.text_container in (
-            LayoutType.
-            LayoutType.
-        ), f"text_container must be either {LayoutType.
+            LayoutType.WORD,
+            LayoutType.LINE,
+        ), f"text_container must be either {LayoutType.WORD} or {LayoutType.LINE}"
 
-
-    def get_meta_annotation() -> JsonDict:
+    def get_meta_annotation(self) -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
 
     def clone(self) -> PageParsingService:
         """clone"""
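`PageParsingService` keeps `pass_datapoint` as its real entry point; `serve` exists only to satisfy the `PipelineComponent` interface and raises. A standalone-use sketch, mirroring the default instance that `DoctectionPipe` builds further below; `dp` is assumed to be an `Image` produced by earlier components:

    from deepdoctection.datapoint.view import IMAGE_DEFAULTS
    from deepdoctection.pipe.common import PageParsingService

    page_parser = PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
    page = page_parser.pass_datapoint(dp)  # Image -> Page view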
@@ -235,6 +322,9 @@ class PageParsingService:
             self.include_residual_text_container,
         )
 
+    def clear_predictor(self) -> None:
+        pass
+
 
 @pipeline_component_registry.register("AnnotationNmsService")
 class AnnotationNmsService(PipelineComponent):
@@ -259,8 +349,8 @@ class AnnotationNmsService(PipelineComponent):
     def __init__(
         self,
         nms_pairs: Sequence[Sequence[TypeOrStr]],
-        thresholds: Union[float,
-        priority: Optional[
+        thresholds: Union[float, list[float]],
+        priority: Optional[list[Union[Optional[TypeOrStr]]]] = None,
     ):
         """
         :param nms_pairs: Groups of categories, either as string or by `ObjectType`.
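With the completed signature, `thresholds` may be a single float or one float per pair, and `priority` is optional. A configuration sketch with illustrative category names; the exact priority semantics (which category survives a suppressed overlap) follow the parameter docstring, not this example:

    from deepdoctection.pipe.common import AnnotationNmsService

    nms_service = AnnotationNmsService(
        nms_pairs=[["table", "figure"], ["title", "text"]],  # groups suppressed against each other
        thresholds=[0.05, 0.3],                              # one NMS threshold per group
        priority=["table", None],                            # optional per-group priority category
    )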
@@ -297,8 +387,11 @@ class AnnotationNmsService(PipelineComponent):
     def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
-    def get_meta_annotation(self) ->
-        return
+    def get_meta_annotation(self) -> MetaAnnotation:
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        pass
 
 
 @pipeline_component_registry.register("ImageParsingService")
@@ -333,8 +426,11 @@ class ImageParsingService:
         return self.__class__(self.dpi)
 
     @staticmethod
-    def get_meta_annotation() ->
+    def get_meta_annotation() -> MetaAnnotation:
         """
         meta annotation. We do not generate any new annotations here
         """
-        return
+        return MetaAnnotation(image_annotations=(), sub_categories={}, relationships={}, summaries=())
+
+    def clear_predictor(self) -> None:
+        """clear predictor. Will do nothing"""
deepdoctection/pipe/concurrency.py
CHANGED

@@ -24,16 +24,16 @@ import itertools
 import queue
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import ExitStack
-from typing import Callable,
+from typing import Callable, Optional, Sequence, Union
 
 import tqdm
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..utils.context import timed_operation
-from ..utils.detection_types import JsonDict, QueueType, TqdmType
 from ..utils.tqdm import get_tqdm
-from .
+from ..utils.types import QueueType, TqdmType
+from .base import MetaAnnotation, PipelineComponent
 from .common import ImageParsingService, PageParsingService
 from .registry import pipeline_component_registry
 
@@ -100,7 +100,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
     def __init__(
         self,
-        pipeline_components: Sequence[Union[PipelineComponent,
+        pipeline_components: Sequence[Union[PipelineComponent, ImageParsingService]],
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
         max_datapoints: Optional[int] = None,
@@ -123,7 +123,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
         self.timer_on = False
         super().__init__(f"multi_thread_{self.pipe_components[0].name}")
 
-    def put_task(self, df: Union[DataFlow,
+    def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
         Put a dataflow or a list of datapoints to the queue. Note, that the process will not start before `start`
         is called. If you do not know how many datapoints will be cached, use max_datapoint to ensure no oom.
@@ -133,7 +133,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
 
         self._put_datapoints_to_queue(df)
 
-    def start(self) ->
+    def start(self) -> list[Image]:
         """
         Creates a worker for each component and starts processing the data points of the queue. A list of the results
         is returned once all points in the queue have been processed.
@@ -165,7 +165,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
         tqdm_bar: Optional[TqdmType] = None,
         pre_proc_func: Optional[Callable[[Image], Image]] = None,
         post_proc_func: Optional[Callable[[Image], Image]] = None,
-    ) ->
+    ) -> list[Image]:
         outputs = []
 
         with ExitStack() as stack:
@@ -184,7 +184,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
                     tqdm_bar.update(1)
         return outputs
 
-    def _put_datapoints_to_queue(self, df: Union[DataFlow,
+    def _put_datapoints_to_queue(self, df: Union[DataFlow, list[Image]]) -> None:
         if isinstance(df, DataFlow):
             df.reset_state()
             for idx, dp in enumerate(df):
@@ -193,7 +193,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
                     break
                 self.input_queue.put(dp)
 
-    def pass_datapoints(self, dpts:
+    def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
         Putting the list of datapoints into a thread-save queue and start for each pipeline
         component a separate thread. It will return a list of datapoints where the order of appearance
@@ -225,5 +225,9 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
-    def get_meta_annotation(self) ->
+    def get_meta_annotation(self) -> MetaAnnotation:
         return self.pipe_components[0].get_meta_annotation()
+
+    def clear_predictor(self) -> None:
+        for pipe in self.pipe_components:
+            pipe.clear_predictor()
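With the `list[Image]` annotations in place, the intended flow of `MultiThreadPipelineComponent` is explicit: queue datapoints with `put_task`, then collect results from `start`. A sketch, assuming `component` is any cloneable `PipelineComponent` and `df` an input `DataFlow`:

    from deepdoctection.pipe.concurrency import MultiThreadPipelineComponent

    multi = MultiThreadPipelineComponent(
        pipeline_components=[component, component.clone()],  # one worker per component
        max_datapoints=1000,  # cap the queue to avoid running out of memory
    )
    multi.put_task(df)      # accepts a DataFlow or a list[Image]
    images = multi.start()  # blocks until the queue is drained, returns list[Image]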
deepdoctection/pipe/doctectionpipe.py
CHANGED

@@ -26,18 +26,18 @@ from typing import List, Mapping, Optional, Sequence, Tuple, Union
 from ..dataflow import DataFlow, MapData
 from ..dataflow.custom_serialize import SerializerFiles, SerializerPdfDoc
 from ..datapoint.image import Image
+from ..datapoint.view import IMAGE_DEFAULTS
 from ..mapper.maputils import curry
 from ..mapper.misc import to_image
-from ..utils.detection_types import Pathlike
 from ..utils.fs import maybe_path_or_pdf
 from ..utils.logger import LoggingRecord, logger
-from ..utils.
-from .base import Pipeline, PipelineComponent
+from ..utils.types import PathLikeOrStr
+from .base import Pipeline, PipelineComponent
 from .common import PageParsingService
 
 
 def _collect_from_kwargs(
-    **kwargs: Union[str, DataFlow, bool, int,
+    **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Optional[str], bool, int, str, DataFlow]:
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")
@@ -69,7 +69,7 @@ def _collect_from_kwargs(
 
 @curry
 def _proto_process(
-    dp: Union[str, Mapping[str, str]], path: Optional[
+    dp: Union[str, Mapping[str, str]], path: Optional[PathLikeOrStr], doc_path: Optional[PathLikeOrStr]
 ) -> Union[str, Mapping[str, str]]:
     if isinstance(dp, str):
         file_name = Path(dp).name
@@ -78,10 +78,14 @@ def _proto_process(
     else:
         file_name = dp["file_name"]
     if path is None:
-        path_tmp = doc_path
+        path_tmp = doc_path or ""
     else:
         path_tmp = path
-    logger.info(
+    logger.info(
+        LoggingRecord(
+            f"Processing {file_name}", {"path": os.fspath(path_tmp), "df": os.fspath(path_tmp), "file_name": file_name}
+        )
+    )
     return dp
 
 
@@ -90,7 +94,7 @@ def _to_image(dp: Union[str, Mapping[str, Union[str, bytes]]], dpi: Optional[int
     return to_image(dp, dpi)
 
 
-def _doc_to_dataflow(path:
+def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")
 
@@ -127,19 +131,18 @@ class DoctectionPipe(Pipeline):
 
     def __init__(
         self,
-        pipeline_component_list: List[
+        pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
-
-
-
-
-        assert all(
-            isinstance(element, (PipelineComponent, PredictorPipelineComponent)) for element in pipeline_component_list
+        self.page_parser = (
+            PageParsingService(text_container=IMAGE_DEFAULTS["text_container"])
+            if page_parsing_service is None
+            else page_parsing_service
         )
+
         super().__init__(pipeline_component_list)
 
-    def _entry(self, **kwargs: Union[str, DataFlow, bool, int,
+    def _entry(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
         path, file_type, shuffle, max_datapoints, doc_path, dataset_dataflow = _collect_from_kwargs(**kwargs)
 
         df: DataFlow
@@ -147,7 +150,7 @@ class DoctectionPipe(Pipeline):
         if isinstance(path, (str, Path)):
             if not isinstance(file_type, (str, list)):
                 raise TypeError(f"file_type must be of type string or list, but is of type {type(file_type)}")
-            df = DoctectionPipe.path_to_dataflow(path, file_type, shuffle=shuffle)
+            df = DoctectionPipe.path_to_dataflow(path=path, file_type=file_type, shuffle=shuffle)
         elif isinstance(doc_path, (str, Path)):
             df = DoctectionPipe.doc_to_dataflow(
                 path=doc_path, max_datapoints=int(max_datapoints) if max_datapoints is not None else None
|
|
|
164
167
|
|
|
165
168
|
@staticmethod
|
|
166
169
|
def path_to_dataflow(
|
|
167
|
-
path:
|
|
170
|
+
path: PathLikeOrStr,
|
|
168
171
|
file_type: Union[str, Sequence[str]],
|
|
169
172
|
max_datapoints: Optional[int] = None,
|
|
170
173
|
shuffle: bool = False,
|
|
@@ -179,12 +182,12 @@ class DoctectionPipe(Pipeline):
|
|
|
179
182
|
:return: dataflow
|
|
180
183
|
"""
|
|
181
184
|
if not os.path.isdir(path):
|
|
182
|
-
raise NotADirectoryError(f"{path} not a directory")
|
|
185
|
+
raise NotADirectoryError(f"{os.fspath(path)} not a directory")
|
|
183
186
|
df = SerializerFiles.load(path, file_type, max_datapoints, shuffle)
|
|
184
187
|
return df
|
|
185
188
|
|
|
186
189
|
@staticmethod
|
|
187
|
-
def doc_to_dataflow(path:
|
|
190
|
+
def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
|
|
188
191
|
"""
|
|
189
192
|
Processing method for documents
|
|
190
193
|
|
|
@@ -203,7 +206,7 @@ class DoctectionPipe(Pipeline):
|
|
|
203
206
|
"""
|
|
204
207
|
return self.page_parser.predict_dataflow(df)
|
|
205
208
|
|
|
206
|
-
def analyze(self, **kwargs: Union[str, DataFlow, bool, int,
|
|
209
|
+
def analyze(self, **kwargs: Union[str, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]) -> DataFlow:
|
|
207
210
|
"""
|
|
208
211
|
`kwargs key dataset_dataflow:` Transfer a dataflow of a dataset via its dataflow builder
|
|
209
212
|
|
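Taken together, the kwargs collected by `_collect_from_kwargs` (`path`, `doc_path`, `file_type`, `shuffle`, `max_datapoints`, `dataset_dataflow`) sketch the public entry point. A usage example with a hypothetical directory of scans; note the `reset_state` call, which the dataflow API requires before iteration:

    from pathlib import Path

    # pipe: a configured DoctectionPipe instance
    df = pipe.analyze(path=Path("/path/to/scans"), file_type=".png", max_datapoints=100)
    df.reset_state()

    for page in df:  # each datapoint is parsed into a Page by the page_parser
        ...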