deepdoctection 0.30-py3-none-any.whl → 0.31-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/dd.py +6 -5
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +33 -12
- deepdoctection/datapoint/box.py +1 -4
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +66 -29
- deepdoctection/datapoint/view.py +57 -25
- deepdoctection/datasets/adapter.py +1 -1
- deepdoctection/datasets/base.py +83 -10
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/layouttest.py +2 -7
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/eval.py +2 -2
- deepdoctection/eval/tp_eval_callback.py +5 -4
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +164 -64
- deepdoctection/extern/deskew.py +32 -7
- deepdoctection/extern/doctrocr.py +227 -39
- deepdoctection/extern/fastlang.py +45 -7
- deepdoctection/extern/hfdetr.py +90 -33
- deepdoctection/extern/hflayoutlm.py +109 -22
- deepdoctection/extern/pdftext.py +2 -1
- deepdoctection/extern/pt/ptutils.py +3 -2
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +2 -0
- deepdoctection/extern/tp/tpcompat.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +2 -7
- deepdoctection/extern/tpdetect.py +50 -23
- deepdoctection/mapper/d2struct.py +1 -1
- deepdoctection/mapper/hfstruct.py +1 -1
- deepdoctection/mapper/laylmstruct.py +1 -1
- deepdoctection/mapper/maputils.py +13 -2
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/cell.py +29 -8
- deepdoctection/pipe/common.py +12 -4
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +3 -2
- deepdoctection/pipe/lm.py +2 -2
- deepdoctection/pipe/refine.py +18 -10
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/d2_frcnn_train.py +15 -12
- deepdoctection/train/hf_detr_train.py +8 -6
- deepdoctection/train/hf_layoutlm_train.py +16 -11
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +55 -22
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +5 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +44 -2
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/METADATA +33 -58
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/RECORD +74 -73
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/WHEEL +1 -1
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.31.dist-info}/top_level.txt +0 -0
deepdoctection/extern/tpdetect.py
CHANGED

@@ -19,9 +19,10 @@
 TP Faster RCNN model as predictor for deepdoctection pipeline
 """
 
+from abc import ABC
 from copy import copy
 from pathlib import Path
-from typing import List, Mapping, Optional, Sequence, Union
+from typing import Dict, List, Mapping, Optional, Sequence, Union
 
 from ..utils.detection_types import ImageType, Requirement
 from ..utils.file_utils import get_tensorflow_requirement, get_tensorpack_requirement, tensorpack_available

@@ -36,7 +37,46 @@ if tensorpack_available():
     from .tp.tpfrcnn.predict import tp_predict_image
 
 
-class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
+class TPFrcnnDetectorMixin(ObjectDetector, ABC):
+    """Base class for TP FRCNN detector. This class only implements the basic wrapper functions"""
+
+    def __init__(self, categories: Mapping[str, TypeOrStr], filter_categories: Optional[Sequence[TypeOrStr]] = None):
+        self.categories = copy(categories)  # type: ignore
+        if filter_categories:
+            filter_categories = [get_type(cat) for cat in filter_categories]
+        self.filter_categories = filter_categories
+        self._tp_categories = self._map_to_tp_categories(categories)
+
+    def _map_category_names(self, detection_results: List[DetectionResult]) -> List[DetectionResult]:
+        """
+        Populating category names to detection results
+
+        :param detection_results: list of detection results
+        :return: List of detection results with attribute class_name populated
+        """
+        filtered_detection_result: List[DetectionResult] = []
+        for result in detection_results:
+            result.class_name = self._tp_categories[str(result.class_id)]
+            if self.filter_categories:
+                if result.class_name not in self.filter_categories:
+                    filtered_detection_result.append(result)
+            else:
+                filtered_detection_result.append(result)
+        return filtered_detection_result
+
+    @staticmethod
+    def _map_to_tp_categories(categories: Mapping[str, TypeOrStr]) -> Dict[str, ObjectTypes]:
+        categories = {str(key): get_type(categories[val]) for key, val in enumerate(categories, 1)}
+        categories["0"] = get_type("background")
+        return categories  # type: ignore
+
+    @staticmethod
+    def get_name(path_weights: str, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Tensorpack_{architecture}" + "_".join(Path(path_weights).parts[-2:])
+
+
+class TPFrcnnDetector(TensorpackPredictor, TPFrcnnDetectorMixin):
     """
     Tensorpack Faster-RCNN implementation with FPN and optional Cascade-RCNN. The backbones Resnet-50, Resnet-101 and
     their Resnext counterparts are also available. Normalization options (group normalization, synchronized batch

@@ -87,19 +127,23 @@ class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
        :param filter_categories: The model might return objects that are not supposed to be predicted and that should
                                  be filtered. Pass a list of category names that must not be returned
        """
-        self.name = "_".join(Path(path_weights).parts[-3:])
         self.path_yaml = path_yaml
+
         self.categories = copy(categories)  # type: ignore
         self.config_overwrite = config_overwrite
         if filter_categories:
             filter_categories = [get_type(cat) for cat in filter_categories]
         self.filter_categories = filter_categories
-        model = TPFrcnnDetector.
-
+        model = TPFrcnnDetector.get_wrapped_model(path_yaml, self.categories, config_overwrite)
+        TensorpackPredictor.__init__(self, model, path_weights, ignore_mismatch)
+        TPFrcnnDetectorMixin.__init__(self, categories, filter_categories)
+
+        self.name = self.get_name(path_weights, self._model.cfg.TAG)
+        self.model_id = self.get_model_id()
         assert self._number_gpus > 0, "Model only support inference with GPU"
 
     @staticmethod
-    def
+    def get_wrapped_model(
         path_yaml: str, categories: Mapping[str, ObjectTypes], config_overwrite: Union[List[str], None]
     ) -> ResNetFPNModel:
         """

@@ -138,23 +182,6 @@ class TPFrcnnDetector(TensorpackPredictor, ObjectDetector):
         )
         return self._map_category_names(detection_results)
 
-    def _map_category_names(self, detection_results: List[DetectionResult]) -> List[DetectionResult]:
-        """
-        Populating category names to detection results
-
-        :param detection_results: list of detection results
-        :return: List of detection results with attribute class_name populated
-        """
-        filtered_detection_result: List[DetectionResult] = []
-        for result in detection_results:
-            result.class_name = self._model.cfg.DATA.CLASS_DICT[str(result.class_id)]
-            if self.filter_categories:
-                if result.class_name not in self.filter_categories:
-                    filtered_detection_result.append(result)
-            else:
-                filtered_detection_result.append(result)
-        return filtered_detection_result
-
     @classmethod
     def get_requirements(cls) -> List[Requirement]:
         return [get_tensorflow_requirement(), get_tensorpack_requirement()]
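For orientation, the new TPFrcnnDetectorMixin centralises the mapping from Tensorpack's numeric class ids back to category names and the optional filtering of unwanted categories. Below is a minimal, self-contained sketch of that id-to-name logic, with plain strings standing in for deepdoctection's ObjectTypes; the category names are illustrative only.

    # Minimal sketch of the id-to-name mapping the new mixin performs.
    # Plain strings stand in for deepdoctection's ObjectTypes enums.
    from typing import Dict, List, Mapping, Optional, Sequence


    def map_to_tp_categories(categories: Mapping[str, str]) -> Dict[str, str]:
        # Re-key the categories by their 1-based position and reserve "0" for background,
        # mirroring how Tensorpack indexes classes internally.
        tp_categories = {str(idx): categories[key] for idx, key in enumerate(categories, 1)}
        tp_categories["0"] = "background"
        return tp_categories


    def map_category_names(
        class_ids: Sequence[int],
        tp_categories: Mapping[str, str],
        filter_categories: Optional[Sequence[str]] = None,
    ) -> List[str]:
        # Resolve every predicted class id to its category name and drop filtered ones.
        names = [tp_categories[str(class_id)] for class_id in class_ids]
        if filter_categories:
            names = [name for name in names if name not in filter_categories]
        return names


    categories = {"1": "text", "2": "title", "3": "table"}
    tp_categories = map_to_tp_categories(categories)
    print(map_category_names([1, 3, 2], tp_categories, filter_categories=["title"]))
    # ['text', 'table']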
deepdoctection/mapper/laylmstruct.py
CHANGED

@@ -146,7 +146,7 @@ def image_to_raw_layoutlm_features(
             raise TypeError(f"char_cat must be of type ContainerAnnotation but is of type {type(char_cat)}")
         word = char_cat.value
         if not isinstance(word, str):
-            raise
+            raise TypeError(f"word must be of type str but is of type {type(word)}")
         all_words.append(word)
 
         box = ann.get_bounding_box(dp.image_id)
deepdoctection/mapper/maputils.py
CHANGED

@@ -28,8 +28,8 @@ import numpy as np
 from tabulate import tabulate
 from termcolor import colored
 
-from ..datapoint.box import BoundingBoxError
 from ..utils.detection_types import DP, BaseExceptionType, S, T
+from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import ObjectTypes
 

@@ -72,7 +72,18 @@ class MappingContextManager:
        """
        if (
            exc_type
-            in (
+            in (
+                KeyError,
+                ValueError,
+                IndexError,
+                AssertionError,
+                TypeError,
+                FileNotFoundError,
+                BoundingBoxError,
+                AnnotationError,
+                ImageError,
+                UUIDError,
+            )
            and exc_tb is not None
        ):
            frame_summary = traceback.extract_tb(exc_tb)[0]
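The widened exception tuple in MappingContextManager means more failure modes (including the new AnnotationError, ImageError and UUIDError from utils.error) are logged and swallowed instead of aborting the dataflow. A hedged, self-contained sketch of the underlying pattern, with a stand-in error class rather than the real deepdoctection exceptions and implementation:

    # Sketch: a context manager that swallows a fixed set of exception types by
    # returning True from __exit__; the caller checks context_error afterwards.
    import traceback
    from types import TracebackType
    from typing import Optional, Type


    class BoundingBoxError(ValueError):
        """Stand-in for deepdoctection.utils.error.BoundingBoxError."""


    class SuppressingContext:
        SUPPRESSED = (KeyError, ValueError, IndexError, AssertionError, TypeError, FileNotFoundError, BoundingBoxError)

        def __init__(self, datapoint_id: str) -> None:
            self.datapoint_id = datapoint_id
            self.context_error = False

        def __enter__(self) -> "SuppressingContext":
            return self

        def __exit__(
            self,
            exc_type: Optional[Type[BaseException]],
            exc_val: Optional[BaseException],
            exc_tb: Optional[TracebackType],
        ) -> bool:
            if exc_type in self.SUPPRESSED and exc_tb is not None:
                frame = traceback.extract_tb(exc_tb)[0]
                print(f"dropping datapoint {self.datapoint_id}: {exc_type.__name__} in {frame.name}")
                self.context_error = True
                return True  # suppress the exception; the caller inspects context_error
            return False


    with SuppressingContext("sample-0001") as ctx:
        raise BoundingBoxError("ulx must be smaller than lrx")
    print(ctx.context_error)  # True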
deepdoctection/mapper/prodigystruct.py
CHANGED

@@ -128,7 +128,7 @@ def prodigy_to_image(
         else:
             label = span["label"]
             if not isinstance(label, str):
-                raise
+                raise TypeError("label must be a string")
 
         annotation = ImageAnnotation(
             category_name=label,
deepdoctection/mapper/pubstruct.py
CHANGED

@@ -75,12 +75,14 @@ def _cell_token(html: Sequence[str]) -> List[List[int]]:
 def _item_spans(html: Sequence[str], index_cells: Sequence[Sequence[int]], item: str) -> List[List[int]]:
     item_spans = [
         [
-
-
-
-
-
-
+            (
+                int(html[index_cell - 1].replace(item + "=", "").replace('"', ""))
+                if (item in html[index_cell - 1] and html[index_cell] == ">")
+                else (
+                    int(html[index_cell - 2].replace(item + "=", "").replace('"', ""))
+                    if (item in html[index_cell - 2] and html[index_cell] == ">")
+                    else 1
+                )
             )
             for index_cell in index_cell_per_row
         ]

@@ -210,9 +212,7 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
     items = image.get_annotation(category_names=TableType.item)
     item_type_anns = [ann for ann in items if ann.get_sub_category(TableType.item).category_name == item_type]
     item_type_anns.sort(
-        key=lambda x: x.bounding_box.cx  # type: ignore
-        if item_type == LayoutType.column
-        else x.bounding_box.cy  # type: ignore
+        key=lambda x: (x.bounding_box.cx if item_type == LayoutType.column else x.bounding_box.cy)  # type: ignore
     )
     if table.bounding_box:
         tmp_item_xy = table.bounding_box.uly + 1.0 if item_type == LayoutType.row else table.bounding_box.ulx + 1.0

@@ -389,7 +389,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
     with MappingContextManager(str(idx)) as mapping_context:
         max_rs, max_cs = 0, 0
         if idx is None:
-            raise
+            raise TypeError("imgid is None but must be a string")
 
         image = Image(file_name=os.path.split(dp["filename"])[1], location=dp["filename"], external_id=idx)
 
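The _item_spans rewrite is purely cosmetic (the nested conditional is wrapped in parentheses), so the parsing behaviour stays the same: look one or two tokens back from a cell's closing ">" for a rowspan/colspan attribute and fall back to a span of 1. A rough sketch of that lookup, using an invented token layout rather than PubTabNet's exact format:

    # Hedged sketch of reading a colspan/rowspan attribute out of a token stream.
    from typing import List, Sequence


    def span_value(tokens: Sequence[str], close_idx: int, item: str) -> int:
        # Check the one or two tokens before the closing ">" for e.g. colspan="2".
        for offset in (1, 2):
            candidate = tokens[close_idx - offset]
            if item in candidate and tokens[close_idx] == ">":
                return int(candidate.replace(item + "=", "").replace('"', ""))
        return 1


    tokens: List[str] = ["<td", 'colspan="2"', ">", "cell", "</td>", "<td", ">", "cell", "</td>"]
    print(span_value(tokens, 2, "colspan"))  # 2
    print(span_value(tokens, 6, "colspan"))  # 1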
deepdoctection/mapper/tpstruct.py
CHANGED

@@ -67,7 +67,7 @@ def image_to_tp_frcnn_training(
         all_categories.append(ann.category_id)
 
         if add_mask:
-            raise NotImplementedError
+            raise NotImplementedError()
 
     output["gt_boxes"] = np.asarray(all_boxes, dtype="float32")
     output["gt_labels"] = np.asarray(all_categories, dtype="int32")
deepdoctection/pipe/anngen.py
CHANGED
@@ -42,11 +42,14 @@ class DatapointManager:
     The manager is part of each `PipelineComponent`.
     """
 
-    def __init__(self) -> None:
+    def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
         self._datapoint: Optional[Image] = None
         self._cache_anns: Dict[str, ImageAnnotation] = {}
         self.datapoint_is_passed: bool = False
         self.category_id_mapping: Optional[Mapping[int, int]] = None
+        self.service_id = service_id
+        self.model_id = model_id
+        self.session_id: Optional[str] = None
 
     @property
     def datapoint(self) -> Image:

@@ -55,7 +58,7 @@ class DatapointManager:
        """
        if self._datapoint is not None:
            return self._datapoint
-        raise ValueError("
+        raise ValueError("No datapoint passed")
 
     @datapoint.setter
     def datapoint(self, dp: Image) -> None:

@@ -154,6 +157,9 @@ class DatapointManager:
                bounding_box=box,
                category_id=str(detect_result.class_id),
                score=detect_result.score,
+                service_id=self.service_id,
+                model_id=self.model_id,
+                session_id=self.session_id,
            )
            if to_annotation_id is not None:
                parent_ann = self._cache_anns[to_annotation_id]

@@ -208,7 +214,14 @@ class DatapointManager:
                "annotation_id": annotation_id,
            },
        ) as annotation_context:
-            cat_ann = CategoryAnnotation(
+            cat_ann = CategoryAnnotation(
+                category_name=category_name,
+                category_id=str(category_id),
+                score=score,
+                service_id=self.service_id,
+                model_id=self.model_id,
+                session_id=self.session_id,
+            )
            self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
        if annotation_context.context_error:
            return None

@@ -246,7 +259,13 @@ class DatapointManager:
            },
        ) as annotation_context:
            cont_ann = ContainerAnnotation(
-                category_name=category_name,
+                category_name=category_name,
+                category_id=str(category_id),
+                value=value,
+                score=score,
+                service_id=self.service_id,
+                model_id=self.model_id,
+                session_id=self.session_id,
            )
            self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
        if annotation_context.context_error:

@@ -257,7 +276,7 @@ class DatapointManager:
        self,
        summary_key: ObjectTypes,
        summary_name: ObjectTypes,
-        summary_number: int,
+        summary_number: Optional[int] = None,
        summary_value: Optional[str] = None,
        summary_score: Optional[float] = None,
        annotation_id: Optional[str] = None,

@@ -294,16 +313,24 @@ class DatapointManager:
                "annotation_id": annotation_id,
            },
        ) as annotation_context:
-            if summary_value:
+            if summary_value is not None:
                ann = ContainerAnnotation(
                    category_name=summary_name,
-                    category_id=str(summary_number),
+                    category_id=str(summary_number) if summary_number is not None else "",
                    value=summary_value,
                    score=summary_score,
+                    service_id=self.service_id,
+                    model_id=self.model_id,
+                    session_id=self.session_id,
                )
            else:
                ann = CategoryAnnotation(
-                    category_name=summary_name,
+                    category_name=summary_name,
+                    category_id=str(summary_number) if summary_number is not None else "",
+                    score=summary_score,
+                    service_id=self.service_id,
+                    model_id=self.model_id,
+                    session_id=self.session_id,
                )
            image.summary.dump_sub_category(summary_key, ann, image.image_id)
 
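The practical effect of the anngen.py changes is that every annotation created through a DatapointManager is now stamped with provenance: the service_id of the pipeline component, the model_id of its predictor (if any) and an optional session_id. A simplified stand-in sketch of that stamping (not the real deepdoctection datapoint classes; ids and names are illustrative):

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class SimpleAnnotation:
        category_name: str
        score: Optional[float] = None
        service_id: Optional[str] = None
        model_id: Optional[str] = None
        session_id: Optional[str] = None


    class SimpleManager:
        def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
            self.service_id = service_id
            self.model_id = model_id
            self.session_id: Optional[str] = None

        def set_annotation(self, category_name: str, score: Optional[float] = None) -> SimpleAnnotation:
            # Stamp provenance on every annotation so it can later be traced back to the
            # component, model and run that produced it.
            return SimpleAnnotation(
                category_name=category_name,
                score=score,
                service_id=self.service_id,
                model_id=self.model_id,
                session_id=self.session_id,
            )


    manager = SimpleManager(service_id="a1b2c3d4", model_id="layout_detector")
    manager.session_id = "run-0001"
    print(manager.set_annotation("table", score=0.97))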
deepdoctection/pipe/base.py
CHANGED
@@ -23,12 +23,14 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import deepcopy
 from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from uuid import uuid1
 
 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict
+from ..utils.identifier import get_uuid_from_str
 from .anngen import DatapointManager
 
 

@@ -58,8 +60,9 @@ class PipelineComponent(ABC):
                     pipeline. Use something that describe the task of the pipeline.
        """
        self.name = name
+        self.service_id = self.get_service_id()
        self._meta_has_all_types()
-        self.dp_manager = DatapointManager()
+        self.dp_manager = DatapointManager(self.service_id)
        self.timer_on = False
 
    @abstractmethod

@@ -75,7 +78,7 @@ class PipelineComponent(ABC):
        As a simplified interface `serve` does not have to return a dp. The data point is passed on within
        pipelines internally (via `pass_datapoint`).
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
    def pass_datapoint(self, dp: Image) -> Image:
        """

@@ -109,7 +112,7 @@ class PipelineComponent(ABC):
        """
        Clone an instance
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
    @abstractmethod
    def get_meta_annotation(self) -> JsonDict:

@@ -122,7 +125,7 @@ class PipelineComponent(ABC):
                     `summaries` with values: A list of summary sub categories
        :return: Dict with meta infos as just described
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
    def _meta_has_all_types(self) -> None:
        if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(

@@ -133,6 +136,12 @@ class PipelineComponent(ABC):
                f"Got {self.get_meta_annotation().keys()}"
            )
 
+    def get_service_id(self) -> str:
+        """
+        Get the generating model
+        """
+        return get_uuid_from_str(self.name)[:8]
+
 
 class PredictorPipelineComponent(PipelineComponent, ABC):
     """

@@ -151,10 +160,11 @@ class PredictorPipelineComponent(PipelineComponent, ABC):
        """
        self.predictor = predictor
        super().__init__(name)
+        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)
 
    @abstractmethod
    def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class LanguageModelPipelineComponent(PipelineComponent, ABC):

@@ -175,15 +185,15 @@ class LanguageModelPipelineComponent(PipelineComponent, ABC):
        """
 
        self.tokenizer = tokenizer
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
        super().__init__(name)
+        self.mapping_to_lm_input_func = mapping_to_lm_input_func
 
    @abstractmethod
    def clone(self) -> "LanguageModelPipelineComponent":
        """
        Clone an instance
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class ImageTransformPipelineComponent(PipelineComponent, ABC):

@@ -206,7 +216,7 @@ class ImageTransformPipelineComponent(PipelineComponent, ABC):
        """
        Clone an instance
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class Pipeline(ABC):

@@ -228,7 +238,7 @@ class Pipeline(ABC):
 
        layout = LayoutPipeComponent(layout_detector ...)
        text = TextExtractPipeComponent(text_detector ...)
-        simple_pipe = MyPipeline
+        simple_pipe = MyPipeline(pipeline_component = [layout, text])
        doc_dataflow = simple_pipe.analyze(input = path / to / dir)
 
        for page in doc_dataflow:

@@ -238,6 +248,18 @@ class Pipeline(ABC):
    model or already processed further).
 
    In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
+
+    It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
+    either passed to the pipeline via the `analyze` method or generated automatically.
+
+    To generate a session_id automatically:
+
+    **Example:**
+
+        pipe = MyPipeline(pipeline_component = [layout, text])
+        pipe.set_session_id = True
+
+        df = pipe.analyze(input = "path/to/dir")  # session_id is generated automatically
    """
 
    def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:

@@ -245,6 +267,7 @@ class Pipeline(ABC):
        :param pipeline_component_list: A list of pipeline components.
        """
        self.pipe_component_list = pipeline_component_list
+        self.set_session_id = False
 
    @abstractmethod
    def _entry(self, **kwargs: Any) -> DataFlow:

@@ -254,14 +277,17 @@ class Pipeline(ABC):
 
        :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
-    def _build_pipe(self, df: DataFlow) -> DataFlow:
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
        """
        Composition of the backbone
        """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
        for component in self.pipe_component_list:
            component.timer_on = True
+            component.dp_manager.session_id = session_id
            df = component.predict_dataflow(df)
        return df
 

@@ -277,7 +303,7 @@ class Pipeline(ABC):
 
        can be triggered.
        """
-        raise NotImplementedError
+        raise NotImplementedError()
 
    def get_meta_annotation(self) -> JsonDict:
        """

@@ -301,22 +327,30 @@ class Pipeline(ABC):
            for key, value in meta_anns["relationships"].items():
                pipeline_populations["relationships"][key].update(value)
            pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-
+        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
+        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
        return pipeline_populations
 
    def get_pipeline_info(
-        self,
-    ) -> Union[Mapping[
+        self, service_id: Optional[str] = None, name: Optional[str] = None
+    ) -> Union[str, Mapping[str, str]]:
        """Get pipeline information: Returns a dictionary with a description of each pipeline component
-        :param
+        :param service_id: service_id of the pipeline component to search for
        :param name: name of the pipeline component to search for
        :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
                 has been passed or the position if the name has been passed.
        """
-        comp_info = {
+        comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
        comp_info_name_as_key = {value: key for key, value in comp_info.items()}
-        if
-        return comp_info[
+        if service_id is not None:
+            return comp_info[service_id]
        if name is not None:
            return comp_info_name_as_key[name]
        return comp_info
+
+    @staticmethod
+    def get_session_id() -> str:
+        """
+        Get the generating a session id
+        """
+        return str(uuid1())[:8]
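Two id schemes appear in this file: service_id is derived deterministically from the component name via get_uuid_from_str and truncated to 8 characters, while get_session_id uses uuid1, so it changes per run. The sketch below mimics that split with uuid5 as a stand-in for deepdoctection's get_uuid_from_str helper; only the truncation convention is taken from the diff.

    from uuid import NAMESPACE_DNS, uuid1, uuid5


    def derive_service_id(component_name: str) -> str:
        # Deterministic: the same component name always yields the same 8-character id.
        return str(uuid5(NAMESPACE_DNS, component_name))[:8]


    def derive_session_id() -> str:
        # Time-based: a fresh 8-character id for every processed run.
        return str(uuid1())[:8]


    print(derive_service_id("image_layout"))  # same value on every call
    print(derive_session_id())                # changes from run to run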
deepdoctection/pipe/cell.py
CHANGED
@@ -24,9 +24,11 @@ from typing import Dict, List, Mapping, Optional, Sequence, Union
 
 import numpy as np
 
+from ..datapoint.annotation import ImageAnnotation
+from ..datapoint.box import crop_box_from_image
 from ..datapoint.image import Image
 from ..extern.base import DetectionResult, ObjectDetector, PdfMiner
-from ..utils.detection_types import JsonDict
+from ..utils.detection_types import ImageType, JsonDict
 from ..utils.settings import ObjectTypes, Relationships
 from ..utils.transform import PadTransform
 from .base import PredictorPipelineComponent

@@ -181,18 +183,14 @@ class SubImageLayoutService(PredictorPipelineComponent):
        """
        sub_image_anns = dp.get_annotation_iter(category_names=self.sub_image_name)
        for sub_image_ann in sub_image_anns:
-
-
-            np_image = sub_image_ann.image.image
-            if self.padder:
-                np_image = self.padder.apply_image(np_image)
-            detect_result_list = self.predictor.predict(np_image)
+            np_image = self.prepare_np_image(sub_image_ann)
+            detect_result_list = self.predictor.predict(np_image)  # type: ignore
            if self.padder and detect_result_list:
                boxes = np.array([detect_result.box for detect_result in detect_result_list])
                boxes_orig = self.padder.inverse_apply_coords(boxes)
                for idx, detect_result in enumerate(detect_result_list):
                    detect_result.box = boxes_orig[idx, :].tolist()
-            if self.detect_result_generator:
+            if self.detect_result_generator and sub_image_ann.image:
                self.detect_result_generator.width = sub_image_ann.image.width
                self.detect_result_generator.height = sub_image_ann.image.height
                detect_result_list = self.detect_result_generator.create_detection_result(detect_result_list)

@@ -235,3 +233,26 @@ class SubImageLayoutService(PredictorPipelineComponent):
            deepcopy(self.detect_result_generator),
            padder_clone,
        )
+
+    def prepare_np_image(self, sub_image_ann: ImageAnnotation) -> ImageType:
+        """Maybe crop and pad a np_array before passing it to the predictor.
+
+        Note that we currently assume to a two level hierachy of images, e.g. we can crop a sub-image from the base
+        image, e.g. the original input but we cannot crop a sub-image from an image which is itself a sub-image.
+
+        :param sub_image_ann: ImageAnnotation to be processed
+        :return: processed np_image
+        """
+        if sub_image_ann.image is None:
+            raise ValueError("sub_image_ann.image is None, but must be an datapoint.Image")
+        np_image = sub_image_ann.image.image
+        if np_image is None and self.dp_manager.datapoint.image is not None:
+            np_image = crop_box_from_image(
+                self.dp_manager.datapoint.image,
+                sub_image_ann.get_bounding_box(self.dp_manager.datapoint.image_id),
+                self.dp_manager.datapoint.width,
+                self.dp_manager.datapoint.height,
+            )
+        if self.padder:
+            np_image = self.padder.apply_image(np_image)
+        return np_image
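prepare_np_image now tolerates sub-images that carry no pixel array of their own: the crop is taken from the parent datapoint and padding is applied afterwards. A self-contained numpy sketch of that crop-then-pad flow, with array slicing and np.pad standing in for crop_box_from_image and PadTransform:

    from typing import Optional, Tuple

    import numpy as np


    def prepare_sub_image(
        page: np.ndarray,
        cached: Optional[np.ndarray],
        box: Tuple[int, int, int, int],  # (ulx, uly, lrx, lry) in absolute pixel coordinates
        pad: int = 0,
    ) -> np.ndarray:
        ulx, uly, lrx, lry = box
        # Re-use the cached crop if the sub-image already has pixels, otherwise crop the page.
        np_image = cached if cached is not None else page[uly:lry, ulx:lrx]
        if pad:
            np_image = np.pad(np_image, ((pad, pad), (pad, pad), (0, 0)), mode="constant")
        return np_image


    page = np.zeros((1000, 800, 3), dtype=np.uint8)
    crop = prepare_sub_image(page, cached=None, box=(100, 200, 500, 600), pad=10)
    print(crop.shape)  # (420, 420, 3)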
deepdoctection/pipe/common.py
CHANGED
@@ -93,8 +93,8 @@ class MatchingService(PipelineComponent):
 
    def __init__(
        self,
-        parent_categories: Union[TypeOrStr,
-        child_categories: Union[TypeOrStr,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
        matching_rule: Literal["iou", "ioa"],
        threshold: float,
        use_weighted_intersections: bool = False,

@@ -112,8 +112,16 @@ class MatchingService(PipelineComponent):
                     value calibrate the ioa.
        :param max_parent_only: Will assign to each child at most one parent with maximum ioa
        """
-        self.parent_categories =
-
+        self.parent_categories = (
+            [get_type(parent_categories)]  # type: ignore
+            if not isinstance(parent_categories, (list, set))
+            else [get_type(parent_category) for parent_category in parent_categories]
+        )
+        self.child_categories = (
+            [get_type(child_categories)]  # type: ignore
+            if not isinstance(child_categories, (list, set))
+            else [get_type(child_category) for child_category in child_categories]
+        )
        assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
        self.matching_rule = matching_rule
        self.threshold = threshold
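MatchingService now accepts either a single category or a sequence for parent_categories and child_categories and normalises both to lists. A stripped-down sketch of that normalisation, with get_type replaced by a plain identity:

    from typing import List, Sequence, Union


    def normalize_categories(categories: Union[str, Sequence[str]]) -> List[str]:
        if not isinstance(categories, (list, set)):
            return [categories]  # a single category passed as a bare string
        return list(categories)


    print(normalize_categories("table"))            # ['table']
    print(normalize_categories(["row", "column"]))  # ['row', 'column']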
deepdoctection/pipe/doctectionpipe.py
CHANGED

@@ -82,7 +82,6 @@ def _proto_process(
    else:
        path_tmp = path
    logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
-    # logger.info("Processing %s", file_name, {"path": path_tmp, "df": path_tmp, "file_name": file_name})
    return dp
 
 

@@ -221,9 +220,10 @@ class DoctectionPipe(Pipeline):
        """
 
        output = kwargs.get("output", "page")
+        session_id = kwargs.get("session_id")
        assert output in ("page", "image", "dict"), "output must be either page image or dict"
        df = self._entry(**kwargs)
-        df = self._build_pipe(df)
+        df = self._build_pipe(df, session_id=session_id)  # type: ignore
        if output == "page":
            df = self.dataflow_to_page(df)
        elif output == "dict":
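Assuming a working deepdoctection 0.31 installation, a session id can be threaded through a run roughly as below; the path and session id are placeholders, and the kwarg is simply forwarded to _build_pipe as shown in the hunk above.

    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer()
    # session_id is picked up from kwargs by DoctectionPipe.analyze and stamped on all annotations of this run
    df = analyzer.analyze(path="path/to/doc.pdf", session_id="my-session-001")
    df.reset_state()

    for page in df:
        print(page.file_name)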