deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/base.py
CHANGED
@@ -23,12 +23,14 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from copy import deepcopy
 from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional, Set, Union
+from uuid import uuid1

 from ..dataflow import DataFlow, MapData
 from ..datapoint.image import Image
 from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner, TextRecognizer
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict
+from ..utils.identifier import get_uuid_from_str
 from .anngen import DatapointManager


@@ -58,8 +60,9 @@ class PipelineComponent(ABC):
         pipeline. Use something that describe the task of the pipeline.
         """
         self.name = name
+        self.service_id = self.get_service_id()
         self._meta_has_all_types()
-        self.dp_manager = DatapointManager()
+        self.dp_manager = DatapointManager(self.service_id)
         self.timer_on = False

     @abstractmethod
@@ -75,7 +78,7 @@ class PipelineComponent(ABC):
         As a simplified interface `serve` does not have to return a dp. The data point is passed on within
         pipelines internally (via `pass_datapoint`).
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     def pass_datapoint(self, dp: Image) -> Image:
         """
@@ -109,7 +112,7 @@ class PipelineComponent(ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     @abstractmethod
     def get_meta_annotation(self) -> JsonDict:
@@ -122,7 +125,7 @@ class PipelineComponent(ABC):
         `summaries` with values: A list of summary sub categories
         :return: Dict with meta infos as just described
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     def _meta_has_all_types(self) -> None:
         if not {"image_annotations", "sub_categories", "relationships", "summaries"}.issubset(
@@ -133,6 +136,12 @@ class PipelineComponent(ABC):
                 f"Got {self.get_meta_annotation().keys()}"
             )

+    def get_service_id(self) -> str:
+        """
+        Get the generating model
+        """
+        return get_uuid_from_str(self.name)[:8]
+

 class PredictorPipelineComponent(PipelineComponent, ABC):
     """
@@ -151,10 +160,11 @@ class PredictorPipelineComponent(PipelineComponent, ABC):
         """
         self.predictor = predictor
         super().__init__(name)
+        self.dp_manager = DatapointManager(self.service_id, self.predictor.model_id)

     @abstractmethod
     def clone(self) -> "PredictorPipelineComponent":
-        raise NotImplementedError
+        raise NotImplementedError()


 class LanguageModelPipelineComponent(PipelineComponent, ABC):
@@ -175,15 +185,15 @@ class LanguageModelPipelineComponent(PipelineComponent, ABC):
         """

         self.tokenizer = tokenizer
-        self.mapping_to_lm_input_func = mapping_to_lm_input_func
         super().__init__(name)
+        self.mapping_to_lm_input_func = mapping_to_lm_input_func

     @abstractmethod
     def clone(self) -> "LanguageModelPipelineComponent":
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class ImageTransformPipelineComponent(PipelineComponent, ABC):
@@ -206,7 +216,7 @@ class ImageTransformPipelineComponent(PipelineComponent, ABC):
         """
         Clone an instance
         """
-        raise NotImplementedError
+        raise NotImplementedError()


 class Pipeline(ABC):
@@ -228,7 +238,7 @@ class Pipeline(ABC):

         layout = LayoutPipeComponent(layout_detector ...)
         text = TextExtractPipeComponent(text_detector ...)
-        simple_pipe = MyPipeline
+        simple_pipe = MyPipeline(pipeline_component = [layout, text])
         doc_dataflow = simple_pipe.analyze(input = path / to / dir)

         for page in doc_dataflow:
@@ -238,6 +248,18 @@ class Pipeline(ABC):
     model or already processed further).

     In addition to `analyze`, the internal `_entry` is used to bundle preprocessing steps.
+
+    It is possible to set a session id for the pipeline. This is useful for logging purposes. The session id can be
+    either passed to the pipeline via the `analyze` method or generated automatically.
+
+    To generate a session_id automatically:
+
+    **Example:**
+
+        pipe = MyPipeline(pipeline_component = [layout, text])
+        pipe.set_session_id = True
+
+        df = pipe.analyze(input = "path/to/dir")  # session_id is generated automatically
     """

     def __init__(self, pipeline_component_list: List[PipelineComponent]) -> None:
@@ -245,6 +267,7 @@ class Pipeline(ABC):
         :param pipeline_component_list: A list of pipeline components.
         """
         self.pipe_component_list = pipeline_component_list
+        self.set_session_id = False

     @abstractmethod
     def _entry(self, **kwargs: Any) -> DataFlow:
@@ -254,14 +277,17 @@ class Pipeline(ABC):

         :param kwargs: Arguments, for dynamic customizing of the processing or for the transfer of processing types
         """
-        raise NotImplementedError
+        raise NotImplementedError()

-    def _build_pipe(self, df: DataFlow) -> DataFlow:
+    def _build_pipe(self, df: DataFlow, session_id: Optional[str] = None) -> DataFlow:
         """
         Composition of the backbone
         """
+        if session_id is None and self.set_session_id:
+            session_id = self.get_session_id()
         for component in self.pipe_component_list:
             component.timer_on = True
+            component.dp_manager.session_id = session_id
             df = component.predict_dataflow(df)
         return df

@@ -277,7 +303,7 @@ class Pipeline(ABC):

         can be triggered.
         """
-        raise NotImplementedError
+        raise NotImplementedError()

     def get_meta_annotation(self) -> JsonDict:
         """
@@ -301,22 +327,30 @@ class Pipeline(ABC):
             for key, value in meta_anns["relationships"].items():
                 pipeline_populations["relationships"][key].update(value)
             pipeline_populations["summaries"].extend(meta_anns["summaries"])  # type: ignore
-
+        pipeline_populations["sub_categories"] = dict(pipeline_populations["sub_categories"])  # type: ignore
+        pipeline_populations["relationships"] = dict(pipeline_populations["relationships"])  # type: ignore
         return pipeline_populations

     def get_pipeline_info(
-        self,
-    ) -> Union[Mapping[
+        self, service_id: Optional[str] = None, name: Optional[str] = None
+    ) -> Union[str, Mapping[str, str]]:
         """Get pipeline information: Returns a dictionary with a description of each pipeline component
-        :param
+        :param service_id: service_id of the pipeline component to search for
         :param name: name of the pipeline component to search for
        :return: Either a full dictionary with position and name of all pipeline components or the name, if the position
                 has been passed or the position if the name has been passed.
        """
-        comp_info = {
+        comp_info = {comp.service_id: comp.name for comp in self.pipe_component_list}
         comp_info_name_as_key = {value: key for key, value in comp_info.items()}
-        if
-        return comp_info[
+        if service_id is not None:
+            return comp_info[service_id]
         if name is not None:
             return comp_info_name_as_key[name]
         return comp_info
+
+    @staticmethod
+    def get_session_id() -> str:
+        """
+        Get the generating a session id
+        """
+        return str(uuid1())[:8]
deepdoctection/pipe/common.py
CHANGED
@@ -18,6 +18,10 @@
 """
 Module for common pipeline components
 """
+from __future__ import annotations
+
+import os
+
 from copy import copy, deepcopy
 from typing import List, Literal, Mapping, Optional, Sequence, Union

@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry

-if
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations


 @pipeline_component_registry.register("ImageCroppingService")
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)

-    def clone(self) ->
+    def clone(self) -> PipelineComponent:
         return self.__class__(self.category_names)

     def get_meta_annotation(self) -> JsonDict:
@@ -93,8 +95,8 @@ class MatchingService(PipelineComponent):

     def __init__(
         self,
-        parent_categories: Union[TypeOrStr,
-        child_categories: Union[TypeOrStr,
+        parent_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
+        child_categories: Union[TypeOrStr, Sequence[TypeOrStr]],
         matching_rule: Literal["iou", "ioa"],
         threshold: float,
         use_weighted_intersections: bool = False,
@@ -112,8 +114,16 @@ class MatchingService(PipelineComponent):
                value calibrate the ioa.
         :param max_parent_only: Will assign to each child at most one parent with maximum ioa
         """
-        self.parent_categories =
-
+        self.parent_categories = (
+            [get_type(parent_categories)]  # type: ignore
+            if not isinstance(parent_categories, (list, set))
+            else [get_type(parent_category) for parent_category in parent_categories]
+        )
+        self.child_categories = (
+            [get_type(child_categories)]  # type: ignore
+            if not isinstance(child_categories, (list, set))
+            else [get_type(child_category) for child_category in child_categories]
+        )
         assert matching_rule in ["iou", "ioa"], "segment rule must be either iou or ioa"
         self.matching_rule = matching_rule
         self.threshold = threshold
@@ -217,7 +227,7 @@ class PageParsingService:
         """
         return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])

-    def clone(self) ->
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -284,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)

-    def clone(self) ->
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)

     def get_meta_annotation(self) -> JsonDict:
@@ -318,7 +328,7 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)

-    def clone(self) ->
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
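With this change the NMS helper is selected by explicit environment variables instead of probing which deep-learning framework can be imported. A hedged usage sketch follows; the variable has to be set before deepdoctection is imported because the selection happens at module import time, and accepting any non-empty value is an assumption based on the `os.environ.get` check above.

import os

# Choose the PyTorch/Detectron2 code path before the first deepdoctection import;
# set DD_USE_TF instead to get the TensorFlow/Tensorpack NMS implementation.
os.environ["DD_USE_TORCH"] = "1"

import deepdoctection as dd  # pipe.common now binds pt_nms_image_annotations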
deepdoctection/pipe/concurrency.py
CHANGED

@@ -18,6 +18,7 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations

 import itertools
 import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")

-    def clone(self) ->
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")

     def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/doctectionpipe.py
CHANGED

@@ -82,7 +82,6 @@ def _proto_process(
     else:
         path_tmp = path
     logger.info(LoggingRecord(f"Processing {file_name}", {"path": path_tmp, "df": path_tmp, "file_name": file_name}))
-    # logger.info("Processing %s", file_name, {"path": path_tmp, "df": path_tmp, "file_name": file_name})
     return dp


@@ -221,9 +220,10 @@ class DoctectionPipe(Pipeline):
         """

         output = kwargs.get("output", "page")
+        session_id = kwargs.get("session_id")
         assert output in ("page", "image", "dict"), "output must be either page image or dict"
         df = self._entry(**kwargs)
-        df = self._build_pipe(df)
+        df = self._build_pipe(df, session_id=session_id)  # type: ignore
         if output == "page":
             df = self.dataflow_to_page(df)
         elif output == "dict":
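Combined with the `Pipeline` changes in base.py, a caller can either pass a fixed session id or let the pipeline generate one per run. A short usage sketch; `get_dd_analyzer` is deepdoctection's usual entry point, and the keyword name `session_id` is taken from the `kwargs.get("session_id")` call above.

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()

# Option 1: supply the session id explicitly; analyze() forwards it to _build_pipe.
df = analyzer.analyze(path="path/to/dir", session_id="my-batch-01")

# Option 2: let the pipeline create a fresh 8-character session id for every run.
analyzer.set_session_id = True
df = analyzer.analyze(path="path/to/dir")

df.reset_state()
for page in df:
    ...  # the session id is now set on every component's dp_manager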
deepdoctection/pipe/language.py
CHANGED
@@ -25,6 +25,7 @@ from ..datapoint.image import Image
 from ..datapoint.view import Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
@@ -86,7 +87,7 @@ class LanguageDetectionService(PipelineComponent):
             text = page.text_no_line_break
         else:
             if dp.image is None:
-                raise
+                raise ImageError("image cannot be None")
             detect_result_list = self.text_detector.predict(dp.image)
             # this is a concatenation of all detection result. No reading order
             text = " ".join([result.text for result in detect_result_list if result.text is not None])
@@ -98,7 +99,7 @@ class LanguageDetectionService(PipelineComponent):
     def clone(self) -> PipelineComponent:
         predictor = self.predictor.clone()
         if not isinstance(predictor, LanguageDetector):
-            raise
+            raise TypeError(f"Predictor must be of type LanguageDetector, but is of type {type(predictor)}")
         return self.__class__(
             predictor,
             copy(self.text_container),
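The bare `raise` statements are replaced with typed exceptions: `ImageError` comes from the new `deepdoctection/utils/error.py` module added in this release (see the file list above), and clone mismatches now raise `TypeError` with a message. The snippet below is a hypothetical minimal sketch of such an error class, only to illustrate the guard pattern; the real class hierarchy in `utils/error.py` is not visible in this diff.

class ImageError(Exception):
    """Sketch only: raised when a component needs pixel data but dp.image is None."""


def serve(dp) -> None:
    if dp.image is None:
        # A dedicated exception lets callers distinguish missing image data
        # from unrelated failures, unlike the previous bare `raise`.
        raise ImageError("image cannot be None")
    ...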
deepdoctection/pipe/layout.py
CHANGED
@@ -18,6 +18,8 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional

 import numpy as np
@@ -25,6 +27,7 @@ import numpy as np
 from ..datapoint.image import Image
 from ..extern.base import ObjectDetector, PdfMiner
 from ..utils.detection_types import JsonDict
+from ..utils.error import ImageError
 from ..utils.transform import PadTransform
 from .base import PredictorPipelineComponent
 from .registry import pipeline_component_registry
@@ -79,7 +82,7 @@ class ImageLayoutService(PredictorPipelineComponent):
             if anns:
                 return
         if dp.image is None:
-            raise
+            raise ImageError("image cannot be None")
         np_image = dp.image
         if self.padder:
             np_image = self.padder.apply_image(np_image)
@@ -108,11 +111,11 @@ class ImageLayoutService(PredictorPipelineComponent):
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"

-    def clone(self) ->
+    def clone(self) -> PredictorPipelineComponent:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
             padder_clone = self.padder.clone()
         if not isinstance(predictor, ObjectDetector):
-            raise
+            raise TypeError(f"predictor must be of type ObjectDetector, but is of type {type(predictor)}")
         return self.__class__(predictor, self.to_image, self.crop_image, padder_clone, self.skip_if_layout_extracted)
deepdoctection/pipe/lm.py
CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File:
+# File: lm.py

 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,57 +18,19 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations

 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, Callable, List, Literal, Optional, Sequence, Union

 from ..datapoint.image import Image
 from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import LanguageModelPipelineComponent
 from .registry import pipeline_component_registry

-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-

 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         else:
             self.default_key = TokenClasses.other
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                     word.annotation_id,
                 )

-    def clone(self) ->
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         return f"lm_token_class_{self.language_model.name}"

     def _init_sanity_checks(self) -> None:
-
-
-
-
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )

+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+

 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(LanguageModelPipelineComponent):
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )

-    def clone(self) ->
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         return f"lm_sequence_class_{self.language_model.name}"

     def _init_sanity_checks(self) -> None:
-
-
-
-
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
-            raise ValueError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
+            raise TypeError(
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
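The hard-coded architecture-to-tokenizer table, which instantiated reference tokenizers at import time, is gone. The sanity check now compares the tokenizer class name stored in the Hugging Face model config with the class of the tokenizer that was actually passed in. Below is a condensed sketch of that check outside the service classes; `config.tokenizer_class` is a standard `transformers` config attribute, although not every checkpoint populates it.

def check_tokenizer_matches_model(model, tokenizer) -> None:
    # The config records the expected tokenizer class as a plain string,
    # e.g. "LayoutLMTokenizerFast"; no reference tokenizer has to be downloaded.
    expected = model.config.tokenizer_class
    if expected != tokenizer.__class__.__name__:
        raise TypeError(
            f"You want to use {type(tokenizer)} but you should use {expected} in this framework"
        )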