deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -41,53 +41,64 @@ from .registry import pipeline_component_registry
 @pipeline_component_registry.register("ImageCroppingService")
 class MultiThreadPipelineComponent(PipelineComponent):
     """
-
-    and processed once calling the `start
+    This module provides functionality for running pipeline components in multiple threads to increase throughput.
+    Datapoints are queued and processed once calling the `start` method.

-
-
+    Note:
+        The number of threads is derived from the list of `pipeline_components`. It makes no sense to create various
+        components.

-
-
+    Think of the pipeline component as an asynchronous process. Because the entire data flow is loaded into memory
+    before the process is started, storage capacity must be guaranteed.

-
-
+    If pre- and post-processing are to be carried out before the task within the wrapped pipeline component, this
+    can also be transferred as a function. These tasks are also assigned to the threads.

-
+    The order in the dataflow and when returning lists is generally no longer retained.

+    Example:
+        ```python
         some_component = SubImageLayoutService(some_predictor, some_category)
-
+        some_component_clone = some_component.clone()

-        multi_thread_comp = MultiThreadPipelineComponent(
-
-
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )

         multi_thread_comp.put_task(some_dataflow)
         output_list = multi_thread_comp.start()
+        ```

-
-
-
-
+    Info:
+        You cannot run `MultiThreadPipelineComponent` in `DoctectionPipe` as this requires batching datapoints and
+        neither can you run `MultiThreadPipelineComponent` in combination with a humble `PipelineComponent` unless you
+        take care of batching/unbatching between each component by yourself. The easiest way to build a pipeline with
+        `MultiThreadPipelineComponent` can be accomplished as follows:

+    Example:
+        ```python
         # define the pipeline component
-
-
+        some_component = SubImageLayoutService(some_predictor, some_category)
+        some_component_clone = some_component.clone()

         # creating two threads, one for each component
-        multi_thread_comp = MultiThreadPipelineComponent(
-
-
+        multi_thread_comp = MultiThreadPipelineComponent(
+            pipeline_components=[some_component, some_component_clone],
+            pre_proc_func=maybe_load_image,
+            post_proc_func=maybe_remove_image
+        )

         # currying `to_image`, so that you can call it in `MapData`.
         @curry
-        def _to_image(dp,dpi):
-            return to_image(dp,dpi)
+        def _to_image(dp, dpi):
+            return to_image(dp, dpi)

         # set-up the dataflow/stream, e.g.
         df = SerializerPdfDoc.load(path, max_datapoints=max_datapoints)
         df = MapData(df, to_image(dpi=300))
-        df = BatchData(df, batch_size=32,remainder=True)
+        df = BatchData(df, batch_size=32, remainder=True)
         df = multi_thread_comp.predict_dataflow(df)
         df = FlattenData(df)
         df = MapData(df, lambda x: x[0])

@@ -95,7 +106,8 @@ class MultiThreadPipelineComponent(PipelineComponent):
         df.reset_state()

         for dp in df:
-
+            ...
+        ```
     """

     def __init__(

@@ -106,13 +118,12 @@ class MultiThreadPipelineComponent(PipelineComponent):
         max_datapoints: Optional[int] = None,
     ) -> None:
         """
-        :
-
-
-
-
-
-        :param max_datapoints: max datapoints to process
+        Args:
+            pipeline_components: List of identical `PipelineComponent`. Number of threads created is determined by
+                `len`.
+            pre_proc_func: Function that reads and returns an image. Will execute before entering the pipe component.
+            post_proc_func: Function that reads and returns an image. Will execute after entering the pipe component.
+            max_datapoints: Maximum datapoints to process.
         """

         self.pipe_components = pipeline_components

@@ -125,20 +136,29 @@ class MultiThreadPipelineComponent(PipelineComponent):

     def put_task(self, df: Union[DataFlow, list[Image]]) -> None:
         """
-        Put a
-
+        Put a `DataFlow` or a list of datapoints to the queue.
+
+        Note:
+            The process will not start before `start` is called. If you do not know how many datapoints will be cached,
+            use `max_datapoints` to ensure no OOM.

-        :
+        Args:
+            df: A list or a `DataFlow` of `Image`.
         """

         self._put_datapoints_to_queue(df)

     def start(self) -> list[Image]:
         """
-        Creates a worker for each component and starts processing the
-        is returned once all points in the queue have been processed.
+        Creates a worker for each component and starts processing the datapoints of the queue.

-        :
+        Example:
+            ```python
+            output_list = multi_thread_comp.start()
+            ```
+
+        Returns:
+            A list of `Image` objects.
         """
         with ThreadPoolExecutor(
             max_workers=len(self.pipe_components), thread_name_prefix="EvalWorker"

@@ -195,11 +215,15 @@ class MultiThreadPipelineComponent(PipelineComponent):

     def pass_datapoints(self, dpts: list[Image]) -> list[Image]:
         """
-
-
-        of the output might be
-
-        :
+        Put the list of datapoints into a thread-safe queue and start a separate thread for each pipeline component.
+
+        The order of appearance of the output might not be the same as the input.
+
+        Args:
+            dpts: List of `Image` datapoints.
+
+        Returns:
+            List of processed `Image` datapoints.
         """
         for dp in dpts:
             self.input_queue.put(dp)

@@ -212,10 +236,13 @@ class MultiThreadPipelineComponent(PipelineComponent):

     def predict_dataflow(self, df: DataFlow) -> DataFlow:
         """
-
+        Map a datapoint via `pass_datapoints` within a dataflow pipeline.
+
+        Args:
+            df: An input `DataFlow`.

-        :
-
+        Returns:
+            An output `DataFlow`.
         """
         return MapData(df, self.pass_datapoints)

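The hunks above describe `pre_proc_func` and `post_proc_func` only as "functions that read and return an image". Below is a minimal sketch of the expected shape, assuming the constructor documented above and import paths inferred from the file list at the top of this diff; the hook functions are hypothetical stand-ins for helpers such as the `maybe_load_image`/`maybe_remove_image` pair referenced in the docstring, not library code.

```python
from deepdoctection.datapoint.image import Image
from deepdoctection.pipe.base import PipelineComponent
from deepdoctection.pipe.concurrency import MultiThreadPipelineComponent  # assumed module path


def ensure_image_loaded(dp: Image) -> Image:
    # hypothetical pre-processing hook: make sure the pixel array is available
    # before the wrapped component runs in its worker thread
    return dp


def release_image(dp: Image) -> Image:
    # hypothetical post-processing hook: release the pixel array again so the
    # in-memory queue stays small
    return dp


def build_multi_thread_component(components: list[PipelineComponent]) -> MultiThreadPipelineComponent:
    # one worker thread per (cloned) component, as described in the docstring above
    return MultiThreadPipelineComponent(
        pipeline_components=components,
        pre_proc_func=ensure_image_loaded,
        post_proc_func=release_image,
    )
```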
@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-Module for
+Module for document processing pipeline
 """

 import os

@@ -42,6 +42,21 @@ from .common import PageParsingService
 def _collect_from_kwargs(
     **kwargs: Union[Optional[str], bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
 ) -> Tuple[Optional[str], Union[str, Sequence[str]], bool, int, str, DataFlow, Optional[bytes]]:
+    """
+    Collects and validates keyword arguments for dataflow construction.
+
+    Args:
+        **kwargs: Keyword arguments that may include `path`, `bytes`, `dataset_dataflow`, `shuffle`, `file_type`, and
+            `max_datapoints`.
+
+    Returns:
+        Tuple containing `path`, `file_type`, `shuffle`, `max_datapoints`, `doc_path`, `dataset_dataflow`, and
+        `b_bytes`.
+
+    Raises:
+        ValueError: If neither `path` nor `dataset_dataflow` is provided, or if required arguments are missing.
+        TypeError: If argument types are incorrect.
+    """
     b_bytes = kwargs.get("bytes")
     dataset_dataflow = kwargs.get("dataset_dataflow")
     path = kwargs.get("path")

@@ -115,10 +130,35 @@ def _to_image(
     width: Optional[int] = None,
     height: Optional[int] = None,
 ) -> Optional[Image]:
+    """
+    Converts a data point to an `Image` object.
+
+    Args:
+        dp: The data point, which can be a string or a mapping.
+        dpi: Dots per inch for the image.
+        width: Width of the image.
+        height: Height of the image.
+
+    Returns:
+        An `Image` object or None.
+    """
     return to_image(dp, dpi, width, height)


 def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
+    """
+    Creates a dataflow from a PDF document.
+
+    Args:
+        path: Path to the PDF document.
+        max_datapoints: Maximum number of data points to consider.
+
+    Returns:
+        A `DataFlow` object.
+
+    Raises:
+        FileExistsError: If the file does not exist.
+    """
     if not os.path.isfile(path):
         raise FileExistsError(f"{path} not a file")

@@ -129,28 +169,24 @@ def _doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None)

 class DoctectionPipe(Pipeline):
     """
-    Prototype for a document layout pipeline.
-    single PDF document, dataflow from datasets), conversions in dataflows and building a pipeline.
-
+    Prototype for a document layout pipeline.

+    Contains implementation for loading document types (images in directory, single PDF document, dataflow from
+    datasets), conversions in dataflows, and building a pipeline.

     See `deepdoctection.analyzer.dd` for a concrete implementation.

     See also the explanations in `base.Pipeline`.

-    By default, `DoctectionPipe` will instantiate a default `PageParsingService
-
-        PageParsingService(text_container=LayoutType.word,
-                           text_block_categories=[LayoutType.title,
-                                                  LayoutType.text,
-                                                  LayoutType.list,
-                                                  LayoutType.table])
-
-    but you can overwrite the current setting:
+    By default, `DoctectionPipe` will instantiate a default `PageParsingService`:

-
+    Example:
+        ```python
+        pipe = DoctectionPipe([comp_1, com_2], PageParsingService(text_container= my_custom_setting))
+        ```

-
+    Note:
+        You can overwrite the current setting by providing a custom `PageParsingService`.
     """

     def __init__(

@@ -158,8 +194,17 @@ class DoctectionPipe(Pipeline):
         pipeline_component_list: List[PipelineComponent],
         page_parsing_service: Optional[PageParsingService] = None,
     ):
+        """
+        Initializes the `DoctectionPipe`.
+
+        Args:
+            pipeline_component_list: List of pipeline components.
+            page_parsing_service: Optional custom `PageParsingService`.
+        """
         self.page_parser = (
-            PageParsingService(
+            PageParsingService(
+                text_container=IMAGE_DEFAULTS.TEXT_CONTAINER,
+            )
             if page_parsing_service is None
             else page_parsing_service
         )

@@ -216,13 +261,19 @@
         shuffle: bool = False,
     ) -> DataFlow:
         """
-        Processing method for directories
+        Processing method for directories.

-        :
-
-
-
-
+        Args:
+            path: Path to directory.
+            file_type: File type to consider (single string or list of strings).
+            max_datapoints: Maximum number of data points to consider.
+            shuffle: Whether to shuffle file names for random streaming.
+
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            NotADirectoryError: If the path is not a directory.
         """
         if not os.path.isdir(path):
             raise NotADirectoryError(f"{os.fspath(path)} not a directory")

@@ -232,11 +283,14 @@
     @staticmethod
     def doc_to_dataflow(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
         """
-        Processing method for documents
+        Processing method for documents.

-        :
-
-
+        Args:
+            path: Path to the document.
+            max_datapoints: Maximum number of data points to consider.
+
+        Returns:
+            A `DataFlow` object.
         """
         return _doc_to_dataflow(path, max_datapoints)

@@ -245,13 +299,19 @@
         path: str, b_bytes: bytes, file_type: Union[str, Sequence[str]], max_datapoints: Optional[int] = None
     ) -> DataFlow:
         """
-        Converts a bytes object to a dataflow
+        Converts a bytes object to a dataflow.
+
+        Args:
+            path: Path to directory or an image file.
+            b_bytes: Bytes object.
+            file_type: File type, e.g., `.pdf`, `.jpg`, or a list of image file types.
+            max_datapoints: Maximum number of data points to consider.

-        :
-
-
-        :
-
+        Returns:
+            A `DataFlow` object.
+
+        Raises:
+            ValueError: If the combination of arguments is not supported.
         """

         file_name = os.path.split(path)[1]

@@ -280,10 +340,13 @@

     def dataflow_to_page(self, df: DataFlow) -> DataFlow:
         """
-        Converts a dataflow of images to a dataflow of pages
+        Converts a dataflow of images to a dataflow of pages.
+
+        Args:
+            df: Dataflow.

-        :
-
+        Returns:
+            A dataflow of pages.
         """
         return self.page_parser.predict_dataflow(df)

@@ -291,18 +354,16 @@
         self, **kwargs: Union[str, bytes, DataFlow, bool, int, PathLikeOrStr, Union[str, List[str]]]
     ) -> DataFlow:
         """
-
-
-
-
-
-
-
-
-
-
-
-        `kwargs key max_datapoints:` Stops processing as soon as max_datapoints images have been processed
+        Args:
+            `kwargs:
+                dataset_dataflow (Dataflow):` Transfer a dataflow of a dataset via its dataflow builder
+                path (TypeOrStr):` A path to a directory in which either image documents or pdf files are located. It
+                    is assumed that the pdf documents consist of only one page. If there are multiple pages,
+                    only the first page is processed through the pipeline.
+                    Alternatively, a path to a pdf document with multiple pages.
+                bytes:` A bytes object of an image
+                file_type:` Selection of the file type, if: args:`file_type` is passed
+                max_datapoints:` Stops processing as soon as max_datapoints images have been processed

         :return: dataflow
         """
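Taken together, these hunks document how `path`, `bytes`, `file_type`, `max_datapoints` and `dataset_dataflow` feed the dataflow construction of a `DoctectionPipe`. A short usage sketch, assuming the high-level analyzer factory and `analyze` entry point of the deepdoctection public API; the document path is a placeholder.

```python
import deepdoctection as dd

# build the default analyzer, a concrete DoctectionPipe
analyzer = dd.get_dd_analyzer()

# kwargs as documented above: a directory of images/PDFs or a single multi-page
# PDF both work; max_datapoints caps the number of processed pages
df = analyzer.analyze(path="path/to/doc.pdf", max_datapoints=10)
df.reset_state()  # dataflows must be reset before iteration

for page in df:
    print(page.text)
```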
deepdoctection/pipe/language.py
CHANGED
@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence

 from ..datapoint.image import Image
-from ..datapoint.view import Page
+from ..datapoint.view import ImageDefaults, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type

@@ -36,20 +36,22 @@ class LanguageDetectionService(PipelineComponent):

     There are two ways to use this component:

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    1. By analyzing the already extracted and ordered text. For this purpose, a `Page` object is parsed internally and
+       the full text is passed to the `language_detector`. This approach provides the greatest precision.
+
+    2. By previous text extraction with an object detector and subsequent transfer of concatenated word elements to the
+       `language_detector`. Only one OCR detector can be used here. This method can be used, for example, to select an OCR
+       detector that specializes in a language. Although the word recognition is less accurate
+       when choosing any detector, the results are confident enough to rely on, especially when extracting
+       longer text passages. So, a `TextExtractionService`, for example, can be selected as the subsequent pipeline
+       component. The words determined by the OCR detector are not transferred to the image object.
+
+    Example:
+        ```python
+        lang_detector = FasttextLangDetector(path_weights, profile.categories)
+        component = LanguageDetectionService(lang_detector, text_container="word",
+                                             text_block_names=["text", "title", "table"])
+        ```
     """

     def __init__(

@@ -60,18 +62,20 @@ class LanguageDetectionService(PipelineComponent):
         floating_text_block_categories: Optional[Sequence[TypeOrStr]] = None,
     ):
         """
-
-
-
-
-
-
-
+        Initializes a `LanguageDetectionService` instance.
+
+        Args:
+            language_detector: Detector to determine text.
+            text_container: Text container, needed to generate the reading order. Not necessary when passing a
+                `text_detector`.
+            text_detector: Object detector to extract text. You cannot use a Pdfminer here.
+            floating_text_block_categories: Text blocks, needed for generating the reading order. Not necessary
+                when passing a `text_detector`.
         """

         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else
+        self.text_container = get_type(text_container) if text_container is not None else ImageDefaults.TEXT_CONTAINER
         self.floating_text_block_categories = (
             tuple(get_type(text_block) for text_block in floating_text_block_categories)
             if (floating_text_block_categories is not None)

@@ -81,8 +85,21 @@
         super().__init__(self._get_name(self.predictor.name))

     def serve(self, dp: Image) -> None:
+        """
+        Serves the language detection on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None` and a `text_detector` is used.
+        """
         if self.text_detector is None:
-            page = Page.from_image(
+            page = Page.from_image(
+                image_orig=dp,
+                text_container=self.text_container,
+                floating_text_block_categories=self.floating_text_block_categories,
+            )
             text = page.text_no_line_break
         else:
             if dp.image is None:
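The import change and the new `__init__` default mean that `text_container` now falls back to `ImageDefaults.TEXT_CONTAINER` when nothing is passed. A construction sketch following the new docstring example; the model-catalog profile name is an assumption and may differ in your installation.

```python
from deepdoctection.extern.fastlang import FasttextLangDetector
from deepdoctection.extern.model import ModelCatalog, ModelDownloadManager
from deepdoctection.pipe.language import LanguageDetectionService

# fetch the fasttext language-id weights via the model catalog
# (profile name assumed to match the registered catalog entry)
path_weights = ModelDownloadManager.maybe_download_weights_and_configs("fasttext/lid.176.bin")
profile = ModelCatalog.get_profile("fasttext/lid.176.bin")

lang_detector = FasttextLangDetector(path_weights, profile.categories)

# no text_container passed: with this release it defaults to
# ImageDefaults.TEXT_CONTAINER instead of being left unset
lang_service = LanguageDetectionService(lang_detector)
```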
deepdoctection/pipe/layout.py
CHANGED
@@ -43,11 +43,20 @@ def skip_if_category_or_service_extracted(
     """
     Skip the processing of the pipeline component if the category or service is already extracted.

-
-
+    Example:
+        ```python
         detector = # some detector
         item_component = ImageLayoutService(detector)
         item_component.set_inbound_filter(skip_if_category_or_service_extracted(detector.get_categories(as_dict=False)))
+        ```
+
+    Args:
+        dp: The `Image` datapoint to check.
+        category_names: Optional category names or sequence of `ObjectTypes` to check for.
+        service_ids: Optional service IDs or sequence of service IDs to check for.
+
+    Returns:
+        Whether to skip processing based on existing annotation.
     """

     if dp.get_annotation(category_names=category_names, service_ids=service_ids):

@@ -58,18 +67,21 @@
 @pipeline_component_registry.register("ImageLayoutService")
 class ImageLayoutService(PipelineComponent):
     """
-    Pipeline component for determining the layout.
-    usually on the data set on which the Detector was pre-trained. If the Detector has been trained on Publaynet, these
-    are layouts such as text, title, table, list and figure. If the Detector has been trained on DocBank, these are
-    rather Abstract, Author, Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.
+    Pipeline component for determining the layout.

-
-
+    Which layout blocks are determined depends on the `Detector` and thus usually on the data set on which the
+    `Detector` was pre-trained. If the `Detector` has been trained on Publaynet, these are layouts such as text, title
+    , table, list and figure. If the `Detector` has been trained on DocBank, these are rather Abstract, Author,
+    Caption, Equation, Figure, Footer, List, Paragraph, Reference, Section, Table, Title.

-
+    The component is usually at the beginning of the pipeline. Cropping of the layout blocks can be selected to
+    simplify further processing.

-
-
+    Example:
+        ```python
+        d_items = TPFrcnnDetector(item_config_path, item_weights_path, {1: 'row', 2: 'column'})
+        item_component = ImageLayoutService(d_items)
+        ```
     """

     def __init__(

@@ -80,13 +92,19 @@ class ImageLayoutService(PipelineComponent):
         padder: Optional[PadTransform] = None,
     ):
         """
-
-
-
-
-
-
-
+        Initializes the `ImageLayoutService`.
+
+        Args:
+            layout_detector: The object detector.
+            to_image: Whether to generate an image for each detected block, e.g. populate `ImageAnnotation.image`.
+                Useful if you want to process only some blocks in a subsequent pipeline component.
+            crop_image: Whether to crop the detected block according to its bounding box and populate the resulting sub
+                image to `ImageAnnotation.image.image`.
+            padder: If not `None`, will apply the padder to the image before prediction and inverse apply the padder.
+
+        Note:
+            If `padder` is provided, it will be applied before prediction and inversely applied to the coordinates
+            after prediction.
         """
         self.to_image = to_image
         self.crop_image = crop_image

@@ -95,6 +113,15 @@
         super().__init__(self._get_name(layout_detector.name), self.predictor.model_id)

     def serve(self, dp: Image) -> None:
+        """
+        Serve the pipeline component on the given `Image`.
+
+        Args:
+            dp: The `Image` datapoint to process.
+
+        Raises:
+            ImageError: If `dp.image` is `None`.
+        """
         if dp.image is None:
             raise ImageError("image cannot be None")
         np_image = dp.image