deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/lm.py
CHANGED
````diff
@@ -37,31 +37,34 @@ if TYPE_CHECKING:
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(PipelineComponent):
     """
-
+    Module for token classification pipeline.
 
-
+    This module provides pipeline components for token and sequence classification using language models.
 
-
-
-
-
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
 
-
-
-
-
-
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                          'E-header', 'E-question', 'I-answer', 'I-header',
+                                                          'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
 
-
-
+        # token classification service
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
 
-
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])
 
-
-
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
 
-
-
+        for dp in df:
+            ...
+        ```
     """
 
     def __init__(
````
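For readers skimming the new docstring above: the example composes an OCR stage and a token-classification stage into one pipeline, and the `@pipeline_component_registry.register(...)` decorator is what makes the component addressable by name. Below is a minimal sketch of that registry pattern; the names `Registry` and `MyService` are illustrative, not deepdoctection's actual implementation.

```python
# Sketch of a string-keyed component registry, as suggested by the
# `@pipeline_component_registry.register("LMTokenClassifierService")` decorator
# in the hunk above. Illustrative only, not deepdoctection's real code.
from typing import Callable, Dict, Type


class Registry:
    """Maps string keys to component classes so pipelines can be built by name."""

    def __init__(self) -> None:
        self._components: Dict[str, Type] = {}

    def register(self, name: str) -> Callable[[Type], Type]:
        def deco(cls: Type) -> Type:
            self._components[name] = cls  # record key -> class, return cls unchanged
            return cls

        return deco

    def get(self, name: str) -> Type:
        return self._components[name]


pipeline_component_registry = Registry()


@pipeline_component_registry.register("MyService")
class MyService:
    pass


assert pipeline_component_registry.get("MyService") is MyService
```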
````diff
@@ -76,32 +79,65 @@ class LMTokenClassifierService(PipelineComponent):
         sliding_window_stride: int = 0,
     ) -> None:
         """
-
-
-        :
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Pipeline component for token classification.
+
+        Example:
+            ```python
+            # setting up compulsory ocr service
+            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+            tess = TesseractOcrDetector(tesseract_config_path)
+            ocr_service = TextExtractionService(tess)
+
+            # hf tokenizer and token classifier
+            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+            layoutlm = HFLayoutLmTokenClassifier(categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                              'E-header', 'E-question', 'I-answer', 'I-header',
+                                                              'I-question', 'O', 'S-answer', 'S-header', 'S-question'])
+
+            # token classification service
+            layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+            pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])
+
+            path = "path/to/some/form"
+            df = pipe.analyze(path=path)
+
+            for dp in df:
+                ...
+            ```
+
+        Args:
+            tokenizer: `Token classifier`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model token classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to
+                the maximum acceptable input length for the model if that argument is not provided. This
+                will truncate token by token, removing a token from the longest sequence in the pair if a
+                pair of sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e.,
+                can output batch with sequence lengths greater than the model maximum admissible input
+                size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                can be returned as an additional batch element. Note that in this case, the
+                number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting token classes, it might be possible that some words
+                might not get sent to the model because they are categorized as not
+                eligible token (e.g. empty string). If set to `True` it will assign all
+                words without token the `BioTag.outside` token.
+            segment_positions: Using bounding boxes of segment instead of words improves model accuracy
+                significantly for models that have been trained on segments rather than words.
+                Choose a single or a sequence of layout segments to use their bounding boxes. Note,
+                that the layout segments need to have a child-relationship with words. If a word
+                does not appear as child, it will use the word bounding box.
+            sliding_window_stride: If the output of the `tokenizer` exceeds the `max_length` sequence length, a
+                sliding window will be created with each window having `max_length` sequence
+                input. When using `sliding_window_stride=0` no strides will be created,
+                otherwise it will create slides with windows shifted `sliding_window_stride` to
+                the right.
+
+        Note:
+            If `use_other_as_default_category` is set, words without eligible tokens will be assigned the
+            `BioTag.outside` token.
         """
         self.language_model = language_model
         self.padding = padding
````
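The `padding`, `truncation`, `return_overflowing_tokens` and `sliding_window_stride` arguments documented above are tokenizer-level controls. A short sketch, assuming the Hugging Face `transformers` package, of how these kwargs interact on a fast tokenizer; `stride` here plays the role of the sliding-window shift described in the docstring.

```python
# Sketch: sliding windows over an over-long input with a HF fast tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

words = ["word"] * 1000  # more tokens than the 512-token model limit
encoding = tokenizer(
    " ".join(words),
    padding="max_length",            # pad every window to max_length
    truncation=True,                 # cut each window at max_length
    max_length=512,
    stride=128,                      # overlap between consecutive windows
    return_overflowing_tokens=True,  # emit one batch element per window
)
print(len(encoding["input_ids"]))    # several windows, each 512 tokens long
```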
````diff
@@ -134,6 +170,15 @@ class LMTokenClassifierService(PipelineComponent):
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
+        """
+        Serve the token classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         if lm_input is None:
             return
````
````diff
@@ -231,30 +276,32 @@ class LMTokenClassifierService(PipelineComponent):
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(PipelineComponent):
     """
-    Pipeline component for sequence classification
+    Pipeline component for sequence classification.
 
-
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
 
-
-
-
-
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json", "path/to/model.bin",
+                                                categories=["handwritten", "presentation", "resume"])
 
-
-
-        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                categories=["handwritten", "presentation", "resume"])
+        # token classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer, layoutlm)
 
-
-        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service, layoutlm_service])
 
-
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
 
-
-
+        for dp in df:
+            ...
+        ```
 
-        for dp in df:
-        ...
 
     """
 
````
````diff
@@ -268,22 +315,26 @@ class LMSequenceClassifierService(PipelineComponent):
         use_other_as_default_category: bool = False,
     ) -> None:
         """
-        :
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Args:
+            tokenizer: `Tokenizer`, typing allows currently anything. This will be changed in the future.
+            language_model: `language model sequence classifier`.
+            padding: A padding strategy to be passed to the `tokenizer`. Must be either `max_length`, `longest` or
+                `do_not_pad`.
+            truncation: If `True` will truncate to a maximum length specified with the argument `max_length` or to the
+                maximum acceptable input length for the model if that argument is not provided. This will
+                truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                sequences (or a batch of pairs) is provided. If `False` then no truncation (i.e., can output
+                batch with sequence lengths greater than the model maximum admissible input size).
+            return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                can be returned as an additional batch element. Note that in this case, the
+                number of input batch samples will be smaller than the output batch samples.
+            use_other_as_default_category: When predicting document classes, it might be possible that some pages do
+                not get sent to the model because they are empty. If set to `True` it will
+                assign images with no features the category `TokenClasses.OTHER`.
+
+        Note:
+            If `use_other_as_default_category` is set, images with no features will be assigned the
+            `TokenClasses.OTHER` category.
 
         """
         self.language_model = language_model
````
````diff
@@ -305,6 +356,15 @@ class LMSequenceClassifierService(PipelineComponent):
         self._init_sanity_checks()
 
     def serve(self, dp: Image) -> None:
+        """
+        Serve the sequence classification pipeline on a given `Image`.
+
+        Args:
+            dp: The `Image` to process.
+
+        Returns:
+            None
+        """
         lm_input = self.mapping_to_lm_input_func(**self.required_kwargs)(dp)
         lm_output = None
         if lm_input is None:
````
````diff
@@ -347,7 +407,15 @@ class LMSequenceClassifierService(PipelineComponent):
 
     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
-        """
+        """
+        Get the function to map images to features for the language model.
+
+        Args:
+            mapping_str: The mapping function name as a string.
+
+        Returns:
+            A callable that maps an `Image` to features.
+        """
         return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
             mapping_str
         ]
````
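The body of `image_to_features_func` resolves a configured string to one of two mapper functions via a plain dict lookup. Below is a minimal sketch of that dispatch pattern with explicit error handling; the mapper bodies are placeholders, not deepdoctection's real feature mappers.

```python
# Sketch of dict-based dispatch from a configured name to a mapper function.
from typing import Any, Callable, Dict


def image_to_layoutlm_features(dp: Any) -> Any:  # placeholder mapper
    return {"kind": "layoutlm"}


def image_to_lm_features(dp: Any) -> Any:  # placeholder mapper
    return {"kind": "lm"}


_MAPPERS: Dict[str, Callable[[Any], Any]] = {
    "image_to_layoutlm_features": image_to_layoutlm_features,
    "image_to_lm_features": image_to_lm_features,
}


def resolve_mapper(name: str) -> Callable[[Any], Any]:
    try:
        return _MAPPERS[name]
    except KeyError as err:  # fail loudly on an unknown configuration value
        raise ValueError(f"unknown mapping function: {name!r}") from err
```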