deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py

@@ -89,35 +89,33 @@ def image_to_raw_layoutlm_features(
     segment_positions: Optional[Union[LayoutType, Sequence[LayoutType]]] = None,
 ) -> Optional[RawLayoutLMFeatures]:
     """
-    …
+    Maps a datapoint into an intermediate format for LayoutLM. Features are provided in a dict and this mapping
     can be used for sequence or token classification as well as for inference. To generate input features for the model
-    please …
-    …
-    :return: dictionary with the following arguments:
-             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    please use `raw_features_to_layoutlm_features`.
+
+    Args:
+        dp: `Image`.
+        dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+            this.
+        input_width: Max width of box coordinates. Transforms the image and all box coordinates accordingly.
+        input_height: Target height of box coordinates. Transforms the image and all box coordinates accordingly.
+        image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+            the image has to be resized to a different width. This input will only resize the `image` width.
+        image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+            whereas the image has to be resized to a different height. This input will only resize the `image` height.
+        color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+            LayoutLMv3 uses `RGB`.
+        pixel_mean: (3,) array for `BGR` or `RGB` mean.
+        pixel_std: (3,) array for `BGR` or `RGB` std.
+        use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+            `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+        segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+            Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+            have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+
+    Returns:
+        Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+        `dataset_type`.
     """
 
     raw_features: RawLayoutLMFeatures = RawLayoutLMFeatures({})
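The `input_width`/`input_height` arguments above encode the LayoutLM convention of normalizing all box coordinates to a 0-1000 grid. A minimal sketch of that normalization in plain Python (independent of deepdoctection's own transform code; the function and variable names here are illustrative only):

```python
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # absolute (x0, y0, x1, y1) in pixels

def normalize_boxes(boxes: List[Box], page_width: float, page_height: float,
                    input_width: int = 1000, input_height: int = 1000) -> List[List[int]]:
    """Rescale absolute pixel boxes to the 0-1000 grid LayoutLM models expect."""
    return [
        [
            round(x0 / page_width * input_width),
            round(y0 / page_height * input_height),
            round(x1 / page_width * input_width),
            round(y1 / page_height * input_height),
        ]
        for x0, y0, x1, y1 in boxes
    ]

# A word box on a 612x792 pt PDF page ends up as [[100, 100, 200, 200]]
print(normalize_boxes([(61.2, 79.2, 122.4, 158.4)], 612, 792))
```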
@@ -212,9 +210,13 @@ def image_to_raw_layoutlm_features(
 
 def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
-    …
+    Converts a list of floats to PyTorch tensors.
+
+    Args:
+        features: `LayoutLMFeatures`.
+
+    Returns:
+        `LayoutLMFeatures`.
     """
 
     _image_key = "pixel_values" if "pixel_values" in features else "image"
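What the fixed-up `layoutlm_features_to_pt_tensors` docstring describes is a plain list-to-tensor conversion. A hedged sketch of the idea (the key names follow the docstrings in this diff; the actual deepdoctection implementation may differ in detail):

```python
import torch

TENSOR_KEYS = {"input_ids", "token_type_ids", "attention_mask", "bbox", "labels"}

def to_pt_tensors(features: dict) -> dict:
    """Convert list-valued model inputs to torch tensors, leaving metadata as-is."""
    return {k: torch.tensor(v) if k in TENSOR_KEYS else v for k, v in features.items()}

batch = {
    "input_ids": [[101, 7592, 102]],
    "attention_mask": [[1, 1, 1]],
    "bbox": [[[0, 0, 0, 0], [10, 10, 50, 20], [0, 0, 0, 0]]],
    "ann_ids": [["a-1"]],  # metadata: stays a list
}
print(to_pt_tensors(batch)["bbox"].shape)  # torch.Size([1, 3, 4])
```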
@@ -240,13 +242,23 @@ def _tokenize_with_sliding_window(
     return_tensors: Optional[Literal["pt"]] = None,
 ) -> Union[JsonDict, BatchEncoding]:
     """
-    Runs a tokenizer …
-    If there are overflowing tokens, sliding windows …
-    …
+    Runs a tokenizer. If there are no overflowing tokens, the tokenizer output will be returned as is.
+    If there are overflowing tokens, sliding windows are built. Sliding windows are prepared from raw tokenized outputs
+    by running the tokenizer a second time without truncating and building the sliding windows from this output.
+
+    Note:
+        The current implementation has a bug: for higher batch sizes it will only return overflowing samples.
+        If the dataset consists of many samples with lots of tokens, use a low per device batch size.
+
+    Args:
+        raw_features: List of `RawLayoutLMFeatures` or `RawLMFeatures`.
+        tokenizer: `PreTrainedTokenizerFast`.
+        sliding_window_stride: Stride for sliding window.
+        max_batch_size: Maximum batch size.
+        return_tensors: If `pt`, returns torch tensors.
+
+    Returns:
+        `JsonDict` or `BatchEncoding`.
     """
     # first try: we require return_overflowing_tokens=True. If the number of raw features is equal to
     # overflow_to_sample_mapping then there is nothing more to do because the sample has less than max_length
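The sliding-window behaviour documented above sits on top of the standard Hugging Face fast-tokenizer options `stride` and `return_overflowing_tokens`. A standalone illustration (deepdoctection's `_tokenize_with_sliding_window` additionally carries box and label bookkeeping, which is omitted here):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # a fast tokenizer
text = "one two three four five six seven eight nine ten eleven twelve"

enc = tokenizer(
    text,
    max_length=8,                 # window size, including [CLS]/[SEP]
    truncation=True,
    stride=2,                     # overlap between consecutive windows
    return_overflowing_tokens=True,
)
print(len(enc["input_ids"]))              # number of windows, > 1 for this text
print(enc["overflow_to_sample_mapping"])  # window -> original sample index
```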
@@ -413,33 +425,36 @@ def raw_features_to_layoutlm_features(
     remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
-    …
+    Maps raw features to tokenized input sequences for LayoutLM models.
+
+    Args:
+        raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`,
+            `boxes`, `dataset_type`.
+        tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+            Transformers library does not return essential word_id/token_id mappings, making feature generation
+            more difficult. Only fast tokenizers are allowed.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided. If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model
+            maximum admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+            be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+            than the output batch samples.
+        return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+        remove_columns_for_training: Removes all superfluous columns that are not required for training.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+            will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+            `sliding_window_stride` to the right.
+        max_batch_size: Maximum batch size.
+        remove_bounding_boxes: If `True`, removes bounding box features.
+
+    Returns:
+        Dictionary with the following arguments: `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+        `token_type_ids`, `attention_mask`, `bbox`, `labels`.
     """
 
     if isinstance(raw_features, dict):
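The reason the docstring insists on fast tokenizers is the `word_ids()` mapping, which is what allows word-level boxes and labels to be propagated to sub-word tokens. A minimal sketch of that alignment (standard transformers API; the words and boxes are invented):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast by default

words = ["Invoice", "No.", "4711"]
boxes = [[50, 40, 180, 60], [190, 40, 230, 60], [240, 40, 300, 60]]

enc = tokenizer(words, is_split_into_words=True)
# word_ids() maps every token back to its source word (None for special tokens),
# so each sub-word token inherits the bounding box of its word.
token_boxes = [boxes[i] if i is not None else [0, 0, 0, 0] for i in enc.word_ids(0)]
print(list(zip(enc.tokens(), token_boxes)))
```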
@@ -578,28 +593,30 @@ def raw_features_to_layoutlm_features(
 @dataclass
 class LayoutLMDataCollator:
     """
-    Data collator that will dynamically tokenize, pad and truncate the inputs received.
-    …
+    Data collator that will dynamically tokenize, pad, and truncate the inputs received.
+
+    Args:
+        tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+            Transformers library does not return essential word_id/token_id mappings, making feature generation
+            more difficult. Only fast tokenizers are allowed.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+            be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+            than the output batch samples.
+        return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+            will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+            shifted `sliding_window_stride` to the right.
+        max_batch_size: Maximum batch size.
+        remove_bounding_box_features: If `True`, removes bounding box features.
     """
 
     tokenizer: PreTrainedTokenizerFast
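`LayoutLMDataCollator` is the familiar dataclass-collator pattern: a configured callable handed to a `DataLoader` as `collate_fn`. A generic, runnable sketch of the pattern (the padding logic here is deliberately trivial; the real collator tokenizes, pads, and truncates as documented above):

```python
from dataclasses import dataclass
from typing import List

import torch
from torch.utils.data import DataLoader

@dataclass
class PadCollator:
    pad_id: int = 0

    def __call__(self, samples: List[List[int]]) -> torch.Tensor:
        # Pad every sample in the batch to the length of the longest one.
        width = max(len(s) for s in samples)
        return torch.tensor([s + [self.pad_id] * (width - len(s)) for s in samples])

loader = DataLoader([[101, 2023, 102], [101, 102]], batch_size=2, collate_fn=PadCollator())
print(next(iter(loader)))  # a padded 2x3 batch
```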
@@ -621,11 +638,15 @@ class LayoutLMDataCollator:
 
     def __call__(self, raw_features: Union[RawLayoutLMFeatures, list[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
         """
-        …
+        Calls the `DataCollator` to form model inputs for training and inference.
+
+        Args:
+            raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`,
+                `words`, `boxes`, `dataset_type`.
+
+        Returns:
+            `LayoutLMFeatures` with arguments `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+            `token_type_ids`, `attention_masks`, `boxes`, `labels`.
         """
         return raw_features_to_layoutlm_features(
             raw_features,  # type: ignore
@@ -660,54 +681,57 @@ def image_to_layoutlm_features(
     sliding_window_stride: int = 0,
 ) -> Optional[LayoutLMFeatures]:
     """
-    Mapping function to generate …
+    Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
     `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
     with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
     used internally in `LMTokenClassifierService`.
 
-    …
+    Example:
+        ```python
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+        ```
+
+    Args:
+        dp: `Image` datapoint.
+        tokenizer: Tokenizer compatible with the language model.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+            can be returned as an additional batch element. In this case, the number of input batch samples will be
+            smaller than the output batch samples.
+        return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+            returned in list objects.
+        input_width: Standard input size for image coordinates. All LayoutLM models require input features to be
+            normalized to an image width equal to 1000.
+        input_height: Standard input size for image coordinates. All LayoutLM models require input features to be
+            normalized to an image height equal to 1000.
+        image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+            the image has to be resized to a different width. This input will only resize the `image` width.
+        image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+            whereas the image has to be resized to a different height. This input will only resize the `image` height.
+        color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+            LayoutLMv3 uses `RGB`.
+        pixel_mean: (3,) array for `BGR` or `RGB` mean.
+        pixel_std: (3,) array for `BGR` or `RGB` std.
+        segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+            Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+            have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+            windows will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+            `sliding_window_stride` to the right.
+
+    Returns:
+        A dict of LayoutLM features.
     """
     raw_features = image_to_raw_layoutlm_features(
         None,
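The `Example` block in this docstring needs deepdoctection's own classes, but the tokenizer half can be exercised with transformers alone. A small sketch (the checkpoint name is taken from the example above; `LayoutLMTokenizerFast` is the fast variant these mappers require, and the words are invented):

```python
from transformers import LayoutLMTokenizerFast

tokenizer = LayoutLMTokenizerFast.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
enc = tokenizer(["Total", "amount:", "42.00"], is_split_into_words=True,
                padding="max_length", max_length=512, truncation=True)
print(len(enc["input_ids"]))  # 512, padded to the full sequence length
print(enc.word_ids(0)[:6])    # token -> word alignment used to attach boxes
```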
@@ -745,28 +769,36 @@ def image_to_raw_lm_features(
     include_residual_text_container: bool = False,
 ) -> Optional[RawLMFeatures]:
     """
-    …
+    Maps a datapoint into an intermediate format for BERT-like models. Features are provided in a dict and
     this mapping can be used for sequence or token classification as well as for inference. To generate input features
-    for the model please …
-    …
+    for the model, please use `raw_features_to_layoutlm_features`.
+
+    Args:
+        dp: `Image`.
+        dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+            this.
+        use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+            `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+        text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+        floating_text_block_categories: A list of top-level layout objects.
+        include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+            incorporates all image annotations of category `word` when building text strings.
+
+    Returns:
+        Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+        `dataset_type`.
     """
 
     raw_features: RawLMFeatures = RawLMFeatures({})
 
-    …
+    # We do not need to configure residual_text_block_categories here, because text_ does ignore these layout sections
+    # anyway
+    page = Page.from_image(
+        image_orig=dp,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )
 
     text_ = page.text_
 
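The `use_token_tag` switch above selects between plain token classes and `B`/`I`/`O`-suffixed token tags. A pure-Python illustration of the difference (not deepdoctection internals; the category names are invented):

```python
from typing import List

def to_bio(token_classes: List[str]) -> List[str]:
    """Derive B-/I-/O-style tags from a per-word sequence of token classes."""
    tags, prev = [], None
    for cls in token_classes:
        if cls == "other":
            tags.append("O")
        elif cls == prev:
            tags.append(f"I-{cls.upper()}")  # continuation of the same entity
        else:
            tags.append(f"B-{cls.upper()}")  # beginning of a new entity
        prev = cls
    return tags

print(to_bio(["question", "question", "answer", "other"]))
# ['B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'O']
```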
@@ -808,42 +840,46 @@ def image_to_lm_features(
     include_residual_text_container: bool = False,
 ) -> Optional[LayoutLMFeatures]:
     """
-    Mapping function to generate …
+    Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
     `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
     with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
     used internally in `LMTokenClassifierService`.
 
-    …
+    Example:
+        ```python
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+        ```
+
+    Args:
+        dp: `Image` datapoint.
+        tokenizer: Tokenizer compatible with the language model.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+            can be returned as an additional batch element. In this case, the number of input batch samples will be
+            smaller than the output batch samples.
+        return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+            returned in list objects.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+            windows will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+            shifted `sliding_window_stride` to the right.
+        text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+        floating_text_block_categories: A list of top-level layout objects.
+        include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+            incorporates all image annotations of category `word` when building text strings.
+
+    Returns:
+        A dict of LM features.
     """
     raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
         dataset_type=None,
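To see the shapes these mappers ultimately deliver to the model, here is an end-to-end sketch using plain transformers (a real base checkpoint, but invented words, boxes, and label count; the classification head is randomly initialized, so the logits are meaningful only in shape, and deepdoctection drives roughly this flow through `LMTokenClassifierService`):

```python
import torch
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast

tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased", num_labels=3
)

words = ["Name:", "Jane", "Doe"]
boxes = [[40, 30, 110, 50], [120, 30, 170, 50], [180, 30, 230, 50]]  # 0-1000 grid

enc = tokenizer(words, is_split_into_words=True, return_tensors="pt")
# Attach a box to every token via the word_ids alignment; special tokens get zeros.
bbox = torch.tensor([[boxes[i] if i is not None else [0, 0, 0, 0]
                      for i in enc.word_ids(0)]])

with torch.no_grad():
    logits = model(input_ids=enc["input_ids"],
                   attention_mask=enc["attention_mask"], bbox=bbox).logits
print(logits.shape)  # (1, sequence_length, 3)
```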