deepdoctection 0.31__py3-none-any.whl → 0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +35 -28
- deepdoctection/analyzer/dd.py +30 -24
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/datapoint/annotation.py +2 -1
- deepdoctection/datapoint/box.py +2 -1
- deepdoctection/datapoint/image.py +13 -7
- deepdoctection/datapoint/view.py +95 -24
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +5 -2
- deepdoctection/datasets/base.py +5 -3
- deepdoctection/datasets/info.py +2 -2
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +2 -1
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +17 -13
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +9 -3
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/d2detect.py +24 -32
- deepdoctection/extern/deskew.py +4 -2
- deepdoctection/extern/doctrocr.py +75 -81
- deepdoctection/extern/fastlang.py +4 -2
- deepdoctection/extern/hfdetr.py +22 -28
- deepdoctection/extern/hflayoutlm.py +335 -103
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +8 -4
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -19
- deepdoctection/extern/texocr.py +4 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +10 -7
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +5 -8
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +8 -6
- deepdoctection/mapper/hfstruct.py +6 -1
- deepdoctection/mapper/laylmstruct.py +163 -20
- deepdoctection/mapper/maputils.py +3 -1
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/tpstruct.py +2 -2
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/common.py +11 -9
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/layout.py +3 -1
- deepdoctection/pipe/lm.py +32 -64
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +8 -14
- deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +21 -16
- deepdoctection/train/hf_detr_train.py +18 -11
- deepdoctection/train/hf_layoutlm_train.py +118 -101
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/env_info.py +41 -117
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/settings.py +1 -0
- deepdoctection/utils/viz.py +4 -3
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
- deepdoctection-0.32.dist-info/RECORD +146 -0
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py
CHANGED

@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
 <https://github.com/NielsRogge/Transformers-Tutorials>
 """
 
+from __future__ import annotations
+
 import random
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union
 
 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import
 
 from ..datapoint.annotation import ContainerAnnotation
 from ..datapoint.convert import box_to_point4, point4_to_box
 from ..datapoint.image import Image
+from ..datapoint.view import Page
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import pytorch_available, transformers_available
 from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
 from ..utils.transform import ResizeTransform, normalize_image
 from .maputils import curry
 
-if pytorch_available():
+with try_import() as import_guard:
     import torch
 
-if transformers_available():
-    from transformers import (
-        BatchEncoding,
-        PreTrainedTokenizerFast,
-        RobertaTokenizerFast,
-        XLMRobertaTokenizerFast,
-    )
+with try_import() as tr_import_guard:
+    from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611
 
 __all__ = [
     "image_to_raw_layoutlm_features",
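Editor's note: the recurring change in this release swaps `*_available()` guards for the lazy-imports package. A minimal sketch of the `try_import` pattern, assuming the documented lazy-imports API in which the guard object defers a failed import until `check()` is called (the dependency name is hypothetical):

    from lazy_imports import try_import

    with try_import() as import_guard:
        import some_optional_dependency  # hypothetical package; an ImportError is captured, not raised

    def run_with_dependency() -> None:
        import_guard.check()  # re-raises the captured ImportError only when the feature is used
        some_optional_dependency.run()
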
@@ -54,12 +52,17 @@ __all__ = [
     "image_to_layoutlm_features",
     "DataCollator",
     "LayoutLMFeatures",
+    "image_to_raw_lm_features",
+    "image_to_lm_features",
 ]
 
 RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+RawLMFeatures = NewType("RawLMFeatures", JsonDict)
 LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+LMFeatures = NewType("LMFeatures", JsonDict)
 InputDataClass = NewType("InputDataClass", JsonDict)
 
+
 """
 <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
@@ -208,7 +211,7 @@ def image_to_raw_layoutlm_features(
     return raw_features
 
 
-def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
     Converting list of floats to pytorch tensors
     :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
 
     _image_key = "pixel_values" if "pixel_values" in features else "image"
-    features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+    if "bbox" in features:
+        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
     if "labels" in features:
         features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
     if _image_key in features:
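The new guard around `bbox` matters because features built for plain text models (see `image_to_raw_lm_features` below) may have their boxes stripped before conversion. A minimal sketch with an illustrative feature dict:

    import torch

    features = {"input_ids": [[101, 2054, 102]], "labels": [0]}  # no "bbox" key present
    if "bbox" in features:  # skipped here; a LayoutLM feature dict would enter this branch
        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
    if "labels" in features:
        features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
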
@@ -230,12 +234,12 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
 
 
 def _tokenize_with_sliding_window(
-    raw_features: List[RawLayoutLMFeatures],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: List[Union[RawLayoutLMFeatures, RawLMFeatures]],
+    tokenizer: PreTrainedTokenizerFast,
     sliding_window_stride: int,
     max_batch_size: int,
     return_tensors: Optional[Literal["pt"]] = None,
-) -> Union[JsonDict, "BatchEncoding"]:
+) -> Union[JsonDict, BatchEncoding]:
     """
     Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
     If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
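A hedged sketch of the sliding-window idea behind `_tokenize_with_sliding_window`: when a tokenized sequence exceeds `max_length`, windows of `max_length` tokens are built with start positions shifted by the stride. Function and variable names here are illustrative, not the package's internals:

    from typing import List

    def build_windows(token_ids: List[int], max_length: int, stride: int) -> List[List[int]]:
        # a single window suffices if the sequence fits or no stride is requested
        if len(token_ids) <= max_length or stride <= 0:
            return [token_ids[:max_length]]
        windows, start = [], 0
        while start < len(token_ids):
            windows.append(token_ids[start : start + max_length])
            if start + max_length >= len(token_ids):
                break
            start += stride
        return windows

    assert build_windows(list(range(10)), max_length=4, stride=2) == [
        [0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9],
    ]
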
@@ -398,8 +402,8 @@ def _tokenize_with_sliding_window(
 
 
 def raw_features_to_layoutlm_features(
-    raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, List[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@ def raw_features_to_layoutlm_features(
     remove_columns_for_training: bool = False,
     sliding_window_stride: int = 0,
     max_batch_size: int = 0,
+    remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
     Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -563,8 +568,11 @@ def raw_features_to_layoutlm_features(
         input_dict.pop("ann_ids")
         input_dict.pop("tokens")
 
+    if remove_bounding_boxes:
+        input_dict.pop("bbox")
+
     if return_tensors == "pt":
-        return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+        return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
     return LayoutLMFeatures(input_dict)
 
 
@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
                                   with windows shifted `sliding_window_stride` to the right.
     """
 
-    tokenizer: "PreTrainedTokenizerFast"
+    tokenizer: PreTrainedTokenizerFast
     padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
     truncation: bool = field(default=True)
     return_overflowing_tokens: bool = field(default=False)
     return_tensors: Optional[Literal["pt"]] = field(default=None)
     sliding_window_stride: int = field(default=0)
     max_batch_size: int = field(default=0)
+    remove_bounding_box_features: bool = field(default=False)
 
     def __post_init__(self) -> None:
         assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
@@ -620,7 +629,7 @@ class LayoutLMDataCollator:
                  token_type_ids, attention_masks, boxes, labels`.
         """
         return raw_features_to_layoutlm_features(
-            raw_features,
+            raw_features,  # type: ignore
             self.tokenizer,
             self.padding,
             self.truncation,
@@ -629,13 +638,14 @@ class LayoutLMDataCollator:
             True,
             self.sliding_window_stride,
             self.max_batch_size,
+            self.remove_bounding_box_features,
         )
 
 
 @curry
 def image_to_layoutlm_features(
     dp: Image,
-    tokenizer: "PreTrainedTokenizerFast",
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
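A hedged usage sketch of the extended collator: the new `remove_bounding_box_features` field drops the `bbox` key so the same pipeline can feed a pure text model. The checkpoint name is illustrative, and the collator is assumed to be callable on raw feature dicts as in the diff above:

    from transformers import AutoTokenizer

    from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

    tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased", use_fast=True)
    collator = LayoutLMDataCollator(
        tokenizer,
        return_tensors="pt",
        remove_bounding_box_features=True,  # new in 0.32
    )
    # batch = collator(raw_features)  # raw_features, e.g. from image_to_raw_lm_features
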
@@ -724,3 +734,136 @@ def image_to_layoutlm_features(
         sliding_window_stride=sliding_window_stride,
     )
     return features
+
+
+@curry
+def image_to_raw_lm_features(
+    dp: Image,
+    dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+    use_token_tag: bool = True,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[RawLMFeatures]:
+    """
+    Mapping a datapoint into an intermediate format for bert-like models. Features will be provided in a dict and
+    this mapping can be used for sequence or token classification as well as for inference. To generate input features
+    for the model please use `raw_features_to_layoutlm_features`.
+
+
+    :param dp: Image
+    :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
+    :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                          labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                          `WordType.token_class`.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: dictionary with the following arguments:
+             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    """
+
+    raw_features: RawLMFeatures = RawLMFeatures({})
+
+    page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+    text_ = page.text_
+
+    # pylint: disable=E1137 #3162
+    raw_features["image_id"] = page.image_id
+    raw_features["width"] = page.width
+    raw_features["height"] = page.height
+    raw_features["ann_ids"] = text_["ann_ids"]
+    raw_features["words"] = text_["words"]
+    # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+    # raw_features_to_layoutlm_features
+    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["dataset_type"] = dataset_type
+
+    if use_token_tag and text_["token_tags"]:
+        raw_features["labels"] = text_["token_tags"]
+    elif text_["token_classes"]:
+        raw_features["labels"] = text_["token_classes"]
+    elif page.document_type is not None:
+        document_type_id = (
+            int(page.image_orig.summary.get_sub_category(PageType.document_type).category_id) - 1  # type: ignore
+        )
+        raw_features["labels"] = [document_type_id]
+
+    raw_features["dataset_type"] = dataset_type
+    # pylint: enable=E1137
+    return raw_features
+
+
+@curry
+def image_to_lm_features(
+    dp: Image,
+    tokenizer: PreTrainedTokenizerFast,
+    padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+    truncation: bool = True,
+    return_overflowing_tokens: bool = False,
+    return_tensors: Optional[Literal["pt"]] = "pt",
+    sliding_window_stride: int = 0,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[LayoutLMFeatures]:
+    """
+    Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+    `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+    with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
+    used internally in `LMTokenClassifierService`.
+
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+    :param dp: Image datapoint
+    :param tokenizer: Tokenizer compatible with the language model
+    :param padding: A padding strategy to be passed to the tokenizer. Must be either `max_length, longest` or
+                    `do_not_pad`.
+    :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
+                       maximum acceptable input length for the model if that argument is not provided. This will
+                       truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                       sequences (or a batch of pairs) is provided.
+                       If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
+                       model maximum admissible input size).
+    :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                      can be returned as an additional batch element. Note that in this case, the
+                                      number of input batch samples will be smaller than the output batch samples.
+    :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                           returned in list objects.
+    :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
+                                  windows will be created with each window having max_length sequence input. When using
+                                  `sliding_window_stride=0` no strides will be created, otherwise it will create slides
+                                  with windows shifted `sliding_window_stride` to the right.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: A dict of lm features
+    """
+    raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+        dataset_type=None,
+        use_token_tag=True,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )(dp)
+    if raw_features is None:
+        return None
+    features = raw_features_to_layoutlm_features(
+        raw_features,
+        tokenizer,
+        padding,
+        truncation,
+        return_overflowing_tokens,
+        return_tensors=return_tensors,
+        sliding_window_stride=sliding_window_stride,
+    )
+    return features
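A hedged usage sketch of the new mapper, mirroring the curried call pattern visible in the body above (`image_to_raw_lm_features(...)(dp)`); the checkpoint name is illustrative:

    from transformers import LayoutLMTokenizerFast

    from deepdoctection.mapper.laylmstruct import image_to_lm_features

    tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
    features_func = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")  # curried: returns a callable
    # features = features_func(dp)  # dp: Image; may be None if the page yields no words
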
deepdoctection/mapper/maputils.py
CHANGED

@@ -18,6 +18,8 @@
 """
 Utility functions related to mapping tasks
 """
+from __future__ import annotations
+
 import functools
 import itertools
 import traceback
@@ -55,7 +57,7 @@ class MappingContextManager:
         self.context_error = True
         self.kwargs = kwargs
 
-    def __enter__(self) -> "MappingContextManager":
+    def __enter__(self) -> MappingContextManager:
        """
        context enter
        """
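The `from __future__ import annotations` additions throughout this release enable exactly this kind of change: with postponed evaluation, a method can annotate its return type with its own, not yet fully defined class without quoting it. A minimal illustration:

    from __future__ import annotations

    class Resource:
        def __enter__(self) -> Resource:  # without the future import (or quotes), this name
            return self                   # lookup would fail while the class body executes

        def __exit__(self, *exc) -> None:
            ...
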
deepdoctection/mapper/misc.py
CHANGED
@@ -19,19 +19,22 @@
 Module for small mapping functions
 """
 
+from __future__ import annotations
+
 import ast
 import os
 from typing import List, Mapping, Optional, Sequence, Union
 
+from lazy_imports import try_import
+
 from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import lxml_available
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.utils import is_file_extension
 from .maputils import MappingContextManager, curry
 
-if lxml_available():
+with try_import() as import_guard:
     from lxml import etree  # pylint: disable=W0611
 
 
@@ -175,7 +178,7 @@ def maybe_ann_to_sub_image(
 
 
 @curry
-def xml_to_dict(dp: JsonDict, xslt_obj: "etree.XSLT") -> JsonDict:
+def xml_to_dict(dp: JsonDict, xslt_obj: etree.XSLT) -> JsonDict:
     """
     Convert a xml object into a dict using a xsl style sheet.
 
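For reference, a hedged sketch of the `etree.XSLT` object that `xml_to_dict` now expects in its signature: an XSLT stylesheet compiled once with lxml and applied to parsed documents. The stylesheet here is inlined and purely illustrative:

    from io import BytesIO
    from lxml import etree

    # a tiny stylesheet, inlined so the sketch is self-contained
    xslt_root = etree.XML(
        b"""<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
              <xsl:template match="/"><out><xsl:value-of select="//word"/></out></xsl:template>
            </xsl:stylesheet>"""
    )
    stylesheet = etree.XSLT(xslt_root)
    doc = etree.parse(BytesIO(b"<page><word>hello</word></page>"))
    result = stylesheet(doc)  # apply the transform; str(result) serializes "<out>hello</out>"
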
deepdoctection/mapper/tpstruct.py
CHANGED

@@ -22,15 +22,15 @@ import os.path
 from typing import Optional, Sequence, Union
 
 import numpy as np
+from lazy_imports import try_import
 
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import tf_available
 from ..utils.settings import ObjectTypes
 from .maputils import curry
 
-if tf_available():
+with try_import() as import_guard:
     from tensorflow import convert_to_tensor, uint8  # type: ignore # pylint: disable=E0401
     from tensorflow.image import non_max_suppression  # type: ignore # pylint: disable=E0401
 
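A hedged sketch of the TensorFlow call these lazily imported names feed into (presumably backing `tf_nms_image_annotations`): `non_max_suppression` takes boxes as `[y1, x1, y2, x2]` rows plus per-box scores and returns the indices of the boxes to keep:

    from tensorflow import convert_to_tensor
    from tensorflow.image import non_max_suppression

    boxes = convert_to_tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.9, 0.9]])
    scores = convert_to_tensor([0.9, 0.8])
    # the second box overlaps the first with IoU 0.81 > 0.5, so only index 0 is kept
    keep = non_max_suppression(boxes, scores, max_output_size=2, iou_threshold=0.5)
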
deepdoctection/pipe/__init__.py
CHANGED
@@ -22,7 +22,6 @@ Contains pipeline components that can be plugged into each other and predictors
 
 from .anngen import *
 from .base import *
-from .cell import *
 from .common import *
 from .concurrency import *
 from .doctectionpipe import *
@@ -33,5 +32,6 @@ from .order import *
 from .refine import *
 from .registry import *
 from .segment import *
+from .sub_layout import *
 from .text import *
 from .transform import *
deepdoctection/pipe/common.py
CHANGED
@@ -18,6 +18,10 @@
 """
 Module for common pipeline components
 """
+from __future__ import annotations
+
+import os
+
 from copy import copy, deepcopy
 from typing import List, Literal, Mapping, Optional, Sequence, Union
 
@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
 from .base import PipelineComponent
 from .registry import pipeline_component_registry
 
-if tf_available():
-    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
-elif pytorch_available() and detectron2_available():
+if os.environ.get("DD_USE_TORCH"):
     from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+elif os.environ.get("DD_USE_TF"):
+    from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
 
 
 @pipeline_component_registry.register("ImageCroppingService")
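Framework selection is now explicit: instead of probing which packages are importable, the module reads the `DD_USE_TORCH` / `DD_USE_TF` environment variables. A hedged sketch of the intended usage (set the flag before the first deepdoctection import):

    import os

    os.environ["DD_USE_TORCH"] = "1"  # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # common.py then binds pt_nms_image_annotations as nms_image_annotations
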
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
         for ann in dp.get_annotation(category_names=self.category_names):
             dp.image_ann_to_image(ann.annotation_id, crop_image=True)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(self.category_names)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -225,7 +227,7 @@ class PageParsingService:
         """
         return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])
 
-    def clone(self) -> "PageParsingService":
+    def clone(self) -> PageParsingService:
         """clone"""
         return self.__class__(
             deepcopy(self.text_container),
@@ -292,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
             if ann.annotation_id not in ann_ids_to_keep:
                 self.dp_manager.deactivate_annotation(ann.annotation_id)
 
-    def clone(self) -> "PipelineComponent":
+    def clone(self) -> PipelineComponent:
         return self.__class__(deepcopy(self.nms_pairs), self.threshold)
 
     def get_meta_annotation(self) -> JsonDict:
@@ -326,7 +328,7 @@ class ImageParsingService:
         """
         return MapData(df, self.pass_datapoint)
 
-    def clone(self) -> "ImageParsingService":
+    def clone(self) -> ImageParsingService:
         """clone"""
         return self.__class__(self.dpi)
 

deepdoctection/pipe/concurrency.py
CHANGED

@@ -18,6 +18,7 @@
 """
 Module for multithreading tasks
 """
+from __future__ import annotations
 
 import itertools
 import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
     def serve(self, dp: Image) -> None:
         raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")
 
-    def clone(self) -> "MultiThreadPipelineComponent":
+    def clone(self) -> MultiThreadPipelineComponent:
         raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")
 
     def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/layout.py
CHANGED
@@ -18,6 +18,8 @@
 """
 Module for layout pipeline component
 """
+from __future__ import annotations
+
 from typing import Optional
 
 import numpy as np
@@ -109,7 +111,7 @@ class ImageLayoutService(PredictorPipelineComponent):
     def _get_name(predictor_name: str) -> str:
         return f"image_{predictor_name}"
 
-    def clone(self) -> "PredictorPipelineComponent":
+    def clone(self) -> PredictorPipelineComponent:
         predictor = self.predictor.clone()
         padder_clone = None
         if self.padder:
deepdoctection/pipe/lm.py
CHANGED
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# File:
+# File: lm.py
 
 # Copyright 2021 Dr. Janis Meyer. All rights reserved.
 #
@@ -18,57 +18,19 @@
 """
 Module for token classification pipeline
 """
+from __future__ import annotations
 
 from copy import copy
-from typing import Any, List, Literal, Optional, Sequence, Union
+from typing import Any, Callable, List, Literal, Optional, Sequence, Union
 
 from ..datapoint.image import Image
 from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
-from ..mapper.laylmstruct import image_to_layoutlm_features
+from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import transformers_available
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
 from .base import LanguageModelPipelineComponent
 from .registry import pipeline_component_registry
 
-if transformers_available():
-    from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-    _ARCHITECTURES_TO_TOKENIZER = {
-        ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-            "microsoft/layoutlm-base-uncased"
-        ),
-        ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-        ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-        ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-            "roberta-base", add_prefix_space=True
-        ),
-    }
-
-
-def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-    """
-    We do not use the tokenizer for a particular model that the transformer library provides. Thie mapping therefore
-    returns the tokenizer that should be used for a particular model.
-
-    :param architecture_name: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
-    """
-    return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-
 
 @pipeline_component_registry.register("LMTokenClassifierService")
 class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         else:
             self.default_key = TokenClasses.other
             self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                     word.annotation_id,
                 )
 
-    def clone(self) -> "LMTokenClassifierService":
+    def clone(self) -> LMTokenClassifierService:
+        # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation with
+        # multiple threads
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
         return f"lm_token_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        use_xlm_tokenizer = False
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
 
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]
+
 
 @pipeline_component_registry.register("LMSequenceClassifierService")
 class LMSequenceClassifierService(LanguageModelPipelineComponent):
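A hedged sketch of the new sanity check in isolation: the expected tokenizer class is read from the model's transformers config rather than a hardcoded architecture map. Whether `tokenizer_class` is populated depends on the checkpoint's config.json, so treat this as illustrative:

    from transformers import AutoConfig, AutoTokenizer

    name = "microsoft/layoutlm-base-uncased"  # illustrative checkpoint
    config = AutoConfig.from_pretrained(name)
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
    # config.tokenizer_class may be None if the checkpoint does not pin a tokenizer
    if config.tokenizer_class and config.tokenizer_class != tokenizer.__class__.__name__:
        raise TypeError(f"You want to use {type(tokenizer)} but you should use {config.tokenizer_class}")
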
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         self.padding = padding
         self.truncation = truncation
         self.return_overflowing_tokens = return_overflowing_tokens
-        super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+        image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+        super().__init__(self._get_name(), tokenizer, image_to_features_func)
         self.required_kwargs = {
             "tokenizer": self.tokenizer,
             "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
             PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
         )
 
-    def clone(self) -> "LMSequenceClassifierService":
+    def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
             self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
         return f"lm_sequence_class_{self.language_model.name}"
 
     def _init_sanity_checks(self) -> None:
-        use_xlm_tokenizer = False
-        tokenizer_class = self.language_model.model.config.tokenizer_class
-        if tokenizer_class is not None:
-            use_xlm_tokenizer = True
-        tokenizer_reference = get_tokenizer_from_architecture(
-            self.language_model.model.__class__.__name__, use_xlm_tokenizer
-        )
-        if not isinstance(self.tokenizer, type(tokenizer_reference)):
+        tokenizer_class_name = self.language_model.model.config.tokenizer_class
+        if tokenizer_class_name != self.tokenizer.__class__.__name__:
             raise TypeError(
-                f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+
+    @staticmethod
+    def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+        """Replacing eval functions"""
+        return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+            mapping_str
+        ]