deepdoctection 0.31-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (91)
  1. deepdoctection/__init__.py +35 -28
  2. deepdoctection/analyzer/dd.py +30 -24
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/datapoint/annotation.py +2 -1
  5. deepdoctection/datapoint/box.py +2 -1
  6. deepdoctection/datapoint/image.py +13 -7
  7. deepdoctection/datapoint/view.py +95 -24
  8. deepdoctection/datasets/__init__.py +1 -4
  9. deepdoctection/datasets/adapter.py +5 -2
  10. deepdoctection/datasets/base.py +5 -3
  11. deepdoctection/datasets/info.py +2 -2
  12. deepdoctection/datasets/instances/doclaynet.py +3 -2
  13. deepdoctection/datasets/instances/fintabnet.py +2 -1
  14. deepdoctection/datasets/instances/funsd.py +2 -1
  15. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  16. deepdoctection/datasets/instances/layouttest.py +2 -1
  17. deepdoctection/datasets/instances/publaynet.py +2 -2
  18. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  19. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  20. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  21. deepdoctection/datasets/instances/xfund.py +2 -1
  22. deepdoctection/eval/__init__.py +1 -4
  23. deepdoctection/eval/cocometric.py +2 -1
  24. deepdoctection/eval/eval.py +17 -13
  25. deepdoctection/eval/tedsmetric.py +14 -11
  26. deepdoctection/eval/tp_eval_callback.py +9 -3
  27. deepdoctection/extern/__init__.py +2 -7
  28. deepdoctection/extern/d2detect.py +24 -32
  29. deepdoctection/extern/deskew.py +4 -2
  30. deepdoctection/extern/doctrocr.py +75 -81
  31. deepdoctection/extern/fastlang.py +4 -2
  32. deepdoctection/extern/hfdetr.py +22 -28
  33. deepdoctection/extern/hflayoutlm.py +335 -103
  34. deepdoctection/extern/hflm.py +225 -0
  35. deepdoctection/extern/model.py +56 -47
  36. deepdoctection/extern/pdftext.py +8 -4
  37. deepdoctection/extern/pt/__init__.py +1 -3
  38. deepdoctection/extern/pt/nms.py +6 -2
  39. deepdoctection/extern/pt/ptutils.py +27 -19
  40. deepdoctection/extern/texocr.py +4 -2
  41. deepdoctection/extern/tp/tfutils.py +43 -9
  42. deepdoctection/extern/tp/tpcompat.py +10 -7
  43. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  44. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  45. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  46. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  47. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  48. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  49. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  50. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  56. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  57. deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  60. deepdoctection/extern/tpdetect.py +5 -8
  61. deepdoctection/mapper/__init__.py +3 -8
  62. deepdoctection/mapper/d2struct.py +8 -6
  63. deepdoctection/mapper/hfstruct.py +6 -1
  64. deepdoctection/mapper/laylmstruct.py +163 -20
  65. deepdoctection/mapper/maputils.py +3 -1
  66. deepdoctection/mapper/misc.py +6 -3
  67. deepdoctection/mapper/tpstruct.py +2 -2
  68. deepdoctection/pipe/__init__.py +1 -1
  69. deepdoctection/pipe/common.py +11 -9
  70. deepdoctection/pipe/concurrency.py +2 -1
  71. deepdoctection/pipe/layout.py +3 -1
  72. deepdoctection/pipe/lm.py +32 -64
  73. deepdoctection/pipe/order.py +142 -35
  74. deepdoctection/pipe/refine.py +8 -14
  75. deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
  76. deepdoctection/train/__init__.py +6 -12
  77. deepdoctection/train/d2_frcnn_train.py +21 -16
  78. deepdoctection/train/hf_detr_train.py +18 -11
  79. deepdoctection/train/hf_layoutlm_train.py +118 -101
  80. deepdoctection/train/tp_frcnn_train.py +21 -19
  81. deepdoctection/utils/env_info.py +41 -117
  82. deepdoctection/utils/logger.py +1 -0
  83. deepdoctection/utils/mocks.py +93 -0
  84. deepdoctection/utils/settings.py +1 -0
  85. deepdoctection/utils/viz.py +4 -3
  86. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
  87. deepdoctection-0.32.dist-info/RECORD +146 -0
  88. deepdoctection-0.31.dist-info/RECORD +0 -144
  89. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  90. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
  91. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py CHANGED
@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
  <https://github.com/NielsRogge/Transformers-Tutorials>
  """

+ from __future__ import annotations
+
  import random
  from dataclasses import dataclass, field
  from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union

  import numpy as np
  import numpy.typing as npt
+ from lazy_imports import try_import

  from ..datapoint.annotation import ContainerAnnotation
  from ..datapoint.convert import box_to_point4, point4_to_box
  from ..datapoint.image import Image
+ from ..datapoint.view import Page
  from ..utils.detection_types import JsonDict
- from ..utils.file_utils import pytorch_available, transformers_available
  from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
  from ..utils.transform import ResizeTransform, normalize_image
  from .maputils import curry

- if pytorch_available():
+ with try_import() as import_guard:
      import torch

- if transformers_available():
-     from transformers import (  # pylint: disable=W0611
-         BatchEncoding,
-         PreTrainedTokenizerFast,
-         RobertaTokenizerFast,
-         XLMRobertaTokenizerFast,
-     )
+ with try_import() as tr_import_guard:
+     from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611

  __all__ = [
      "image_to_raw_layoutlm_features",
@@ -54,12 +52,17 @@ __all__ = [
      "image_to_layoutlm_features",
      "DataCollator",
      "LayoutLMFeatures",
+     "image_to_raw_lm_features",
+     "image_to_lm_features",
  ]

  RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+ RawLMFeatures = NewType("RawLMFeatures", JsonDict)
  LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+ LMFeatures = NewType("LMFeatures", JsonDict)
  InputDataClass = NewType("InputDataClass", JsonDict)

+
  """
  <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
  A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
@@ -208,7 +211,7 @@ def image_to_raw_layoutlm_features(
      return raw_features


- def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+ def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
      """
      Converting list of floats to pytorch tensors
      :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
      """

      _image_key = "pixel_values" if "pixel_values" in features else "image"
-     features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+     if "bbox" in features:
+         features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
      if "labels" in features:
          features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
      if _image_key in features:
@@ -230,12 +234,12 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:


  def _tokenize_with_sliding_window(
-     raw_features: List[RawLayoutLMFeatures],
-     tokenizer: "PreTrainedTokenizerFast",
+     raw_features: List[Union[RawLayoutLMFeatures, RawLMFeatures]],
+     tokenizer: PreTrainedTokenizerFast,
      sliding_window_stride: int,
      max_batch_size: int,
      return_tensors: Optional[Literal["pt"]] = None,
- ) -> Union[JsonDict, "BatchEncoding"]:
+ ) -> Union[JsonDict, BatchEncoding]:
      """
      Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
      If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
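
For orientation: this sliding-window logic sits on top of the overflow machinery that Hugging Face fast tokenizers already provide. A self-contained sketch of that underlying behaviour (the checkpoint is only an example; deepdoctection's `sliding_window_stride` builds its own windows rather than passing HF's `stride` through):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
encoding = tokenizer(
    ["token"] * 600,  # one pre-tokenized sequence longer than the 512-token limit
    is_split_into_words=True,
    truncation=True,
    max_length=512,
    stride=128,  # overlap between consecutive windows
    return_overflowing_tokens=True,
)
# One row per window; "overflow_to_sample_mapping" links each window back to its sample.
print(len(encoding["input_ids"]), encoding["overflow_to_sample_mapping"])
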
@@ -398,8 +402,8 @@ def _tokenize_with_sliding_window(


  def raw_features_to_layoutlm_features(
-     raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-     tokenizer: "PreTrainedTokenizerFast",
+     raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, List[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+     tokenizer: PreTrainedTokenizerFast,
      padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
      truncation: bool = True,
      return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@ def raw_features_to_layoutlm_features(
      remove_columns_for_training: bool = False,
      sliding_window_stride: int = 0,
      max_batch_size: int = 0,
+     remove_bounding_boxes: bool = False,
  ) -> LayoutLMFeatures:
      """
      Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -563,8 +568,11 @@ def raw_features_to_layoutlm_features(
          input_dict.pop("ann_ids")
          input_dict.pop("tokens")

+     if remove_bounding_boxes:
+         input_dict.pop("bbox")
+
      if return_tensors == "pt":
-         return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+         return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
      return LayoutLMFeatures(input_dict)

@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
      with windows shifted `sliding_window_stride` to the right.
      """

-     tokenizer: "PreTrainedTokenizerFast"
+     tokenizer: PreTrainedTokenizerFast
      padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
      truncation: bool = field(default=True)
      return_overflowing_tokens: bool = field(default=False)
      return_tensors: Optional[Literal["pt"]] = field(default=None)
      sliding_window_stride: int = field(default=0)
      max_batch_size: int = field(default=0)
+     remove_bounding_box_features: bool = field(default=False)

      def __post_init__(self) -> None:
          assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
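
The new `remove_bounding_box_features` field lets the same collator feed plain language models that accept no `bbox` input. Hypothetical usage, assuming raw features produced by one of the mappers in this module:

from transformers import LayoutLMTokenizerFast

from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
collator = LayoutLMDataCollator(
    tokenizer,
    return_tensors="pt",
    remove_bounding_box_features=True,  # pop "bbox" before tensorization
)
# batch = collator(raw_features)  # raw_features from image_to_raw_lm_features
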
@@ -620,7 +629,7 @@ class LayoutLMDataCollator:
          token_type_ids, attention_masks, boxes, labels`.
          """
          return raw_features_to_layoutlm_features(
-             raw_features,
+             raw_features,  # type: ignore
              self.tokenizer,
              self.padding,
              self.truncation,
@@ -629,13 +638,14 @@ class LayoutLMDataCollator:
              True,
              self.sliding_window_stride,
              self.max_batch_size,
+             self.remove_bounding_box_features,
          )


  @curry
  def image_to_layoutlm_features(
      dp: Image,
-     tokenizer: "PreTrainedTokenizerFast",
+     tokenizer: PreTrainedTokenizerFast,
      padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
      truncation: bool = True,
      return_overflowing_tokens: bool = False,
@@ -724,3 +734,136 @@ def image_to_layoutlm_features(
          sliding_window_stride=sliding_window_stride,
      )
      return features
+
+
+ @curry
+ def image_to_raw_lm_features(
+     dp: Image,
+     dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+     use_token_tag: bool = True,
+     text_container: Optional[LayoutType] = LayoutType.word,
+     floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+     include_residual_text_container: bool = False,
+ ) -> Optional[RawLMFeatures]:
+     """
+     Mapping a datapoint into an intermediate format for bert-like models. Features will be provided in a dict and
+     this mapping can be used for sequence or token classification as well as for inference. To generate input
+     features for the model please use `raw_features_to_layoutlm_features`.
+
+     :param dp: Image
+     :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION.
+     :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                           labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                           `WordType.token_class`.
+     :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+     :param floating_text_block_categories: A list of top level layout objects
+     :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                             blocks and therefore incorporate all image annotations of category
+                                             `word` when building text strings.
+     :return: dictionary with the following arguments:
+              'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+     """
+
+     raw_features: RawLMFeatures = RawLMFeatures({})
+
+     page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+     text_ = page.text_
+
+     # pylint: disable=E1137  #3162
+     raw_features["image_id"] = page.image_id
+     raw_features["width"] = page.width
+     raw_features["height"] = page.height
+     raw_features["ann_ids"] = text_["ann_ids"]
+     raw_features["words"] = text_["words"]
+     # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+     # raw_features_to_layoutlm_features
+     raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+     raw_features["dataset_type"] = dataset_type
+
+     if use_token_tag and text_["token_tags"]:
+         raw_features["labels"] = text_["token_tags"]
+     elif text_["token_classes"]:
+         raw_features["labels"] = text_["token_classes"]
+     elif page.document_type is not None:
+         document_type_id = (
+             int(page.image_orig.summary.get_sub_category(PageType.document_type).category_id) - 1  # type: ignore
+         )
+         raw_features["labels"] = [document_type_id]
+
+     raw_features["dataset_type"] = dataset_type
+     # pylint: enable=E1137
+     return raw_features
+
+
+ @curry
+ def image_to_lm_features(
+     dp: Image,
+     tokenizer: PreTrainedTokenizerFast,
+     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+     truncation: bool = True,
+     return_overflowing_tokens: bool = False,
+     return_tensors: Optional[Literal["pt"]] = "pt",
+     sliding_window_stride: int = 0,
+     text_container: Optional[LayoutType] = LayoutType.word,
+     floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+     include_residual_text_container: bool = False,
+ ) -> Optional[LayoutLMFeatures]:
+     """
+     Mapping function to generate lm features from `Image` to be used for inference in a pipeline component.
+     `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+     with respect to the language model chosen. This mapper is devoted to generating features for bert-like
+     language models. It will be used internally in `LMTokenClassifierService`.
+
+         tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+         layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                              categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+         layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+     :param dp: Image datapoint
+     :param tokenizer: Tokenizer compatible with the language model
+     :param padding: A padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest` or
+                     `do_not_pad`.
+     :param truncation: If `True`, will truncate to a maximum length specified with the argument max_length or to
+                        the maximum acceptable input length for the model if that argument is not provided. This
+                        will truncate token by token, removing a token from the longest sequence in the pair if a
+                        pair of sequences (or a batch of pairs) is provided.
+                        If `False`, then no truncation (i.e. the output batch can have sequence lengths greater
+                        than the model's maximum admissible input size).
+     :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing
+                                       tokens can be returned as an additional batch element. Note that in this
+                                       case, the number of input batch samples will be smaller than the output
+                                       batch samples.
+     :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                            returned in list objects.
+     :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length, sliding
+                                   windows will be created, each window having max_length sequence input. When
+                                   using `sliding_window_stride=0` no strides will be created, otherwise it will
+                                   create windows shifted `sliding_window_stride` to the right.
+     :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+     :param floating_text_block_categories: A list of top level layout objects
+     :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                             blocks and therefore incorporate all image annotations of category
+                                             `word` when building text strings.
+     :return: A dict of lm features
+     """
+     raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+         dataset_type=None,
+         use_token_tag=True,
+         text_container=text_container,
+         floating_text_block_categories=floating_text_block_categories,
+         include_residual_text_container=include_residual_text_container,
+     )(dp)
+     if raw_features is None:
+         return None
+     features = raw_features_to_layoutlm_features(
+         raw_features,
+         tokenizer,
+         padding,
+         truncation,
+         return_overflowing_tokens,
+         return_tensors=return_tensors,
+         sliding_window_stride=sliding_window_stride,
+     )
+     return features
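
The new pure-text path mirrors the LayoutLM path but injects a dummy box per word so that the shared tokenization code can be reused. A hedged sketch of applying the curried mapper (the checkpoint name is illustrative):

from transformers import AutoTokenizer

from deepdoctection.mapper.laylmstruct import image_to_lm_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# The mapper is curried: bind the configuration first, then apply it to an Image datapoint.
mapper = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")
# features = mapper(dp)  # dp: a deepdoctection Image carrying word annotations
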
deepdoctection/mapper/maputils.py CHANGED
@@ -18,6 +18,8 @@
  """
  Utility functions related to mapping tasks
  """
+ from __future__ import annotations
+
  import functools
  import itertools
  import traceback
@@ -55,7 +57,7 @@ class MappingContextManager:
          self.context_error = True
          self.kwargs = kwargs

-     def __enter__(self) -> "MappingContextManager":
+     def __enter__(self) -> MappingContextManager:
          """
          context enter
          """
deepdoctection/mapper/misc.py CHANGED
@@ -19,19 +19,22 @@
  Module for small mapping functions
  """

+ from __future__ import annotations
+
  import ast
  import os
  from typing import List, Mapping, Optional, Sequence, Union

+ from lazy_imports import try_import
+
  from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
  from ..datapoint.image import Image
  from ..utils.detection_types import JsonDict
- from ..utils.file_utils import lxml_available
  from ..utils.fs import get_load_image_func, load_image_from_file
  from ..utils.utils import is_file_extension
  from .maputils import MappingContextManager, curry

- if lxml_available():
+ with try_import() as import_guard:
      from lxml import etree  # pylint: disable=W0611

@@ -175,7 +178,7 @@ def maybe_ann_to_sub_image(


  @curry
- def xml_to_dict(dp: JsonDict, xslt_obj: "etree.XSLT") -> JsonDict:
+ def xml_to_dict(dp: JsonDict, xslt_obj: etree.XSLT) -> JsonDict:
      """
      Convert a xml object into a dict using a xsl style sheet.

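For reference, the `etree.XSLT` object that `xml_to_dict` expects is a compiled stylesheet; a minimal, self-contained lxml sketch:

from lxml import etree

stylesheet = etree.XML(
    b"""<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
          <xsl:template match="/">
            <out><xsl:value-of select="/doc/title"/></out>
          </xsl:template>
        </xsl:stylesheet>"""
)
xslt_obj = etree.XSLT(stylesheet)  # the compiled transform passed to xml_to_dict
result = xslt_obj(etree.XML(b"<doc><title>hello</title></doc>"))
print(str(result))  # serialized <out>hello</out> document
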
deepdoctection/mapper/tpstruct.py CHANGED
@@ -22,15 +22,15 @@ import os.path
  from typing import Optional, Sequence, Union

  import numpy as np
+ from lazy_imports import try_import

  from ..datapoint.annotation import ImageAnnotation
  from ..datapoint.image import Image
  from ..utils.detection_types import JsonDict
- from ..utils.file_utils import tf_available
  from ..utils.settings import ObjectTypes
  from .maputils import curry
- if tf_available():
+ with try_import() as import_guard:
      from tensorflow import convert_to_tensor, uint8  # type: ignore # pylint: disable=E0401
      from tensorflow.image import non_max_suppression  # type: ignore # pylint: disable=E0401

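The TensorFlow NMS helper in this module builds on `tf.image.non_max_suppression`; a minimal sketch of the underlying call:

import tensorflow as tf

boxes = tf.constant([[0.0, 0.0, 10.0, 10.0],
                     [1.0, 1.0, 10.0, 10.0],
                     [50.0, 50.0, 60.0, 60.0]])
scores = tf.constant([0.9, 0.8, 0.7])
keep = tf.image.non_max_suppression(boxes, scores, max_output_size=10, iou_threshold=0.5)
print(keep.numpy())  # [0 2]: the second box overlaps the first too heavily and is dropped
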
deepdoctection/pipe/__init__.py CHANGED
@@ -22,7 +22,6 @@ Contains pipeline components that can be plugged into each other and predictors
  from .anngen import *
  from .base import *
- from .cell import *
  from .common import *
  from .concurrency import *
  from .doctectionpipe import *
@@ -33,5 +32,6 @@ from .order import *
  from .refine import *
  from .registry import *
  from .segment import *
+ from .sub_layout import *
  from .text import *
  from .transform import *
deepdoctection/pipe/common.py CHANGED
@@ -18,6 +18,10 @@
  """
  Module for common pipeline components
  """
+ from __future__ import annotations
+
+ import os
+
  from copy import copy, deepcopy
  from typing import List, Literal, Mapping, Optional, Sequence, Union

@@ -30,16 +34,14 @@ from ..mapper.maputils import MappingContextManager
  from ..mapper.match import match_anns_by_intersection
  from ..mapper.misc import to_image
  from ..utils.detection_types import JsonDict
- from ..utils.file_utils import detectron2_available, pytorch_available, tf_available
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
  from .base import PipelineComponent
  from .registry import pipeline_component_registry

- if tf_available():
-     from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations
-
- elif pytorch_available() and detectron2_available():
+ if os.environ.get("DD_USE_TORCH"):
      from ..mapper.d2struct import pt_nms_image_annotations as nms_image_annotations
+ elif os.environ.get("DD_USE_TF"):
+     from ..mapper.tpstruct import tf_nms_image_annotations as nms_image_annotations


  @pipeline_component_registry.register("ImageCroppingService")
@@ -64,7 +66,7 @@ class ImageCroppingService(PipelineComponent):
          for ann in dp.get_annotation(category_names=self.category_names):
              dp.image_ann_to_image(ann.annotation_id, crop_image=True)

-     def clone(self) -> "PipelineComponent":
+     def clone(self) -> PipelineComponent:
          return self.__class__(self.category_names)

      def get_meta_annotation(self) -> JsonDict:
@@ -225,7 +227,7 @@ class PageParsingService:
          """
          return dict([("image_annotations", []), ("sub_categories", {}), ("relationships", {}), ("summaries", [])])

-     def clone(self) -> "PageParsingService":
+     def clone(self) -> PageParsingService:
          """clone"""
          return self.__class__(
              deepcopy(self.text_container),
@@ -292,7 +294,7 @@ class AnnotationNmsService(PipelineComponent):
          if ann.annotation_id not in ann_ids_to_keep:
              self.dp_manager.deactivate_annotation(ann.annotation_id)

-     def clone(self) -> "PipelineComponent":
+     def clone(self) -> PipelineComponent:
          return self.__class__(deepcopy(self.nms_pairs), self.threshold)

      def get_meta_annotation(self) -> JsonDict:
@@ -326,7 +328,7 @@ class ImageParsingService:
          """
          return MapData(df, self.pass_datapoint)

-     def clone(self) -> "ImageParsingService":
+     def clone(self) -> ImageParsingService:
          """clone"""
          return self.__class__(self.dpi)

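Framework selection for NMS now hinges on environment variables instead of probing which packages are installed. A sketch of the intended usage; the hunk only checks the variables for truthiness, so the exact value is an assumption:

import os

# Select the PyTorch stack before deepdoctection is imported.
os.environ["DD_USE_TORCH"] = "1"

import deepdoctection as dd  # nms_image_annotations resolves to pt_nms_image_annotations
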
deepdoctection/pipe/concurrency.py CHANGED
@@ -18,6 +18,7 @@
  """
  Module for multithreading tasks
  """
+ from __future__ import annotations

  import itertools
  import queue
@@ -221,7 +222,7 @@ class MultiThreadPipelineComponent(PipelineComponent):
      def serve(self, dp: Image) -> None:
          raise NotImplementedError("MultiThreadPipelineComponent does not follow the PipelineComponent implementation")

-     def clone(self) -> "MultiThreadPipelineComponent":
+     def clone(self) -> MultiThreadPipelineComponent:
          raise NotImplementedError("MultiThreadPipelineComponent does not allow cloning")

      def get_meta_annotation(self) -> JsonDict:
deepdoctection/pipe/layout.py CHANGED
@@ -18,6 +18,8 @@
  """
  Module for layout pipeline component
  """
+ from __future__ import annotations
+
  from typing import Optional

  import numpy as np
@@ -109,7 +111,7 @@ class ImageLayoutService(PredictorPipelineComponent):
      def _get_name(predictor_name: str) -> str:
          return f"image_{predictor_name}"

-     def clone(self) -> "PredictorPipelineComponent":
+     def clone(self) -> PredictorPipelineComponent:
          predictor = self.predictor.clone()
          padder_clone = None
          if self.padder:
deepdoctection/pipe/lm.py CHANGED
@@ -1,5 +1,5 @@
  # -*- coding: utf-8 -*-
- # File: tokenclass.py
+ # File: lm.py
  # Copyright 2021 Dr. Janis Meyer. All rights reserved.
  #
@@ -18,57 +18,19 @@
  """
  Module for token classification pipeline
  """
+ from __future__ import annotations

  from copy import copy
- from typing import Any, List, Literal, Optional, Sequence, Union
+ from typing import Any, Callable, List, Literal, Optional, Sequence, Union

  from ..datapoint.image import Image
  from ..extern.hflayoutlm import HFLayoutLmSequenceClassifierBase, HFLayoutLmTokenClassifierBase
- from ..mapper.laylmstruct import image_to_layoutlm_features
+ from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
  from ..utils.detection_types import JsonDict
- from ..utils.file_utils import transformers_available
  from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
  from .base import LanguageModelPipelineComponent
  from .registry import pipeline_component_registry

- if transformers_available():
-     from transformers import LayoutLMTokenizerFast, RobertaTokenizerFast, XLMRobertaTokenizerFast
-
-     _ARCHITECTURES_TO_TOKENIZER = {
-         ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-             "microsoft/layoutlm-base-uncased"
-         ),
-         ("LayoutLMForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-             "microsoft/layoutlm-base-uncased"
-         ),
-         ("LayoutLMv2ForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
-             "microsoft/layoutlm-base-uncased"
-         ),
-         ("LayoutLMv2ForSequenceClassification", False): LayoutLMTokenizerFast.from_pretrained(
-             "microsoft/layoutlm-base-uncased"
-         ),
-         ("LayoutLMv2ForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-         ("LayoutLMv2ForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base"),
-         ("LayoutLMv3ForSequenceClassification", False): RobertaTokenizerFast.from_pretrained(
-             "roberta-base", add_prefix_space=True
-         ),
-         ("LayoutLMv3ForTokenClassification", False): RobertaTokenizerFast.from_pretrained(
-             "roberta-base", add_prefix_space=True
-         ),
-     }
-
-
- def get_tokenizer_from_architecture(architecture_name: str, use_xlm_tokenizer: bool) -> Any:
-     """
-     We do not use the tokenizer for a particular model that the transformers library provides. This mapping
-     therefore returns the tokenizer that should be used for a particular model.
-
-     :param architecture_name: The model as stated in the transformers library.
-     :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-     :return: Tokenizer instance to use.
-     """
-     return _ARCHITECTURES_TO_TOKENIZER[(architecture_name, use_xlm_tokenizer)]
-

  @pipeline_component_registry.register("LMTokenClassifierService")
  class LMTokenClassifierService(LanguageModelPipelineComponent):
@@ -154,7 +116,8 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
          else:
              self.default_key = TokenClasses.other
              self.other_name_as_key = {self.default_key: categories_name_as_key[self.default_key]}
-         super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+         image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+         super().__init__(self._get_name(), tokenizer, image_to_features_func)
          self.required_kwargs = {
              "tokenizer": self.tokenizer,
              "padding": self.padding,
@@ -218,7 +181,9 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
                  word.annotation_id,
              )

-     def clone(self) -> "LMTokenClassifierService":
+     def clone(self) -> LMTokenClassifierService:
+         # ToDo: replace copying of tokenizer with a proper clone method. Otherwise we cannot run the evaluation
+         # with multiple threads
          return self.__class__(
              copy(self.tokenizer),
              self.language_model.clone(),
@@ -244,19 +209,20 @@ class LMTokenClassifierService(LanguageModelPipelineComponent):
          return f"lm_token_class_{self.language_model.name}"

      def _init_sanity_checks(self) -> None:
-         tokenizer_class = self.language_model.model.config.tokenizer_class
-         use_xlm_tokenizer = False
-         if tokenizer_class is not None:
-             use_xlm_tokenizer = True
-         tokenizer_reference = get_tokenizer_from_architecture(
-             self.language_model.model.__class__.__name__, use_xlm_tokenizer
-         )
-         if not isinstance(self.tokenizer, type(tokenizer_reference)):
+         tokenizer_class_name = self.language_model.model.config.tokenizer_class
+         if tokenizer_class_name != self.tokenizer.__class__.__name__:
              raise TypeError(
-                 f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                 f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                  f"in this framework"
              )

+     @staticmethod
+     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+         """Replacing eval functions"""
+         return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+             mapping_str
+         ]
+

  @pipeline_component_registry.register("LMSequenceClassifierService")
  class LMSequenceClassifierService(LanguageModelPipelineComponent):
@@ -315,7 +281,8 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
          self.padding = padding
          self.truncation = truncation
          self.return_overflowing_tokens = return_overflowing_tokens
-         super().__init__(self._get_name(), tokenizer, image_to_layoutlm_features)
+         image_to_features_func = self.image_to_features_func(self.language_model.image_to_features_mapping())
+         super().__init__(self._get_name(), tokenizer, image_to_features_func)
          self.required_kwargs = {
              "tokenizer": self.tokenizer,
              "padding": self.padding,
@@ -335,7 +302,7 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
              PageType.document_type, lm_output.class_name, lm_output.class_id, None, lm_output.score
          )

-     def clone(self) -> "LMSequenceClassifierService":
+     def clone(self) -> LMSequenceClassifierService:
          return self.__class__(
              copy(self.tokenizer),
              self.language_model.clone(),
@@ -358,15 +325,16 @@ class LMSequenceClassifierService(LanguageModelPipelineComponent):
          return f"lm_sequence_class_{self.language_model.name}"

      def _init_sanity_checks(self) -> None:
-         tokenizer_class = self.language_model.model.config.tokenizer_class
-         use_xlm_tokenizer = False
-         if tokenizer_class is not None:
-             use_xlm_tokenizer = True
-         tokenizer_reference = get_tokenizer_from_architecture(
-             self.language_model.model.__class__.__name__, use_xlm_tokenizer
-         )
-         if not isinstance(self.tokenizer, type(tokenizer_reference)):
+         tokenizer_class_name = self.language_model.model.config.tokenizer_class
+         if tokenizer_class_name != self.tokenizer.__class__.__name__:
              raise TypeError(
-                 f"You want to use {type(self.tokenizer)} but you should use {type(tokenizer_reference)} "
+                 f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                  f"in this framework"
              )
+
+     @staticmethod
+     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
+         """Replacing eval functions"""
+         return {"image_to_layoutlm_features": image_to_layoutlm_features, "image_to_lm_features": image_to_lm_features}[
+             mapping_str
+         ]