deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py

@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
 <https://github.com/NielsRogge/Transformers-Tutorials>
 """

+from __future__ import annotations
+
 import random
 from dataclasses import dataclass, field
 from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union

 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import

 from ..datapoint.annotation import ContainerAnnotation
 from ..datapoint.convert import box_to_point4, point4_to_box
 from ..datapoint.image import Image
+from ..datapoint.view import Page
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import pytorch_available, transformers_available
 from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
 from ..utils.transform import ResizeTransform, normalize_image
 from .maputils import curry

-if pytorch_available():
+with try_import() as import_guard:
     import torch

-if transformers_available():
-    from transformers import (  # pylint: disable=W0611
-        BatchEncoding,
-        PreTrainedTokenizerFast,
-        RobertaTokenizerFast,
-        XLMRobertaTokenizerFast,
-    )
+with try_import() as tr_import_guard:
+    from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611

 __all__ = [
     "image_to_raw_layoutlm_features",
@@ -54,12 +52,17 @@ __all__ = [
     "image_to_layoutlm_features",
     "DataCollator",
     "LayoutLMFeatures",
+    "image_to_raw_lm_features",
+    "image_to_lm_features",
 ]

 RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+RawLMFeatures = NewType("RawLMFeatures", JsonDict)
 LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+LMFeatures = NewType("LMFeatures", JsonDict)
 InputDataClass = NewType("InputDataClass", JsonDict)

+
 """
 <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
@@ -146,7 +149,7 @@ def image_to_raw_layoutlm_features(
             raise TypeError(f"char_cat must be of type ContainerAnnotation but is of type {type(char_cat)}")
         word = char_cat.value
         if not isinstance(word, str):
-            raise ValueError(f"word must be of type str but is of type {type(word)}")
+            raise TypeError(f"word must be of type str but is of type {type(word)}")
         all_words.append(word)

         box = ann.get_bounding_box(dp.image_id)
@@ -208,7 +211,7 @@ def image_to_raw_layoutlm_features(
     return raw_features


-def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
     Converting list of floats to pytorch tensors
     :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """

     _image_key = "pixel_values" if "pixel_values" in features else "image"
-    features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+    if "bbox" in features:
+        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
     if "labels" in features:
         features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
     if _image_key in features:
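The new `"bbox"` guard makes the tensor conversion usable for bert-like models whose features carry no boxes. A small illustration with hypothetical feature dicts (previously the unconditional conversion would raise a `KeyError` on the second one):

    import torch

    layoutlm_features = {"input_ids": [[101, 102]], "bbox": [[[0, 0, 0, 0], [0, 0, 0, 0]]]}
    text_only_features = {"input_ids": [[101, 102]]}  # no "bbox" key for a plain text model

    for features in (layoutlm_features, text_only_features):
        if "bbox" in features:
            features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)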
@@ -230,12 +234,12 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:


 def _tokenize_with_sliding_window(
-    raw_features: List[RawLayoutLMFeatures],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: List[Union[RawLayoutLMFeatures, RawLMFeatures]],
+    tokenizer: PreTrainedTokenizerFast,
     sliding_window_stride: int,
     max_batch_size: int,
     return_tensors: Optional[Literal["pt"]] = None,
-) -> Union[JsonDict, "BatchEncoding"]:
+) -> Union[JsonDict, BatchEncoding]:
     """
     Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
     If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
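For intuition, the underlying Hugging Face tokenizer mechanics this helper builds on look roughly like the sketch below (checkpoint name illustrative; the helper itself additionally honors `sliding_window_stride` and `max_batch_size` when regrouping windows):

    from transformers import LayoutLMTokenizerFast

    tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
    words = ["lorem", "ipsum", "dolor"] * 400  # longer than the 512-token limit
    encoding = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        return_overflowing_tokens=True,
        stride=128,  # overlap between consecutive windows
    )
    # encoding["input_ids"] now holds one row per sliding window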
@@ -398,8 +402,8 @@ def _tokenize_with_sliding_window(


 def raw_features_to_layoutlm_features(
-    raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, List[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@ def raw_features_to_layoutlm_features(
     remove_columns_for_training: bool = False,
     sliding_window_stride: int = 0,
     max_batch_size: int = 0,
+    remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
     Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -563,8 +568,11 @@ def raw_features_to_layoutlm_features(
         input_dict.pop("ann_ids")
         input_dict.pop("tokens")

+    if remove_bounding_boxes:
+        input_dict.pop("bbox")
+
     if return_tensors == "pt":
-        return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+        return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
     return LayoutLMFeatures(input_dict)


@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
                                   with windows shifted `sliding_window_stride` to the right.
     """

-    tokenizer: "PreTrainedTokenizerFast"
+    tokenizer: PreTrainedTokenizerFast
     padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
     truncation: bool = field(default=True)
    return_overflowing_tokens: bool = field(default=False)
    return_tensors: Optional[Literal["pt"]] = field(default=None)
    sliding_window_stride: int = field(default=0)
    max_batch_size: int = field(default=0)
+    remove_bounding_box_features: bool = field(default=False)

     def __post_init__(self) -> None:
         assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
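A hedged usage sketch of the collator (model name and feature source are illustrative only). The new `remove_bounding_box_features` field is forwarded as `remove_bounding_boxes` to `raw_features_to_layoutlm_features`, which pops `bbox` before tensorization:

    from transformers import LayoutLMTokenizerFast

    tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
    collate = LayoutLMDataCollator(
        tokenizer,
        return_tensors="pt",
        remove_bounding_box_features=True,  # for bert-like models that take no box input
    )
    batch = collate(raw_features)  # raw_features: List[RawLayoutLMFeatures]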
@@ -620,7 +629,7 @@ class LayoutLMDataCollator:
                  token_type_ids, attention_masks, boxes, labels`.
         """
         return raw_features_to_layoutlm_features(
-            raw_features,
+            raw_features,  # type: ignore
             self.tokenizer,
             self.padding,
             self.truncation,
@@ -629,13 +638,14 @@ class LayoutLMDataCollator:
             True,
             self.sliding_window_stride,
             self.max_batch_size,
+            self.remove_bounding_box_features,
         )


 @curry
 def image_to_layoutlm_features(
     dp: Image,
-    tokenizer: "PreTrainedTokenizerFast",
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -724,3 +734,136 @@ def image_to_layoutlm_features(
         sliding_window_stride=sliding_window_stride,
     )
     return features
+
+
+@curry
+def image_to_raw_lm_features(
+    dp: Image,
+    dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+    use_token_tag: bool = True,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[RawLMFeatures]:
+    """
+    Mapping a datapoint into an intermediate format for bert-like models. Features will be provided in a dict and
+    this mapping can be used for sequence or token classification as well as for inference. To generate input
+    features for the model please use `raw_features_to_layoutlm_features`.
+
+    :param dp: Image
+    :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
+    :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                          labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                          `WordType.token_class`.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects.
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: dictionary with the following arguments:
+             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    """
+
+    raw_features: RawLMFeatures = RawLMFeatures({})
+
+    page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+    text_ = page.text_
+
+    # pylint: disable=E1137 #3162
+    raw_features["image_id"] = page.image_id
+    raw_features["width"] = page.width
+    raw_features["height"] = page.height
+    raw_features["ann_ids"] = text_["ann_ids"]
+    raw_features["words"] = text_["words"]
+    # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+    # raw_features_to_layoutlm_features
+    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["dataset_type"] = dataset_type
+
+    if use_token_tag and text_["token_tags"]:
+        raw_features["labels"] = text_["token_tags"]
+    elif text_["token_classes"]:
+        raw_features["labels"] = text_["token_classes"]
+    elif page.document_type is not None:
+        document_type_id = (
+            int(page.image_orig.summary.get_sub_category(PageType.document_type).category_id) - 1  # type: ignore
+        )
+        raw_features["labels"] = [document_type_id]
+
+    raw_features["dataset_type"] = dataset_type
+    # pylint: enable=E1137
+    return raw_features
+
+
+@curry
+def image_to_lm_features(
+    dp: Image,
+    tokenizer: PreTrainedTokenizerFast,
+    padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+    truncation: bool = True,
+    return_overflowing_tokens: bool = False,
+    return_tensors: Optional[Literal["pt"]] = "pt",
+    sliding_window_stride: int = 0,
+    text_container: Optional[LayoutType] = LayoutType.word,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[LayoutLMFeatures]:
+    """
+    Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+    `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+    with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will
+    be used internally in `LMTokenClassifierService`.
+
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+
+    :param dp: Image datapoint
+    :param tokenizer: Tokenizer compatible with the language model
+    :param padding: A padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest` or
+                    `do_not_pad`.
+    :param truncation: If `True`, will truncate to a maximum length specified with the argument max_length or to
+                       the maximum acceptable input length for the model if that argument is not provided. This
+                       will truncate token by token, removing a token from the longest sequence in the pair if a
+                       pair of sequences (or a batch of pairs) is provided.
+                       If `False`, then no truncation (i.e. the output batch can have sequence lengths greater
+                       than the model's maximum admissible input size).
+    :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing
+                                      tokens can be returned as an additional batch element. Note that in this
+                                      case, the number of input batch samples will be smaller than the output
+                                      batch samples.
+    :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                           returned as list objects.
+    :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length, sliding
+                                  windows will be created, with each window having max_length sequence input. With
+                                  `sliding_window_stride=0` no strides will be created, otherwise it will create
+                                  windows shifted `sliding_window_stride` to the right.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects.
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: A dict of lm features
+    """
+    raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+        dataset_type=None,
+        use_token_tag=True,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )(dp)
+    if raw_features is None:
+        return None
+    features = raw_features_to_layoutlm_features(
+        raw_features,
+        tokenizer,
+        padding,
+        truncation,
+        return_overflowing_tokens,
+        return_tensors=return_tensors,
+        sliding_window_stride=sliding_window_stride,
+    )
+    return features
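A hedged sketch of how the new text-only mapper might be wired up (checkpoint name illustrative; it mirrors the LayoutLM snippet in the docstring above). Because the function is curried, it is configured once and then applied per `Image`:

    from transformers import XLMRobertaTokenizerFast

    tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
    to_features = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")
    features = to_features(dp)  # dp: Image
    # the dummy _CLS_BOX boxes are kept so the shared LayoutLM tokenization path works unchanged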
deepdoctection/mapper/maputils.py

@@ -18,6 +18,8 @@
 """
 Utility functions related to mapping tasks
 """
+from __future__ import annotations
+
 import functools
 import itertools
 import traceback
@@ -28,8 +30,8 @@ import numpy as np
 from tabulate import tabulate
 from termcolor import colored

-from ..datapoint.box import BoundingBoxError
 from ..utils.detection_types import DP, BaseExceptionType, S, T
+from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import ObjectTypes

@@ -55,7 +57,7 @@ class MappingContextManager:
         self.context_error = True
         self.kwargs = kwargs

-    def __enter__(self) -> "MappingContextManager":
+    def __enter__(self) -> MappingContextManager:
         """
         context enter
         """
@@ -72,7 +74,18 @@ class MappingContextManager:
         """
         if (
             exc_type
-            in (KeyError, ValueError, IndexError, AssertionError, TypeError, BoundingBoxError, FileNotFoundError)
+            in (
+                KeyError,
+                ValueError,
+                IndexError,
+                AssertionError,
+                TypeError,
+                FileNotFoundError,
+                BoundingBoxError,
+                AnnotationError,
+                ImageError,
+                UUIDError,
+            )
             and exc_tb is not None
         ):
             frame_summary = traceback.extract_tb(exc_tb)[0]
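With the widened tuple, mapper failures raised as the new `utils.error` types are logged with their frame information and swallowed instead of aborting the whole dataflow. A minimal sketch, assuming the constructor takes the datapoint name as its first argument (as in `MappingContextManager(str(idx))` later in this diff):

    from deepdoctection.mapper.maputils import MappingContextManager
    from deepdoctection.utils.error import BoundingBoxError

    with MappingContextManager("sample_0001") as mapping_context:
        raise BoundingBoxError("ulx must be a positive float")  # caught and logged, not re-raised

    if mapping_context.context_error:
        pass  # the flagged datapoint can be skipped

Exceptions outside the tuple (e.g. KeyboardInterrupt) still propagate.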
deepdoctection/mapper/misc.py

@@ -19,19 +19,22 @@
 Module for small mapping functions
 """

+from __future__ import annotations
+
 import ast
 import os
 from typing import List, Mapping, Optional, Sequence, Union

+from lazy_imports import try_import
+
 from ..datapoint.convert import convert_pdf_bytes_to_np_array_v2
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import lxml_available
 from ..utils.fs import get_load_image_func, load_image_from_file
 from ..utils.utils import is_file_extension
 from .maputils import MappingContextManager, curry

-if lxml_available():
+with try_import() as import_guard:
     from lxml import etree  # pylint: disable=W0611


@@ -175,7 +178,7 @@ def maybe_ann_to_sub_image(


 @curry
-def xml_to_dict(dp: JsonDict, xslt_obj: "etree.XSLT") -> JsonDict:
+def xml_to_dict(dp: JsonDict, xslt_obj: etree.XSLT) -> JsonDict:
     """
     Convert an xml object into a dict using an xsl style sheet.

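Since `xml_to_dict` is curried, it is primed with the compiled stylesheet and then mapped over datapoints. A hedged sketch (the stylesheet path is hypothetical):

    from lxml import etree

    xslt_obj = etree.XSLT(etree.parse("table_to_dict.xsl"))  # hypothetical stylesheet
    to_dict = xml_to_dict(xslt_obj=xslt_obj)
    result = to_dict(xml_dp)  # xml_dp: a parsed XML datapoint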
deepdoctection/mapper/prodigystruct.py

@@ -128,7 +128,7 @@ def prodigy_to_image(
         else:
             label = span["label"]
         if not isinstance(label, str):
-            raise ValueError("label could not assigned to be a string")
+            raise TypeError("label must be a string")

         annotation = ImageAnnotation(
             category_name=label,
deepdoctection/mapper/pubstruct.py

@@ -75,12 +75,14 @@ def _cell_token(html: Sequence[str]) -> List[List[int]]:
 def _item_spans(html: Sequence[str], index_cells: Sequence[Sequence[int]], item: str) -> List[List[int]]:
     item_spans = [
         [
-            int(html[index_cell - 1].replace(item + "=", "").replace('"', ""))
-            if (item in html[index_cell - 1] and html[index_cell] == ">")
-            else (
-                int(html[index_cell - 2].replace(item + "=", "").replace('"', ""))
-                if (item in html[index_cell - 2] and html[index_cell] == ">")
-                else 1
+            (
+                int(html[index_cell - 1].replace(item + "=", "").replace('"', ""))
+                if (item in html[index_cell - 1] and html[index_cell] == ">")
+                else (
+                    int(html[index_cell - 2].replace(item + "=", "").replace('"', ""))
+                    if (item in html[index_cell - 2] and html[index_cell] == ">")
+                    else 1
+                )
             )
             for index_cell in index_cell_per_row
         ]
@@ -210,9 +212,7 @@ def _add_items(image: Image, item_type: str, categories_name_as_key: Dict[str, s
     items = image.get_annotation(category_names=TableType.item)
     item_type_anns = [ann for ann in items if ann.get_sub_category(TableType.item).category_name == item_type]
     item_type_anns.sort(
-        key=lambda x: x.bounding_box.cx  # type: ignore
-        if item_type == LayoutType.column
-        else x.bounding_box.cy  # type: ignore
+        key=lambda x: (x.bounding_box.cx if item_type == LayoutType.column else x.bounding_box.cy)  # type: ignore
     )
     if table.bounding_box:
         tmp_item_xy = table.bounding_box.uly + 1.0 if item_type == LayoutType.row else table.bounding_box.ulx + 1.0
@@ -389,7 +389,7 @@ def pub_to_image_uncur( # pylint: disable=R0914
     with MappingContextManager(str(idx)) as mapping_context:
         max_rs, max_cs = 0, 0
         if idx is None:
-            raise ValueError("No valid datapoint external id")
+            raise TypeError("imgid is None but must be a string")

         image = Image(file_name=os.path.split(dp["filename"])[1], location=dp["filename"], external_id=idx)

deepdoctection/mapper/tpstruct.py

@@ -22,15 +22,15 @@ import os.path
 from typing import Optional, Sequence, Union

 import numpy as np
+from lazy_imports import try_import

 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.image import Image
 from ..utils.detection_types import JsonDict
-from ..utils.file_utils import tf_available
 from ..utils.settings import ObjectTypes
 from .maputils import curry

-if tf_available():
+with try_import() as import_guard:
     from tensorflow import convert_to_tensor, uint8  # type: ignore # pylint: disable=E0401
     from tensorflow.image import non_max_suppression  # type: ignore # pylint: disable=E0401

@@ -67,7 +67,7 @@ def image_to_tp_frcnn_training(
         all_categories.append(ann.category_id)

     if add_mask:
-        raise NotImplementedError
+        raise NotImplementedError()

     output["gt_boxes"] = np.asarray(all_boxes, dtype="float32")
     output["gt_labels"] = np.asarray(all_categories, dtype="int32")
deepdoctection/pipe/__init__.py

@@ -22,7 +22,6 @@ Contains pipeline components that can be plugged into each other and predictors

 from .anngen import *
 from .base import *
-from .cell import *
 from .common import *
 from .concurrency import *
 from .doctectionpipe import *
@@ -33,5 +32,6 @@ from .order import *
 from .refine import *
 from .registry import *
 from .segment import *
+from .sub_layout import *
 from .text import *
 from .transform import *
deepdoctection/pipe/anngen.py

@@ -42,11 +42,14 @@ class DatapointManager:
     The manager is part of each `PipelineComponent`.
     """

-    def __init__(self) -> None:
+    def __init__(self, service_id: str, model_id: Optional[str] = None) -> None:
         self._datapoint: Optional[Image] = None
         self._cache_anns: Dict[str, ImageAnnotation] = {}
         self.datapoint_is_passed: bool = False
         self.category_id_mapping: Optional[Mapping[int, int]] = None
+        self.service_id = service_id
+        self.model_id = model_id
+        self.session_id: Optional[str] = None

     @property
     def datapoint(self) -> Image:
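The manager now stamps provenance onto everything it creates. A hedged construction sketch (identifier values illustrative; in practice the owning `PipelineComponent` builds the manager):

    manager = DatapointManager(service_id="image_layout_service", model_id="layout_d2_model")
    manager.session_id = "run-2024-001"  # optional, defaults to None
    # every annotation created below now carries service_id, model_id and session_id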
@@ -55,7 +58,7 @@ class DatapointManager:
         """
         if self._datapoint is not None:
             return self._datapoint
-        raise ValueError("no datapoint passed")
+        raise ValueError("No datapoint passed")

     @datapoint.setter
     def datapoint(self, dp: Image) -> None:
@@ -154,6 +157,9 @@ class DatapointManager:
             bounding_box=box,
             category_id=str(detect_result.class_id),
             score=detect_result.score,
+            service_id=self.service_id,
+            model_id=self.model_id,
+            session_id=self.session_id,
         )
         if to_annotation_id is not None:
             parent_ann = self._cache_anns[to_annotation_id]
@@ -208,7 +214,14 @@ class DatapointManager:
                 "annotation_id": annotation_id,
             },
         ) as annotation_context:
-            cat_ann = CategoryAnnotation(category_name=category_name, category_id=str(category_id), score=score)
+            cat_ann = CategoryAnnotation(
+                category_name=category_name,
+                category_id=str(category_id),
+                score=score,
+                service_id=self.service_id,
+                model_id=self.model_id,
+                session_id=self.session_id,
+            )
             self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cat_ann)
         if annotation_context.context_error:
             return None
@@ -246,7 +259,13 @@ class DatapointManager:
             },
         ) as annotation_context:
             cont_ann = ContainerAnnotation(
-                category_name=category_name, category_id=str(category_id), value=value, score=score
+                category_name=category_name,
+                category_id=str(category_id),
+                value=value,
+                score=score,
+                service_id=self.service_id,
+                model_id=self.model_id,
+                session_id=self.session_id,
             )
             self._cache_anns[annotation_id].dump_sub_category(sub_cat_key, cont_ann)
         if annotation_context.context_error:
@@ -257,7 +276,7 @@ class DatapointManager:
         self,
         summary_key: ObjectTypes,
         summary_name: ObjectTypes,
-        summary_number: int,
+        summary_number: Optional[int] = None,
         summary_value: Optional[str] = None,
         summary_score: Optional[float] = None,
         annotation_id: Optional[str] = None,
@@ -294,16 +313,24 @@ class DatapointManager:
                 "annotation_id": annotation_id,
             },
         ) as annotation_context:
-            if summary_value:
+            if summary_value is not None:
                 ann = ContainerAnnotation(
                     category_name=summary_name,
-                    category_id=str(summary_number),
+                    category_id=str(summary_number) if summary_number is not None else "",
                     value=summary_value,
                     score=summary_score,
+                    service_id=self.service_id,
+                    model_id=self.model_id,
+                    session_id=self.session_id,
                 )
             else:
                 ann = CategoryAnnotation(
-                    category_name=summary_name, category_id=str(summary_number), score=summary_score
+                    category_name=summary_name,
+                    category_id=str(summary_number) if summary_number is not None else "",
+                    score=summary_score,
+                    service_id=self.service_id,
+                    model_id=self.model_id,
+                    session_id=self.session_id,
                 )
             image.summary.dump_sub_category(summary_key, ann, image.image_id)
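Taken together, the summary writer now selects its branch on `summary_value` alone, and `summary_number` has become optional. A hedged sketch of the two branches, assuming the enclosing method is exposed as `set_summary_annotation` (its name is not visible in this hunk):

    # value present -> ContainerAnnotation; category_id falls back to "" without a summary_number
    manager.set_summary_annotation(PageType.language, PageType.language, summary_value="eng", summary_score=0.98)

    # no value -> CategoryAnnotation identified by name and number only
    manager.set_summary_annotation(PageType.document_type, PageType.document_type, summary_number=3)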