deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
@@ -21,16 +21,15 @@ Wrapper for the Hugging Face Language Model for sequence and token classificati
21
21
  from __future__ import annotations
22
22
 
23
23
  from abc import ABC
24
- from copy import copy
25
24
  from pathlib import Path
26
- from typing import Any, List, Literal, Mapping, Optional, Tuple, Union
25
+ from typing import Literal, Mapping, Optional, Union
27
26
 
28
27
  from lazy_imports import try_import
29
28
 
30
- from ..utils.detection_types import JsonDict, Requirement
31
29
  from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
32
30
  from ..utils.settings import TypeOrStr
33
- from .base import LMSequenceClassifier, SequenceClassResult
31
+ from ..utils.types import JsonDict, PathLikeOrStr, Requirement
32
+ from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
34
33
  from .hflayoutlm import get_tokenizer_from_model_class
35
34
  from .pt.ptutils import get_torch_device
36
35
 
@@ -69,34 +68,29 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
69
68
  Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
70
69
  """
71
70
 
72
- model: Union[XLMRobertaForSequenceClassification]
73
-
74
71
  def __init__(
75
72
  self,
76
- path_config_json: str,
77
- path_weights: str,
78
- categories: Mapping[str, TypeOrStr],
73
+ path_config_json: PathLikeOrStr,
74
+ path_weights: PathLikeOrStr,
75
+ categories: Mapping[int, TypeOrStr],
79
76
  device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
80
- use_xlm_tokenizer: bool = False,
81
77
  ):
82
- self.path_config = path_config_json
83
- self.path_weights = path_weights
84
- self.categories = copy(categories) # type: ignore
78
+ self.path_config = Path(path_config_json)
79
+ self.path_weights = Path(path_weights)
80
+ self.categories = ModelCategories(init_categories=categories)
85
81
 
86
82
  self.device = get_torch_device(device)
87
- self.model.to(self.device)
88
- self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
89
83
 
90
84
  @classmethod
91
- def get_requirements(cls) -> List[Requirement]:
85
+ def get_requirements(cls) -> list[Requirement]:
92
86
  return [get_pytorch_requirement(), get_transformers_requirement()]
93
87
 
94
88
  def clone(self) -> HFLmSequenceClassifierBase:
95
- return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
89
+ return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
96
90
 
97
91
  def _validate_encodings(
98
- self, **encodings: Union[List[List[str]], torch.Tensor]
99
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
92
+ self, **encodings: Union[list[list[str]], torch.Tensor]
93
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
100
94
  input_ids = encodings.get("input_ids")
101
95
  attention_mask = encodings.get("attention_mask")
102
96
  token_type_ids = encodings.get("token_type_ids")
@@ -120,16 +114,18 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
120
114
  return input_ids, attention_mask, token_type_ids
121
115
 
122
116
  @staticmethod
123
- def get_name(path_weights: str, architecture: str) -> str:
117
+ def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
124
118
  """Returns the name of the model"""
125
119
  return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
126
120
 
127
- def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
121
+ @staticmethod
122
+ def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
128
123
  """A refinement for adding the tokenizer class name to the model configs.
129
124
 
125
+ :param model_class_name: The model name, e.g. model.__class__.__name__
130
126
  :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
131
127
  """
132
- tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
128
+ tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
133
129
  return tokenizer.__class__.__name__
134
130
 
135
131
  @staticmethod
@@ -177,18 +173,22 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
177
173
 
178
174
  def __init__(
179
175
  self,
180
- path_config_json: str,
181
- path_weights: str,
182
- categories: Mapping[str, TypeOrStr],
176
+ path_config_json: PathLikeOrStr,
177
+ path_weights: PathLikeOrStr,
178
+ categories: Mapping[int, TypeOrStr],
183
179
  device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
184
180
  use_xlm_tokenizer: bool = True,
185
181
  ):
182
+ super().__init__(path_config_json, path_weights, categories, device)
186
183
  self.name = self.get_name(path_weights, "bert-like")
187
184
  self.model_id = self.get_model_id()
188
185
  self.model = self.get_wrapped_model(path_config_json, path_weights)
189
- super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
186
+ self.model.to(self.device)
187
+ self.model.config.tokenizer_class = self.get_tokenizer_class_name(
188
+ self.model.__class__.__name__, use_xlm_tokenizer
189
+ )
190
190
 
191
- def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
191
+ def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
192
192
  input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
193
193
 
194
194
  result = predict_sequence_classes(
@@ -199,11 +199,13 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
199
199
  )
200
200
 
201
201
  result.class_id += 1
202
- result.class_name = self.categories[str(result.class_id)]
202
+ result.class_name = self.categories.categories[result.class_id]
203
203
  return result
204
204
 
205
205
  @staticmethod
206
- def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
206
+ def get_wrapped_model(
207
+ path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
208
+ ) -> XLMRobertaForSequenceClassification:
207
209
  """
208
210
  Get the inner (wrapped) model.
209
211
 
@@ -217,9 +219,12 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
217
219
  )
218
220
 
219
221
  @staticmethod
220
- def default_kwargs_for_input_mapping() -> JsonDict:
222
+ def default_kwargs_for_image_to_features_mapping() -> JsonDict:
221
223
  """
222
224
  Add some default arguments that might be necessary when preparing a sample. Overwrite this method
223
225
  for some custom setting.
224
226
  """
225
227
  return {}
228
+
229
+ def clear_model(self) -> None:
230
+ self.model = None