deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
Wrapper for the
|
|
19
|
+
Wrapper for the HF Language Model for sequence and token classification
|
|
20
20
|
"""
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
@@ -48,11 +48,14 @@ def predict_sequence_classes(
|
|
|
48
48
|
model: Union[XLMRobertaForSequenceClassification],
|
|
49
49
|
) -> SequenceClassResult:
|
|
50
50
|
"""
|
|
51
|
-
:
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
51
|
+
Args:
|
|
52
|
+
input_ids: Token converted to ids to be taken from `XLMRobertaTokenizer`
|
|
53
|
+
attention_mask: The associated attention masks from padded sequences taken from `XLMRobertaTokenizer`
|
|
54
|
+
token_type_ids: Torch tensor of token type ids taken from `XLMRobertaTokenizer`
|
|
55
|
+
model: `XLMRobertaForSequenceClassification` model for sequence classification
|
|
56
|
+
|
|
57
|
+
Returns:
|
|
58
|
+
`SequenceClassResult`
|
|
56
59
|
"""
|
|
57
60
|
|
|
58
61
|
outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
|
|
@@ -65,7 +68,7 @@ def predict_sequence_classes(
|
|
|
65
68
|
|
|
66
69
|
class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
67
70
|
"""
|
|
68
|
-
Abstract base class for wrapping Bert-type models
|
|
71
|
+
Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
|
|
69
72
|
"""
|
|
70
73
|
|
|
71
74
|
def __init__(
|
|
@@ -115,27 +118,51 @@ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
|
115
118
|
|
|
116
119
|
@staticmethod
|
|
117
120
|
def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
|
|
118
|
-
"""
|
|
121
|
+
"""
|
|
122
|
+
Returns the name of the model
|
|
123
|
+
|
|
124
|
+
Args:
|
|
125
|
+
path_weights: Path to model weights
|
|
126
|
+
architecture: Architecture name
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
str: Model name
|
|
130
|
+
"""
|
|
119
131
|
return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
|
|
120
132
|
|
|
121
133
|
@staticmethod
|
|
122
134
|
def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
|
|
123
|
-
"""
|
|
135
|
+
"""
|
|
136
|
+
A refinement for adding the tokenizer class name to the model configs.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
model_class_name: The model name, e.g. `model.__class__.__name__`
|
|
140
|
+
use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
|
|
124
141
|
|
|
125
|
-
:
|
|
126
|
-
|
|
142
|
+
Returns:
|
|
143
|
+
str: Tokenizer class name
|
|
127
144
|
"""
|
|
128
145
|
tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
|
|
129
146
|
return tokenizer.__class__.__name__
|
|
130
147
|
|
|
131
148
|
@staticmethod
|
|
132
149
|
def image_to_raw_features_mapping() -> str:
|
|
133
|
-
"""
|
|
150
|
+
"""
|
|
151
|
+
Returns the mapping function to convert images into raw features.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
str: Name of the mapping function
|
|
155
|
+
"""
|
|
134
156
|
return "image_to_raw_lm_features"
|
|
135
157
|
|
|
136
158
|
@staticmethod
|
|
137
159
|
def image_to_features_mapping() -> str:
|
|
138
|
-
"""
|
|
160
|
+
"""
|
|
161
|
+
Returns the mapping function to convert images into features.
|
|
162
|
+
|
|
163
|
+
Returns:
|
|
164
|
+
str: Name of the mapping function
|
|
165
|
+
"""
|
|
139
166
|
return "image_to_lm_features"
|
|
140
167
|
|
|
141
168
|
|
|
@@ -147,28 +174,29 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
147
174
|
Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
|
|
148
175
|
classification and other things please use another model of the family.
|
|
149
176
|
|
|
150
|
-
|
|
177
|
+
Example:
|
|
178
|
+
```python
|
|
179
|
+
# setting up compulsory ocr service
|
|
180
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
181
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
182
|
+
ocr_service = TextExtractionService(tess)
|
|
151
183
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
184
|
+
# hf tokenizer and token classifier
|
|
185
|
+
tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
|
|
186
|
+
roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
187
|
+
categories=["handwritten", "presentation", "resume"])
|
|
156
188
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
160
|
-
categories=["handwritten", "presentation", "resume"])
|
|
189
|
+
# token classification service
|
|
190
|
+
roberta_service = LMSequenceClassifierService(tokenizer,roberta)
|
|
161
191
|
|
|
162
|
-
|
|
163
|
-
roberta_service = LMSequenceClassifierService(tokenizer,roberta)
|
|
192
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
|
|
164
193
|
|
|
165
|
-
|
|
194
|
+
path = "path/to/some/form"
|
|
195
|
+
df = pipe.analyze(path=path)
|
|
166
196
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
for dp in df:
|
|
171
|
-
...
|
|
197
|
+
for dp in df:
|
|
198
|
+
...
|
|
199
|
+
```
|
|
172
200
|
"""
|
|
173
201
|
|
|
174
202
|
def __init__(
|
|
@@ -209,9 +237,12 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
209
237
|
"""
|
|
210
238
|
Get the inner (wrapped) model.
|
|
211
239
|
|
|
212
|
-
:
|
|
213
|
-
|
|
214
|
-
|
|
240
|
+
Args:
|
|
241
|
+
path_config_json: path to .json config file
|
|
242
|
+
path_weights: path to model artifact
|
|
243
|
+
|
|
244
|
+
Returns:
|
|
245
|
+
`XLMRobertaForSequenceClassification`
|
|
215
246
|
"""
|
|
216
247
|
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
|
|
217
248
|
return XLMRobertaForSequenceClassification.from_pretrained(
|
|
@@ -223,6 +254,9 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
|
|
|
223
254
|
"""
|
|
224
255
|
Add some default arguments that might be necessary when preparing a sample. Overwrite this method
|
|
225
256
|
for some custom setting.
|
|
257
|
+
|
|
258
|
+
Returns:
|
|
259
|
+
JsonDict: Dictionary with default arguments
|
|
226
260
|
"""
|
|
227
261
|
return {}
|
|
228
262
|
|