deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py
ADDED

@@ -0,0 +1,225 @@
+# -*- coding: utf-8 -*-
+# File: hfml.py
+
+# Copyright 2024 Dr. Janis Meyer. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Wrapper for the Hugging Face Language Model for sequence and token classification
+"""
+from __future__ import annotations
+
+from abc import ABC
+from copy import copy
+from pathlib import Path
+from typing import Any, List, Literal, Mapping, Optional, Tuple, Union
+
+from lazy_imports import try_import
+
+from ..utils.detection_types import JsonDict, Requirement
+from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
+from ..utils.settings import TypeOrStr
+from .base import LMSequenceClassifier, SequenceClassResult
+from .hflayoutlm import get_tokenizer_from_model_class
+from .pt.ptutils import get_torch_device
+
+with try_import() as pt_import_guard:
+    import torch
+    import torch.nn.functional as F
+
+with try_import() as tr_import_guard:
+    from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+
+
+def predict_sequence_classes(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    model: Union[XLMRobertaForSequenceClassification],
+) -> SequenceClassResult:
+    """
+    :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
+    :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
+    :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
+    :param model: layoutlm model for sequence classification
+    :return: SequenceClassResult
+    """
+
+    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+    score = torch.max(F.softmax(outputs.logits)).tolist()
+    sequence_class_predictions = outputs.logits.argmax(-1).squeeze().tolist()
+
+    return SequenceClassResult(class_id=sequence_class_predictions, score=float(score))  # type: ignore
+
+
+class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
+    """
+    Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
+    """
+
+    model: Union[XLMRobertaForSequenceClassification]
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories: Mapping[str, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        self.path_config = path_config_json
+        self.path_weights = path_weights
+        self.categories = copy(categories)  # type: ignore
+
+        self.device = get_torch_device(device)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
+
+    @classmethod
+    def get_requirements(cls) -> List[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+
+    def clone(self) -> HFLmSequenceClassifierBase:
+        return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
+
+    def _validate_encodings(
+        self, **encodings: Union[List[List[str]], torch.Tensor]
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        input_ids = encodings.get("input_ids")
+        attention_mask = encodings.get("attention_mask")
+        token_type_ids = encodings.get("token_type_ids")
+
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.to(self.device)
+        else:
+            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+        if isinstance(attention_mask, torch.Tensor):
+            attention_mask = attention_mask.to(self.device)
+        else:
+            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+        if isinstance(token_type_ids, torch.Tensor):
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+
+        input_ids = input_ids.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        token_type_ids = token_type_ids.to(self.device)
+        return input_ids, attention_mask, token_type_ids
+
+    @staticmethod
+    def get_name(path_weights: str, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+    def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+        """A refinement for adding the tokenizer class name to the model configs.
+
+        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        """
+        tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_lm_features"
+
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_lm_features"
+
+
+class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
+    """
+    A wrapper class for `transformers.XLMRobertaForSequenceClassification` and similar models to use within a pipeline
+    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+    model itself.
+    Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+    classification and other things please use another model of the family.
+
+    **Example**
+
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                         categories=["handwritten", "presentation", "resume"])
+
+        # token classification service
+        roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+
+        for dp in df:
+            ...
+    """
+
+    def __init__(
+        self,
+        path_config_json: str,
+        path_weights: str,
+        categories: Mapping[str, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        self.name = self.get_name(path_weights, "bert-like")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+    def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+        input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
+
+        result = predict_sequence_classes(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            self.model,
+        )
+
+        result.class_id += 1
+        result.class_name = self.categories[str(result.class_id)]
+        return result
+
+    @staticmethod
+    def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+        """
+        Get the inner (wrapped) model.
+
+        :param path_config_json: path to .json config file
+        :param path_weights: path to model artifact
+        :return: 'nn.Module'
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return XLMRobertaForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+
+    @staticmethod
+    def default_kwargs_for_input_mapping() -> JsonDict:
+        """
+        Add some default arguments that might be necessary when preparing a sample. Overwrite this method
+        for some custom setting.
+        """
+        return {}
deepdoctection/extern/model.py
CHANGED

@@ -185,25 +185,6 @@ class ModelCatalog:
             dl_library="TF",
             model_wrapper="TPFrcnnDetector",
         ),
-        "layout/d2_model-800000-layout.pkl": ModelProfile(
-            name="layout/d2_model-800000-layout.pkl",
-            description="Detectron2 layout detection model trained on Publaynet",
-            config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[274568239],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
-            hf_model_name="d2_model-800000-layout.pkl",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={
-                "1": LayoutType.text,
-                "2": LayoutType.title,
-                "3": LayoutType.list,
-                "4": LayoutType.table,
-                "5": LayoutType.figure,
-            },
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
             name="layout/d2_model_0829999_layout_inf_only.pt",
             description="Detectron2 layout detection model trained on Publaynet",

@@ -261,19 +242,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnTracingDetector",
         ),
-        "cell/d2_model-1800000-cell.pkl": ModelProfile(
-            name="cell/d2_model-1800000-cell.pkl",
-            description="Detectron2 cell detection inference only model trained on Pubtabnet",
-            config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[274519039],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
-            hf_model_name="d2_model-1800000-cell.pkl",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={"1": LayoutType.cell},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "cell/d2_model_1849999_cell_inf_only.pt": ModelProfile(
             name="cell/d2_model_1849999_cell_inf_only.pt",
             description="Detectron2 cell detection inference only model trained on Pubtabnet",

@@ -313,19 +281,6 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnDetector",
         ),
-        "item/d2_model-1620000-item.pkl": ModelProfile(
-            name="item/d2_model-1620000-item.pkl",
-            description="Detectron2 item detection inference only model trained on Pubtabnet",
-            config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
-            size=[274531339],
-            tp_model=False,
-            hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
-            hf_model_name="d2_model-1620000-item.pkl",
-            hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-            categories={"1": LayoutType.row, "2": LayoutType.column},
-            dl_library="PT",
-            model_wrapper="D2FrcnnDetector",
-        ),
         "item/d2_model_1639999_item.pth": ModelProfile(
             name="item/d2_model_1639999_item.pth",
             description="Detectron2 item detection model trained on Pubtabnet",

@@ -365,6 +320,45 @@ class ModelCatalog:
             dl_library="PT",
             model_wrapper="D2FrcnnTracingDetector",
         ),
+        "nielsr/lilt-xlm-roberta-base/pytorch_model.bin": ModelProfile(
+            name="nielsr/lilt-xlm-roberta-base/pytorch_model.bin",
+            description="LiLT build with a RobertaXLM base model",
+            config="nielsr/lilt-xlm-roberta-base/config.json",
+            size=[1136743583],
+            tp_model=False,
+            hf_repo_id="nielsr/lilt-xlm-roberta-base",
+            hf_model_name="pytorch_model.bin",
+            hf_config_file=["config.json"],
+            dl_library="PT",
+        ),
+        "SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin": ModelProfile(
+            name="SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin",
+            description="Language-Independent Layout Transformer - InfoXLM model by stitching a pre-trained InfoXLM"
+            " and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was introduced"
+            " in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer for"
+            " Structured Document Understanding by Wang et al. and first released in this repository.",
+            config="SCUT-DLVCLab/lilt-infoxlm-base/config.json",
+            size=[1136743583],
+            tp_model=False,
+            hf_repo_id="SCUT-DLVCLab/lilt-infoxlm-base",
+            hf_model_name="pytorch_model.bin",
+            hf_config_file=["config.json"],
+            dl_library="PT",
+        ),
+        "SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin": ModelProfile(
+            name="SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin",
+            description="Language-Independent Layout Transformer - RoBERTa model by stitching a pre-trained RoBERTa"
+            " (English) and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was"
+            " introduced in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer"
+            " for Structured Document Understanding by Wang et al. and first released in this repository.",
+            config="SCUT-DLVCLab/lilt-roberta-en-base/config.json",
+            size=[523151519],
+            tp_model=False,
+            hf_repo_id="SCUT-DLVCLab/lilt-roberta-en-base",
+            hf_model_name="pytorch_model.bin",
+            hf_config_file=["config.json"],
+            dl_library="PT",
+        ),
         "microsoft/layoutlm-base-uncased/pytorch_model.bin": ModelProfile(
             name="microsoft/layoutlm-base-uncased/pytorch_model.bin",
             description="LayoutLM is a simple but effective pre-training method of text and layout for document image"

@@ -535,6 +529,19 @@ class ModelCatalog:
             model_wrapper="DoctrTextRecognizer",
             architecture="crnn_vgg16_bn",
         ),
+        "FacebookAI/xlm-roberta-base": ModelProfile(
+            name="FacebookAI/xlm-roberta-base/pytorch_model.bin",
+            description="XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages."
+            " It was introduced in the paper Unsupervised Cross-lingual Representation Learning at Scale"
+            " by Conneau et al. and first released in this repository.",
+            size=[1115590446],
+            tp_model=False,
+            config="FacebookAI/xlm-roberta-base/config.json",
+            hf_repo_id="FacebookAI/xlm-roberta-base",
+            hf_model_name="pytorch_model.bin",
+            hf_config_file=["config.json"],
+            dl_library="PT",
+        ),
         "fasttext/lid.176.bin": ModelProfile(
             name="fasttext/lid.176.bin",
             description="Fasttext language detection model",

@@ -980,9 +987,11 @@ class ModelDownloadManager:
         else:
             file_names.append(model_name)
         if profile.hf_repo_id:
-            ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
+            if not os.path.isfile(absolute_path_weights):
+                ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
             absolute_path_configs = ModelCatalog.get_full_path_configs(name)
-            ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
+            if not os.path.isfile(absolute_path_configs):
+                ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
         else:
             ModelDownloadManager._load_from_gd(profile, absolute_path_weights, file_names)

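Net effect of the catalog changes: three LiLT checkpoints and an XLM-RoBERTa base profile become resolvable by name, the superseded Detectron2 `.pkl` profiles are removed, and `ModelDownloadManager` now skips downloads when weights or configs already sit in the local cache. A short sketch of resolving one of the new profiles, using deepdoctection's public catalog API:

    from deepdoctection.extern.model import ModelCatalog, ModelDownloadManager

    profile = ModelCatalog.get_profile("SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin")
    print(profile.hf_repo_id)  # SCUT-DLVCLab/lilt-roberta-en-base

    # Fetches weights and config from the HF hub into the cache; since this
    # release, files that already exist on disk are not downloaded again.
    path_weights = ModelDownloadManager.maybe_download_weights_and_configs(
        "SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin"
    )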
deepdoctection/extern/pdftext.py
CHANGED

@@ -21,13 +21,15 @@ PDFPlumber text extraction engine

 from typing import Dict, List, Tuple

+from lazy_imports import try_import
+
 from ..utils.context import save_tmp_file
 from ..utils.detection_types import Requirement
-from ..utils.file_utils import get_pdfplumber_requirement
+from ..utils.file_utils import get_pdfplumber_requirement
 from ..utils.settings import LayoutType, ObjectTypes
 from .base import DetectionResult, PdfMiner

-
+with try_import() as import_guard:
     from pdfplumber.pdf import PDF


@@ -64,9 +66,12 @@ class PdfPlumberTextDetector(PdfMiner):

     """

-    def __init__(self) -> None:
-        self.name = "
+    def __init__(self, x_tolerance: int = 3, y_tolerance: int = 3) -> None:
+        self.name = "Pdfplumber"
+        self.model_id = self.get_model_id()
         self.categories = {"1": LayoutType.word}
+        self.x_tolerance = x_tolerance
+        self.y_tolerance = y_tolerance

     def predict(self, pdf_bytes: bytes) -> List[DetectionResult]:
         """

@@ -81,7 +86,7 @@ class PdfPlumberTextDetector(PdfMiner):
             _pdf = PDF(fin)
             self._page = _pdf.pages[0]
             self._pdf_bytes = pdf_bytes
-            words = self._page.extract_words()
+            words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
             detect_results = list(map(_to_detect_result, words))
             return detect_results

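The new constructor arguments are passed straight through to pdfplumber's `extract_words`, so word segmentation can now be tuned per document; the defaults of 3 match pdfplumber's own. A small sketch, with an illustrative file path:

    from deepdoctection.extern.pdftext import PdfPlumberTextDetector

    # Larger tolerances merge glyphs into words more aggressively.
    detector = PdfPlumberTextDetector(x_tolerance=5, y_tolerance=8)

    with open("page.pdf", "rb") as f:  # bytes of a single PDF page
        words = detector.predict(f.read())  # List[DetectionResult]

    for word in words:
        print(word.text, word.box)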
deepdoctection/extern/pt/nms.py
CHANGED

@@ -18,9 +18,13 @@
 """
 Module for custom NMS functions.
 """
+from __future__ import annotations

-import
-
+from lazy_imports import try_import
+
+with try_import() as import_guard:
+    import torch
+    from torchvision.ops import boxes as box_ops  # type: ignore


 # Copy & paste from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/nms.py

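The same `lazy_imports.try_import` pattern recurs throughout this release: the context manager swallows the `ImportError` at module import time, so `import deepdoctection` succeeds without torch or torchvision, and the missing dependency only surfaces when it is actually needed. A sketch of the mechanism, assuming the `check()` helper documented by the lazy-imports package:

    from lazy_imports import try_import

    with try_import() as torch_guard:
        import torch  # ImportError is captured, not raised, if torch is absent

    def torch_is_available() -> bool:
        try:
            torch_guard.check()  # re-raises the deferred ImportError, if any
            return True
        except ImportError:
            return False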
deepdoctection/extern/pt/ptutils.py
CHANGED

@@ -18,31 +18,40 @@
 """
 Torch related utils
 """
+from __future__ import annotations

+import os
+from typing import Optional, Union

-from
+from lazy_imports import try_import

+with try_import() as import_guard:
+    import torch

-
-
-    Returns cuda device if available, otherwise cpu
+
+def get_torch_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
     """
-
-    from torch import cuda, device  # pylint: disable=C0415
+    Selecting a device on which to load a model. The selection follows a cascade of priorities:

-
-
+    - If a device string is provided, it is used.
+    - If the environment variable "USE_CUDA" is set, a GPU is used. If more GPUs are available, it will use all of them
+      unless something else is specified by CUDA_VISIBLE_DEVICES:

+      https://stackoverflow.com/questions/54216920/how-to-use-multiple-gpus-in-pytorch

-
-
-    Returns number of CUDA devices if pytorch is available
+    - If an MPS device is available, it is used.
+    - Otherwise, the CPU is used.

-    :
+    :param device: Device either as string or torch.device
+    :return: Tensorflow device
     """
-
-
-
-
-
-
+    if device is not None:
+        if isinstance(device, torch.device):
+            return device
+        if isinstance(device, str):
+            return torch.device(device)
+    if os.environ.get("USE_CUDA"):
+        return torch.device("cuda")
+    if os.environ.get("USE_MPS"):
+        return torch.device("mps")
+    return torch.device("cpu")