deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py (new file)

@@ -0,0 +1,225 @@
+ # -*- coding: utf-8 -*-
+ # File: hfml.py
+
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Wrapper for the Hugging Face Language Model for sequence and token classification
+ """
+ from __future__ import annotations
+
+ from abc import ABC
+ from copy import copy
+ from pathlib import Path
+ from typing import Any, List, Literal, Mapping, Optional, Tuple, Union
+
+ from lazy_imports import try_import
+
+ from ..utils.detection_types import JsonDict, Requirement
+ from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
+ from ..utils.settings import TypeOrStr
+ from .base import LMSequenceClassifier, SequenceClassResult
+ from .hflayoutlm import get_tokenizer_from_model_class
+ from .pt.ptutils import get_torch_device
+
+ with try_import() as pt_import_guard:
+     import torch
+     import torch.nn.functional as F
+
+ with try_import() as tr_import_guard:
+     from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+
+
+ def predict_sequence_classes(
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     model: Union[XLMRobertaForSequenceClassification],
+ ) -> SequenceClassResult:
+     """
+     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
+     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
+     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
+     :param model: layoutlm model for sequence classification
+     :return: SequenceClassResult
+     """
+
+     outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+     score = torch.max(F.softmax(outputs.logits)).tolist()
+     sequence_class_predictions = outputs.logits.argmax(-1).squeeze().tolist()
+
+     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score)) # type: ignore
+
+
+ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
+     """
+     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
+     """
+
+     model: Union[XLMRobertaForSequenceClassification]
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         self.path_config = path_config_json
+         self.path_weights = path_weights
+         self.categories = copy(categories) # type: ignore
+
+         self.device = get_torch_device(device)
+         self.model.to(self.device)
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
+
+     @classmethod
+     def get_requirements(cls) -> List[Requirement]:
+         return [get_pytorch_requirement(), get_transformers_requirement()]
+
+     def clone(self) -> HFLmSequenceClassifierBase:
+         return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
+
+     def _validate_encodings(
+         self, **encodings: Union[List[List[str]], torch.Tensor]
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         input_ids = encodings.get("input_ids")
+         attention_mask = encodings.get("attention_mask")
+         token_type_ids = encodings.get("token_type_ids")
+
+         if isinstance(input_ids, torch.Tensor):
+             input_ids = input_ids.to(self.device)
+         else:
+             raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+         if isinstance(attention_mask, torch.Tensor):
+             attention_mask = attention_mask.to(self.device)
+         else:
+             raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+         if isinstance(token_type_ids, torch.Tensor):
+             token_type_ids = token_type_ids.to(self.device)
+         else:
+             raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
+         token_type_ids = token_type_ids.to(self.device)
+         return input_ids, attention_mask, token_type_ids
+
+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_lm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_lm_features"
+
+
+ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
+     """
+     A wrapper class for `transformers.XLMRobertaForSequenceClassification` and similar models to use within a pipeline
+     component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+     model itself.
+     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and token classifier
+         tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+         roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                          categories=["handwritten", "presentation", "resume"])
+
+         # token classification service
+         roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = True,
+     ):
+         self.name = self.get_name(path_weights, "bert-like")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
+
+         result = predict_sequence_classes(
+             input_ids,
+             attention_mask,
+             token_type_ids,
+             self.model,
+         )
+
+         result.class_id += 1
+         result.class_name = self.categories[str(result.class_id)]
+         return result
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return XLMRobertaForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+
+     @staticmethod
+     def default_kwargs_for_input_mapping() -> JsonDict:
+         """
+         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
+         for some custom setting.
+         """
+         return {}
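
The heart of the new module is `predict_sequence_classes`, which reduces the classifier head's logits to a single label id and confidence. A minimal sketch of that reduction with a stand-in logits tensor (not part of the diff; `dim=-1` is written out here, where the code above leaves softmax's dimension implicit):

    import torch
    import torch.nn.functional as F

    # Stand-in for model(...).logits with shape (1, num_classes)
    logits = torch.tensor([[1.2, 0.3, -0.5]])

    score = torch.max(F.softmax(logits, dim=-1)).item()  # probability of the best class
    class_id = int(logits.argmax(-1).squeeze())          # 0-based index of the best class

    # deepdoctection category ids are 1-based strings, which is why
    # HFLmSequenceClassifier.predict shifts the id before looking up the class name.
    category_key = str(class_id + 1)
    print(category_key, round(score, 3))  # "1" 0.629
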
deepdoctection/extern/model.py

@@ -185,25 +185,6 @@ class ModelCatalog:
          dl_library="TF",
          model_wrapper="TPFrcnnDetector",
      ),
-     "layout/d2_model-800000-layout.pkl": ModelProfile(
-         name="layout/d2_model-800000-layout.pkl",
-         description="Detectron2 layout detection model trained on Publaynet",
-         config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274568239],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
-         hf_model_name="d2_model-800000-layout.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={
-             "1": LayoutType.text,
-             "2": LayoutType.title,
-             "3": LayoutType.list,
-             "4": LayoutType.table,
-             "5": LayoutType.figure,
-         },
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
          name="layout/d2_model_0829999_layout_inf_only.pt",
          description="Detectron2 layout detection model trained on Publaynet",
@@ -261,19 +242,6 @@ class ModelCatalog:
          dl_library="PT",
          model_wrapper="D2FrcnnTracingDetector",
      ),
-     "cell/d2_model-1800000-cell.pkl": ModelProfile(
-         name="cell/d2_model-1800000-cell.pkl",
-         description="Detectron2 cell detection inference only model trained on Pubtabnet",
-         config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274519039],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
-         hf_model_name="d2_model-1800000-cell.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={"1": LayoutType.cell},
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "cell/d2_model_1849999_cell_inf_only.pt": ModelProfile(
          name="cell/d2_model_1849999_cell_inf_only.pt",
          description="Detectron2 cell detection inference only model trained on Pubtabnet",
@@ -313,19 +281,6 @@
          dl_library="PT",
          model_wrapper="D2FrcnnDetector",
      ),
-     "item/d2_model-1620000-item.pkl": ModelProfile(
-         name="item/d2_model-1620000-item.pkl",
-         description="Detectron2 item detection inference only model trained on Pubtabnet",
-         config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274531339],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
-         hf_model_name="d2_model-1620000-item.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={"1": LayoutType.row, "2": LayoutType.column},
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "item/d2_model_1639999_item.pth": ModelProfile(
          name="item/d2_model_1639999_item.pth",
          description="Detectron2 item detection model trained on Pubtabnet",
@@ -365,6 +320,45 @@
          dl_library="PT",
          model_wrapper="D2FrcnnTracingDetector",
      ),
+     "nielsr/lilt-xlm-roberta-base/pytorch_model.bin": ModelProfile(
+         name="nielsr/lilt-xlm-roberta-base/pytorch_model.bin",
+         description="LiLT build with a RobertaXLM base model",
+         config="nielsr/lilt-xlm-roberta-base/config.json",
+         size=[1136743583],
+         tp_model=False,
+         hf_repo_id="nielsr/lilt-xlm-roberta-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
+     "SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin": ModelProfile(
+         name="SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin",
+         description="Language-Independent Layout Transformer - InfoXLM model by stitching a pre-trained InfoXLM"
+         " and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was introduced"
+         " in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer for"
+         " Structured Document Understanding by Wang et al. and first released in this repository.",
+         config="SCUT-DLVCLab/lilt-infoxlm-base/config.json",
+         size=[1136743583],
+         tp_model=False,
+         hf_repo_id="SCUT-DLVCLab/lilt-infoxlm-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
+     "SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin": ModelProfile(
+         name="SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin",
+         description="Language-Independent Layout Transformer - RoBERTa model by stitching a pre-trained RoBERTa"
+         " (English) and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was"
+         " introduced in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer"
+         " for Structured Document Understanding by Wang et al. and first released in this repository.",
+         config="SCUT-DLVCLab/lilt-roberta-en-base/config.json",
+         size=[523151519],
+         tp_model=False,
+         hf_repo_id="SCUT-DLVCLab/lilt-roberta-en-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
      "microsoft/layoutlm-base-uncased/pytorch_model.bin": ModelProfile(
          name="microsoft/layoutlm-base-uncased/pytorch_model.bin",
          description="LayoutLM is a simple but effective pre-training method of text and layout for document image"
@@ -535,6 +529,19 @@
          model_wrapper="DoctrTextRecognizer",
          architecture="crnn_vgg16_bn",
      ),
+     "FacebookAI/xlm-roberta-base": ModelProfile(
+         name="FacebookAI/xlm-roberta-base/pytorch_model.bin",
+         description="XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages."
+         " It was introduced in the paper Unsupervised Cross-lingual Representation Learning at Scale"
+         " by Conneau et al. and first released in this repository.",
+         size=[1115590446],
+         tp_model=False,
+         config="FacebookAI/xlm-roberta-base/config.json",
+         hf_repo_id="FacebookAI/xlm-roberta-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
      "fasttext/lid.176.bin": ModelProfile(
          name="fasttext/lid.176.bin",
          description="Fasttext language detection model",
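
Both the LiLT profiles and the XLM-R profile above are addressed through the catalog by their full artifact name. A hedged usage sketch (`ModelCatalog.get_profile` is existing deepdoctection API; the output comments mirror the profile fields above):

    from deepdoctection.extern.model import ModelCatalog

    profile = ModelCatalog.get_profile("SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin")
    print(profile.hf_repo_id)   # SCUT-DLVCLab/lilt-roberta-en-base
    print(profile.dl_library)   # PT
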
@@ -980,9 +987,11 @@ class ModelDownloadManager:
          else:
              file_names.append(model_name)
          if profile.hf_repo_id:
-             ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
+             if not os.path.isfile(absolute_path_weights):
+                 ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
              absolute_path_configs = ModelCatalog.get_full_path_configs(name)
-             ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
+             if not os.path.isfile(absolute_path_configs):
+                 ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
          else:
              ModelDownloadManager._load_from_gd(profile, absolute_path_weights, file_names)
  
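
With the two `os.path.isfile` guards, fetching a catalogued model becomes idempotent. A sketch of the resulting behaviour, assuming `maybe_download_weights_and_configs` remains the public entry point into this code path (as in current deepdoctection releases):

    from deepdoctection.extern.model import ModelDownloadManager

    # The first call downloads weights and configs from the Hugging Face hub;
    # the second finds both files on disk and skips the hub entirely.
    path = ModelDownloadManager.maybe_download_weights_and_configs("nielsr/lilt-xlm-roberta-base/pytorch_model.bin")
    path = ModelDownloadManager.maybe_download_weights_and_configs("nielsr/lilt-xlm-roberta-base/pytorch_model.bin")
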
deepdoctection/extern/pdftext.py

@@ -21,13 +21,15 @@ PDFPlumber text extraction engine
  
  from typing import Dict, List, Tuple
  
+ from lazy_imports import try_import
+
  from ..utils.context import save_tmp_file
  from ..utils.detection_types import Requirement
- from ..utils.file_utils import get_pdfplumber_requirement, pdfplumber_available
+ from ..utils.file_utils import get_pdfplumber_requirement
  from ..utils.settings import LayoutType, ObjectTypes
  from .base import DetectionResult, PdfMiner
  
- if pdfplumber_available():
+ with try_import() as import_guard:
      from pdfplumber.pdf import PDF
  
  
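
This release swaps `pdfplumber_available()`-style checks for `lazy_imports.try_import` across the code base. The pattern in miniature, hedging on the guard API (the lazy-imports package documents a `check()` method that re-raises a captured ImportError):

    from lazy_imports import try_import

    with try_import() as import_guard:
        import pdfplumber  # optional dependency: a failed import is recorded, not raised

    try:
        import_guard.check()  # re-raises the ImportError if pdfplumber is missing
        print("pdfplumber available")
    except ImportError:
        print("pdfplumber missing; PdfPlumberTextDetector cannot be used")
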
@@ -64,9 +66,12 @@ class PdfPlumberTextDetector(PdfMiner):
  
      """
  
-     def __init__(self) -> None:
-         self.name = "pdfplumber"
+     def __init__(self, x_tolerance: int = 3, y_tolerance: int = 3) -> None:
+         self.name = "Pdfplumber"
+         self.model_id = self.get_model_id()
          self.categories = {"1": LayoutType.word}
+         self.x_tolerance = x_tolerance
+         self.y_tolerance = y_tolerance
  
      def predict(self, pdf_bytes: bytes) -> List[DetectionResult]:
          """
@@ -81,7 +86,7 @@
              _pdf = PDF(fin)
              self._page = _pdf.pages[0]
          self._pdf_bytes = pdf_bytes
-         words = self._page.extract_words()
+         words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
          detect_results = list(map(_to_detect_result, words))
          return detect_results
  
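
A short usage sketch for the new tolerances: pdfplumber merges characters whose gaps fall within `x_tolerance`/`y_tolerance` into one word, so raising them yields fewer, longer words. The file path is illustrative, and `predict` expects the bytes of a single-page PDF:

    from deepdoctection.extern.pdftext import PdfPlumberTextDetector

    pdf_miner = PdfPlumberTextDetector(x_tolerance=5, y_tolerance=3)  # looser horizontal grouping

    with open("page.pdf", "rb") as f:  # hypothetical single-page PDF
        detect_results = pdf_miner.predict(f.read())
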
deepdoctection/extern/pt/__init__.py

@@ -19,7 +19,5 @@
  Init file for pytorch compatibility package
  """
  
+ from .nms import *
  from .ptutils import *
- 
- if pytorch_available():
-     from .nms import *
deepdoctection/extern/pt/nms.py

@@ -18,9 +18,13 @@
  """
  Module for custom NMS functions.
  """
+ from __future__ import annotations
  
- import torch
- from torchvision.ops import boxes as box_ops # type: ignore
+ from lazy_imports import try_import
+
+ with try_import() as import_guard:
+     import torch
+     from torchvision.ops import boxes as box_ops # type: ignore
  
  
  # Copy & paste from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/nms.py
deepdoctection/extern/pt/ptutils.py

@@ -18,31 +18,40 @@
  """
  Torch related utils
  """
+ from __future__ import annotations
  
+ import os
+ from typing import Optional, Union
  
- from ...utils.file_utils import pytorch_available
+ from lazy_imports import try_import
  
+ with try_import() as import_guard:
+     import torch
  
- def set_torch_auto_device() -> "torch.device": # type: ignore
-     """
-     Returns cuda device if available, otherwise cpu
+
+ def get_torch_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
      """
-     if pytorch_available():
-         from torch import cuda, device # pylint: disable=C0415
+     Selecting a device on which to load a model. The selection follows a cascade of priorities:
  
-         return device("cuda" if cuda.is_available() else "cpu")
-     raise ModuleNotFoundError("Pytorch must be installed")
+     - If a device string is provided, it is used.
+     - If the environment variable "USE_CUDA" is set, a GPU is used. If more GPUs are available, it will use all of them
+       unless something else is specified by CUDA_VISIBLE_DEVICES:
  
+       https://stackoverflow.com/questions/54216920/how-to-use-multiple-gpus-in-pytorch
  
- def get_num_gpu() -> int:
-     """
-     Returns number of CUDA devices if pytorch is available
+     - If an MPS device is available, it is used.
+     - Otherwise, the CPU is used.
  
-     :return:
+     :param device: Device either as string or torch.device
+     :return: Tensorflow device
      """
-
-     if pytorch_available():
-         from torch import cuda # pylint: disable=C0415
-
-         return cuda.device_count()
-     raise ModuleNotFoundError("Pytorch must be installed")
+     if device is not None:
+         if isinstance(device, torch.device):
+             return device
+         if isinstance(device, str):
+             return torch.device(device)
+     if os.environ.get("USE_CUDA"):
+         return torch.device("cuda")
+     if os.environ.get("USE_MPS"):
+         return torch.device("mps")
+     return torch.device("cpu")
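
The cascade in `get_torch_device`, exercised end to end in a sketch that mirrors the code above (explicit argument first, then the `USE_CUDA`/`USE_MPS` environment variables, then CPU):

    import os

    import torch

    from deepdoctection.extern.pt.ptutils import get_torch_device

    print(get_torch_device("cuda:0"))             # explicit string wins -> device(type='cuda', index=0)
    print(get_torch_device(torch.device("cpu")))  # a torch.device passes through unchanged

    os.environ["USE_CUDA"] = "1"
    print(get_torch_device())                     # env hint -> device(type='cuda')

    del os.environ["USE_CUDA"]
    print(get_torch_device())                     # nothing set -> device(type='cpu')
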