deepdoctection-0.42.0-py3-none-any.whl → deepdoctection-0.43-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/utils/metacfg.py
CHANGED
|
@@ -16,7 +16,8 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
Class AttrDict for maintaining configs and some functions for generating and saving AttrDict instances to
|
|
19
|
+
Class `AttrDict` for maintaining configs and some functions for generating and saving `AttrDict` instances to
|
|
20
|
+
`.yaml` files
|
|
20
21
|
"""
|
|
21
22
|
from __future__ import annotations
|
|
22
23
|
|
|
@@ -32,7 +33,12 @@ from .types import PathLikeOrStr
|
|
|
32
33
|
# Licensed under the Apache License, Version 2.0 (the "License")
|
|
33
34
|
class AttrDict:
|
|
34
35
|
"""
|
|
35
|
-
Class for
|
|
36
|
+
Class `AttrDict` for maintaining configs and some functions for generating and saving `AttrDict` instances to
|
|
37
|
+
`.yaml` files.
|
|
38
|
+
|
|
39
|
+
Info:
|
|
40
|
+
This module provides a class for storing key-value pairs as attributes and functions for serializing and
|
|
41
|
+
deserializing configurations.
|
|
36
42
|
"""
|
|
37
43
|
|
|
38
44
|
_freezed = False
|
|
@@ -41,7 +47,17 @@ class AttrDict:
|
|
|
41
47
|
|
|
42
48
|
def __getattr__(self, name: str) -> Any:
|
|
43
49
|
"""
|
|
44
|
-
|
|
50
|
+
Returns the attribute value for `name`. If the attribute does not exist and the instance is not frozen, a new
|
|
51
|
+
`AttrDict` is created and assigned.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
name: The name of the attribute.
|
|
55
|
+
|
|
56
|
+
Returns:
|
|
57
|
+
The value of the attribute.
|
|
58
|
+
|
|
59
|
+
Raises:
|
|
60
|
+
AttributeError: If the instance is frozen or the attribute name starts with `_`.
|
|
45
61
|
"""
|
|
46
62
|
if self._freezed:
|
|
47
63
|
raise AttributeError(name)
|
|
@@ -54,29 +70,47 @@ class AttrDict:
|
|
|
54
70
|
|
|
55
71
|
def __setattr__(self, name: str, value: Any) -> None:
|
|
56
72
|
"""
|
|
57
|
-
|
|
73
|
+
Sets the attribute `name` to `value`.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
name: The name of the attribute.
|
|
77
|
+
value: The value to set.
|
|
78
|
+
|
|
79
|
+
Raises:
|
|
80
|
+
AttributeError: If the instance is frozen and `name` is not `_freezed`.
|
|
58
81
|
"""
|
|
59
|
-
if self._freezed and name
|
|
82
|
+
if self._freezed and name != "_freezed":
|
|
60
83
|
raise AttributeError(f"Config was freezed! Unknown config: {name}")
|
|
61
84
|
super().__setattr__(name, value)
|
|
62
85
|
|
|
63
86
|
def __str__(self) -> str:
|
|
64
87
|
"""
|
|
65
|
-
|
|
88
|
+
Returns a pretty-printed string representation of the configuration.
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
A string representation of the configuration.
|
|
66
92
|
"""
|
|
67
93
|
return pprint.pformat(self.to_dict(), width=100, compact=True)
|
|
68
94
|
|
|
69
95
|
__repr__ = __str__
|
|
70
96
|
|
|
71
97
|
def to_dict(self) -> dict[str, Any]:
|
|
72
|
-
"""
|
|
98
|
+
"""
|
|
99
|
+
Convert to a nested dict.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
A dictionary representation of the configuration.
|
|
103
|
+
"""
|
|
73
104
|
return {
|
|
74
105
|
k: v.to_dict() if isinstance(v, AttrDict) else v for k, v in self.__dict__.items() if not k.startswith("_")
|
|
75
106
|
}
|
|
76
107
|
|
|
77
108
|
def from_dict(self, d: dict[str, Any]) -> None: # pylint: disable=C0103
|
|
78
109
|
"""
|
|
79
|
-
Generate an instance from a dict
|
|
110
|
+
Generate an instance from a dict.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
d: The dictionary to load values from.
|
|
80
114
|
"""
|
|
81
115
|
if isinstance(d, dict):
|
|
82
116
|
self.freeze(False)
|
|
@@ -90,6 +124,9 @@ class AttrDict:
|
|
|
90
124
|
def update_args(self, args: list[str]) -> None:
|
|
91
125
|
"""
|
|
92
126
|
Update from command line args.
|
|
127
|
+
|
|
128
|
+
Args:
|
|
129
|
+
args: A list of command line arguments in the form `key1.key2=val`.
|
|
93
130
|
"""
|
|
94
131
|
for cfg in args:
|
|
95
132
|
keys, v = cfg.split("=", maxsplit=1) # pylint: disable=C0103
|
|
@@ -110,8 +147,11 @@ class AttrDict:
|
|
|
110
147
|
"""
|
|
111
148
|
Overwrite the current config with values from another config.
|
|
112
149
|
|
|
113
|
-
:
|
|
114
|
-
|
|
150
|
+
Args:
|
|
151
|
+
other_config: The other `AttrDict` instance to copy values from.
|
|
152
|
+
|
|
153
|
+
Raises:
|
|
154
|
+
AttributeError: If the config is frozen.
|
|
115
155
|
"""
|
|
116
156
|
if self._freezed:
|
|
117
157
|
raise AttributeError("Config was freezed! Cannot overwrite config.")
|
|
@@ -119,7 +159,10 @@ class AttrDict:
|
|
|
119
159
|
|
|
120
160
|
def freeze(self, freezed: bool = True) -> None:
|
|
121
161
|
"""
|
|
122
|
-
|
|
162
|
+
Freeze or unfreeze the instance, so that no attributes can be added or changed.
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
freezed: Whether to freeze the instance.
|
|
123
166
|
"""
|
|
124
167
|
self._freezed = freezed
|
|
125
168
|
for v in self.__dict__.values(): # pylint: disable=C0103
|
|
@@ -136,9 +179,13 @@ class AttrDict:
|
|
|
136
179
|
|
|
137
180
|
def set_config_by_yaml(path_yaml: PathLikeOrStr) -> AttrDict:
|
|
138
181
|
"""
|
|
139
|
-
|
|
182
|
+
Initialize the config class from a YAML file.
|
|
183
|
+
|
|
184
|
+
Args:
|
|
185
|
+
path_yaml: The path to the YAML file.
|
|
140
186
|
|
|
141
|
-
:
|
|
187
|
+
Returns:
|
|
188
|
+
An `AttrDict` instance initialized from the YAML file.
|
|
142
189
|
"""
|
|
143
190
|
config = AttrDict()
|
|
144
191
|
_C = config # pylint: disable=C0103
|
|
@@ -153,9 +200,17 @@ def set_config_by_yaml(path_yaml: PathLikeOrStr) -> AttrDict:
|
|
|
153
200
|
|
|
154
201
|
def save_config_to_yaml(config: AttrDict, path_yaml: PathLikeOrStr) -> None:
|
|
155
202
|
"""
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
:
|
|
203
|
+
Save the configuration instance as a YAML file.
|
|
204
|
+
|
|
205
|
+
Example:
|
|
206
|
+
```python
|
|
207
|
+
save_config_to_yaml(config, "config.yaml")
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
Args:
|
|
211
|
+
config: The configuration instance as an `AttrDict`.
|
|
212
|
+
path_yaml: The path to save the YAML file to.
|
|
213
|
+
|
|
159
214
|
"""
|
|
160
215
|
|
|
161
216
|
with open(path_yaml, "w") as file: # pylint: disable=W1514
|
|
@@ -164,12 +219,19 @@ def save_config_to_yaml(config: AttrDict, path_yaml: PathLikeOrStr) -> None:
|
|
|
164
219
|
|
|
165
220
|
def config_to_cli_str(config: AttrDict, *exclude: str) -> str:
|
|
166
221
|
"""
|
|
167
|
-
Transform an AttrDict to a string that can be passed to a
|
|
168
|
-
|
|
222
|
+
Transform an `AttrDict` to a string that can be passed to a CLI. Optionally exclude keys from the string.
|
|
223
|
+
|
|
224
|
+
Example:
|
|
225
|
+
```python
|
|
226
|
+
config_to_cli_str(config, "key1", "key2")
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Args:
|
|
230
|
+
config: An `AttrDict`.
|
|
231
|
+
*exclude: Keys of the `AttrDict` to exclude.
|
|
169
232
|
|
|
170
|
-
:
|
|
171
|
-
|
|
172
|
-
:return: A string that can be passed to a cli
|
|
233
|
+
Returns:
|
|
234
|
+
A string that can be passed to a CLI.
|
|
173
235
|
"""
|
|
174
236
|
|
|
175
237
|
config_dict = config.to_dict()
|
deepdoctection/utils/pdf_utils.py
CHANGED
|
@@ -16,8 +16,9 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
|
|
19
|
+
Pdf processing tools
|
|
20
20
|
"""
|
|
21
|
+
|
|
21
22
|
import os
|
|
22
23
|
import platform
|
|
23
24
|
import subprocess
|
|
@@ -57,17 +58,20 @@ __all__ = [
|
|
|
57
58
|
|
|
58
59
|
def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
59
60
|
"""
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
document
|
|
61
|
+
Decrypt a PDF file.
|
|
62
|
+
|
|
63
|
+
As copying a PDF document removes the password that protects the PDF, this method generates a copy and decrypts the
|
|
64
|
+
copy using `qpdf`. The result is saved as the original document.
|
|
63
65
|
|
|
64
|
-
|
|
66
|
+
Note:
|
|
67
|
+
This decryption does not work if the PDF has a readable protection, in which case no solution is provided.
|
|
68
|
+
`qpdf`: <http://qpdf.sourceforge.net/>
|
|
65
69
|
|
|
66
|
-
|
|
67
|
-
|
|
70
|
+
Args:
|
|
71
|
+
path: A path to the PDF file.
|
|
68
72
|
|
|
69
|
-
:
|
|
70
|
-
|
|
73
|
+
Returns:
|
|
74
|
+
True if the document has been successfully decrypted.
|
|
71
75
|
"""
|
|
72
76
|
if qpdf_available():
|
|
73
77
|
path_base, file_name = os.path.split(path)
|
|
@@ -88,12 +92,20 @@ def decrypt_pdf_document(path: PathLikeOrStr) -> bool:
|
|
|
88
92
|
|
|
89
93
|
def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
|
|
90
94
|
"""
|
|
91
|
-
|
|
95
|
+
Decrypt a PDF given as bytes.
|
|
96
|
+
|
|
97
|
+
Under the hood, it saves the bytes to a temporary file and then calls `decrypt_pdf_document`.
|
|
98
|
+
|
|
99
|
+
Note:
|
|
100
|
+
`qpdf`: <http://qpdf.sourceforge.net/>
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
input_bytes: A bytes object representing the PDF file.
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
The decrypted bytes object.
|
|
92
107
|
|
|
93
|
-
qpdf: <http://qpdf.sourceforge.net/>
|
|
94
108
|
|
|
95
|
-
:param input_bytes: A bytes object representing the pdf file
|
|
96
|
-
:return: The decrypted bytes object
|
|
97
109
|
"""
|
|
98
110
|
with save_tmp_file(input_bytes, "pdf_") as (_, input_file_name):
|
|
99
111
|
is_decrypted = decrypt_pdf_document(input_file_name)
|
|
@@ -107,11 +119,16 @@ def decrypt_pdf_document_from_bytes(input_bytes: bytes) -> bytes:
|
|
|
107
119
|
|
|
108
120
|
def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader:
|
|
109
121
|
"""
|
|
110
|
-
|
|
111
|
-
|
|
122
|
+
Create a file reader object from a PDF document.
|
|
123
|
+
|
|
124
|
+
Will try to decrypt the document if it is encrypted. (See `decrypt_pdf_document` to understand what is meant with
|
|
125
|
+
"decrypt").
|
|
112
126
|
|
|
113
|
-
:
|
|
114
|
-
|
|
127
|
+
Args:
|
|
128
|
+
path_or_bytes: A path to a PDF document or bytes.
|
|
129
|
+
|
|
130
|
+
Returns:
|
|
131
|
+
A file reader object from which you can iterate through the document.
|
|
115
132
|
"""
|
|
116
133
|
|
|
117
134
|
if isinstance(path_or_bytes, bytes):
|
|
@@ -153,39 +170,47 @@ def get_pdf_file_reader(path_or_bytes: Union[PathLikeOrStr, bytes]) -> PdfReader
|
|
|
153
170
|
|
|
154
171
|
def get_pdf_file_writer() -> PdfWriter:
|
|
155
172
|
"""
|
|
156
|
-
`PdfWriter` instance
|
|
173
|
+
`PdfWriter` instance.
|
|
174
|
+
|
|
175
|
+
Returns:
|
|
176
|
+
A new `PdfWriter` instance.
|
|
157
177
|
"""
|
|
158
178
|
return PdfWriter()
|
|
159
179
|
|
|
160
180
|
|
|
161
181
|
class PDFStreamer:
|
|
162
182
|
"""
|
|
163
|
-
A class for streaming
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
183
|
+
A class for streaming PDF documents as bytes objects.
|
|
184
|
+
|
|
185
|
+
Built as a generator, it is possible to load the document iteratively into memory. Uses `pypdf` `PdfReader` and
|
|
186
|
+
`PdfWriter`.
|
|
187
|
+
|
|
188
|
+
Example:
|
|
189
|
+
```python
|
|
190
|
+
df = dataflow.DataFromIterable(PDFStreamer(path=path))
|
|
191
|
+
df.reset_state()
|
|
192
|
+
for page in df:
|
|
193
|
+
...
|
|
194
|
+
streamer = PDFStreamer(path=path)
|
|
195
|
+
pages = len(streamer)
|
|
196
|
+
random_int = random.sample(range(0, pages), 2)
|
|
197
|
+
for ran in random_int:
|
|
198
|
+
pdf_bytes = streamer[ran]
|
|
199
|
+
streamer.close()
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Note:
|
|
203
|
+
Do not forget to close the streamer, otherwise the file will never be closed and might cause memory leaks if
|
|
204
|
+
you open many files.
|
|
184
205
|
"""
|
|
185
206
|
|
|
186
207
|
def __init__(self, path_or_bytes: Union[PathLikeOrStr, bytes]) -> None:
|
|
187
208
|
"""
|
|
188
|
-
:
|
|
209
|
+
Args:
|
|
210
|
+
path_or_bytes: Path to a PDF.
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
None.
|
|
189
214
|
"""
|
|
190
215
|
self.file_reader = get_pdf_file_reader(path_or_bytes)
|
|
191
216
|
self.file_writer = PdfWriter()
|
|
@@ -256,10 +281,15 @@ def _input_to_cli_str(
|
|
|
256
281
|
|
|
257
282
|
class PopplerError(RuntimeError):
|
|
258
283
|
"""
|
|
259
|
-
Poppler Error
|
|
284
|
+
Poppler Error.
|
|
260
285
|
"""
|
|
261
286
|
|
|
262
287
|
def __init__(self, status: int, message: str) -> None:
|
|
288
|
+
"""
|
|
289
|
+
Args:
|
|
290
|
+
status: Error status code.
|
|
291
|
+
message: Error message.
|
|
292
|
+
"""
|
|
263
293
|
super().__init__()
|
|
264
294
|
self.status = status
|
|
265
295
|
self.message = message
|
|
@@ -283,13 +313,20 @@ def pdf_to_np_array_poppler(
|
|
|
283
313
|
pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None
|
|
284
314
|
) -> PixelValues:
|
|
285
315
|
"""
|
|
286
|
-
Convert a single
|
|
287
|
-
|
|
316
|
+
Convert a single PDF page from its byte representation to a numpy array using Poppler.
|
|
317
|
+
|
|
318
|
+
This function will save the PDF as a temporary file and then call Poppler via `pdftoppm` or `pdftocairo`.
|
|
288
319
|
|
|
289
|
-
:
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
:
|
|
320
|
+
Raises:
|
|
321
|
+
ValueError: If neither `dpi` nor `size` is provided.
|
|
322
|
+
|
|
323
|
+
Args:
|
|
324
|
+
pdf_bytes: Bytes representing the PDF file.
|
|
325
|
+
size: Size of the resulting image(s), as (width, height).
|
|
326
|
+
dpi: Image quality in DPI/dots-per-inch.
|
|
327
|
+
|
|
328
|
+
Returns:
|
|
329
|
+
`np.array`.
|
|
293
330
|
"""
|
|
294
331
|
if dpi is None and size is None:
|
|
295
332
|
raise ValueError("Either dpi or size must be provided.")
|
|
@@ -302,11 +339,17 @@ def pdf_to_np_array_poppler(
|
|
|
302
339
|
|
|
303
340
|
def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
|
|
304
341
|
"""
|
|
305
|
-
Convert a single
|
|
342
|
+
Convert a single PDF page from its byte representation to a numpy array using pdfium.
|
|
343
|
+
|
|
344
|
+
Args:
|
|
345
|
+
pdf_bytes: Bytes representing the PDF file.
|
|
346
|
+
dpi: Image quality in DPI/dots-per-inch.
|
|
306
347
|
|
|
307
|
-
:
|
|
308
|
-
|
|
309
|
-
|
|
348
|
+
Returns:
|
|
349
|
+
`np.array`.
|
|
350
|
+
|
|
351
|
+
Raises:
|
|
352
|
+
ValueError: If `dpi` is not provided.
|
|
310
353
|
"""
|
|
311
354
|
if dpi is None:
|
|
312
355
|
raise ValueError("dpi must be provided.")
|
|
@@ -316,13 +359,21 @@ def pdf_to_np_array_pdfmium(pdf_bytes: bytes, dpi: Optional[int] = None) -> Pixe
|
|
|
316
359
|
|
|
317
360
|
def pdf_to_np_array(pdf_bytes: bytes, size: Optional[tuple[int, int]] = None, dpi: Optional[int] = None) -> PixelValues:
|
|
318
361
|
"""
|
|
319
|
-
Convert a single
|
|
320
|
-
|
|
362
|
+
Convert a single PDF page from its byte representation to a `np.array`.
|
|
363
|
+
|
|
364
|
+
This function will either use Poppler or pdfium to render the PDF.
|
|
365
|
+
|
|
366
|
+
Args:
|
|
367
|
+
pdf_bytes: Bytes representing the PDF file.
|
|
368
|
+
size: Size of the resulting image(s), as (width, height).
|
|
369
|
+
dpi: Image quality in DPI/dots-per-inch.
|
|
321
370
|
|
|
322
|
-
:
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
:
|
|
371
|
+
Returns:
|
|
372
|
+
`np.array`.
|
|
373
|
+
|
|
374
|
+
Note:
|
|
375
|
+
If `USE_DD_PDFIUM` is set, `pdf_to_np_array_pdfmium` does not support the `size` parameter and will use
|
|
376
|
+
`dpi` instead.
|
|
326
377
|
"""
|
|
327
378
|
if os.environ.get("USE_DD_PDFIUM", "False") in ENV_VARS_TRUE:
|
|
328
379
|
if size is not None:
|
|
@@ -339,12 +390,18 @@ def split_pdf(
|
|
|
339
390
|
pdf_path: PathLikeOrStr, output_dir: PathLikeOrStr, file_type: Literal["image", "pdf"], dpi: int = 200
|
|
340
391
|
) -> None:
|
|
341
392
|
"""
|
|
342
|
-
Split a
|
|
393
|
+
Split a PDF into single pages.
|
|
394
|
+
|
|
395
|
+
The pages are saved as single PDF or PNG files in a subfolder of the output directory.
|
|
396
|
+
|
|
397
|
+
Args:
|
|
398
|
+
pdf_path: Path to the PDF file.
|
|
399
|
+
output_dir: Path to the output directory.
|
|
400
|
+
file_type: Type of the output file. Either "image" or "pdf".
|
|
401
|
+
dpi: Image quality in DPI/dots-per-inch.
|
|
343
402
|
|
|
344
|
-
:
|
|
345
|
-
|
|
346
|
-
:param file_type: Type of the output file. Either "image" or "pdf"
|
|
347
|
-
:param dpi: Image quality in DPI/dots-per-inch (default
|
|
403
|
+
Returns:
|
|
404
|
+
None.
|
|
348
405
|
"""
|
|
349
406
|
pdf_path = Path(pdf_path)
|
|
350
407
|
filename = pdf_path.stem
|
deepdoctection/utils/settings.py
CHANGED
|
@@ -79,6 +79,7 @@ class SummaryType(ObjectTypes):
|
|
|
79
79
|
"""Summary type member"""
|
|
80
80
|
|
|
81
81
|
SUMMARY = "summary"
|
|
82
|
+
DOCUMENT_SUMMARY = "document_summary"
|
|
82
83
|
|
|
83
84
|
|
|
84
85
|
@object_types_registry.register("DocumentType")
|
|
@@ -329,12 +330,18 @@ _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
|
|
|
329
330
|
|
|
330
331
|
def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes) -> ObjectTypes:
|
|
331
332
|
"""
|
|
332
|
-
|
|
333
|
-
maps to TokenClassWithTag.i_header
|
|
333
|
+
Maps a `TokenClassWithTag` enum member from a token class and tag, e.g. `TokenClasses.header` and `BioTag.inside`
|
|
334
|
+
maps to `TokenClassWithTag.i_header`.
|
|
334
335
|
|
|
335
|
-
:
|
|
336
|
-
|
|
337
|
-
|
|
336
|
+
Args:
|
|
337
|
+
token: TokenClasses member.
|
|
338
|
+
tag: BioTag member.
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
TokenClassWithTag member.
|
|
342
|
+
|
|
343
|
+
Raises:
|
|
344
|
+
TypeError: If token is not of type TokenClasses or tag is not of type BioTag.
|
|
338
345
|
"""
|
|
339
346
|
if isinstance(token, TokenClasses) and isinstance(tag, BioTag):
|
|
340
347
|
return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
|
|
@@ -349,8 +356,11 @@ def token_class_with_tag_to_token_class_and_tag(
|
|
|
349
356
|
"""
|
|
350
357
|
This is the reverse mapping from TokenClassWithTag members to TokenClasses and BioTag
|
|
351
358
|
|
|
352
|
-
:
|
|
353
|
-
|
|
359
|
+
Args:
|
|
360
|
+
token_class_with_tag: `TokenClassWithTag` member
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
Tuple of `TokenClasses` member and `BioTag` member
|
|
354
364
|
"""
|
|
355
365
|
return {val: key for key, val in _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG.items()}.get(token_class_with_tag)
|
|
356
366
|
|
|
@@ -405,10 +415,13 @@ def update_black_list(item: str) -> None:
|
|
|
405
415
|
|
|
406
416
|
|
|
407
417
|
def get_type(obj_type: Union[str, ObjectTypes]) -> ObjectTypes:
|
|
408
|
-
"""
|
|
418
|
+
"""
|
|
419
|
+
Get an object type property from a given string. Does nothing if an `ObjectType` is passed
|
|
409
420
|
|
|
410
|
-
:
|
|
411
|
-
|
|
421
|
+
Args:
|
|
422
|
+
obj_type: String or ObjectTypes
|
|
423
|
+
Returns:
|
|
424
|
+
`ObjectType`
|
|
412
425
|
"""
|
|
413
426
|
if isinstance(obj_type, ObjectTypes):
|
|
414
427
|
return obj_type
|
|
@@ -435,6 +448,7 @@ if os.environ.get("DEEPDOCTECTION_CACHE"):
|
|
|
435
448
|
else:
|
|
436
449
|
dd_cache_home = Path(os.getenv("XDG_CACHE_HOME", Path.home() / ".cache")) / "deepdoctection"
|
|
437
450
|
|
|
451
|
+
CACHE_DIR = dd_cache_home
|
|
438
452
|
MODEL_DIR = dd_cache_home / "weights"
|
|
439
453
|
|
|
440
454
|
# configs cache directory
|
deepdoctection/utils/tqdm.py
CHANGED
|
@@ -32,8 +32,10 @@ def get_tqdm_default_kwargs(
|
|
|
32
32
|
**kwargs: Optional[Union[str, int, float]]
|
|
33
33
|
) -> Dict[str, Union[str, float, bool, int, None]]:
|
|
34
34
|
"""
|
|
35
|
-
Return default arguments to be used with tqdm
|
|
36
|
-
|
|
35
|
+
Return default arguments to be used with `tqdm`.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
kwargs: extra arguments to be used.
|
|
37
39
|
"""
|
|
38
40
|
|
|
39
41
|
return {
|
|
@@ -49,10 +51,13 @@ def get_tqdm_default_kwargs(
|
|
|
49
51
|
|
|
50
52
|
def get_tqdm(total: Optional[Union[int, float]] = None, **kwargs: Union[str, int, float]) -> TqdmType:
|
|
51
53
|
"""
|
|
52
|
-
Get tqdm progress bar with some default options to have consistent style.
|
|
54
|
+
Get `tqdm` progress bar with some default options to have consistent style.
|
|
55
|
+
|
|
56
|
+
Args:
|
|
57
|
+
total: The number of expected iterations.
|
|
53
58
|
|
|
54
|
-
:
|
|
55
|
-
|
|
59
|
+
Returns:
|
|
60
|
+
A `tqdm` instance
|
|
56
61
|
"""
|
|
57
62
|
|
|
58
63
|
default_tqdm_setting = get_tqdm_default_kwargs(total=total)
|