deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
|
@@ -12,7 +12,6 @@ import importlib.util
|
|
|
12
12
|
import multiprocessing as mp
|
|
13
13
|
import string
|
|
14
14
|
import subprocess
|
|
15
|
-
import sys
|
|
16
15
|
from os import environ, path
|
|
17
16
|
from shutil import which
|
|
18
17
|
from types import ModuleType
|
|
@@ -22,7 +21,6 @@ import importlib_metadata
|
|
|
22
21
|
from packaging import version
|
|
23
22
|
|
|
24
23
|
from .error import DependencyError
|
|
25
|
-
from .logger import LoggingRecord, logger
|
|
26
24
|
from .metacfg import AttrDict
|
|
27
25
|
from .types import PathLikeOrStr, Requirement
|
|
28
26
|
|
|
@@ -41,14 +39,20 @@ _TF_ERR_MSG = f"Tensorflow must be installed. {_GENERIC_ERR_MSG}"
|
|
|
41
39
|
|
|
42
40
|
def tf_available() -> bool:
|
|
43
41
|
"""
|
|
44
|
-
Returns
|
|
42
|
+
Returns whether TensorFlow is installed.
|
|
43
|
+
|
|
44
|
+
Returns:
|
|
45
|
+
bool: True if TensorFlow is installed, False otherwise.
|
|
45
46
|
"""
|
|
46
47
|
return bool(_TF_AVAILABLE)
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
def get_tf_version() -> str:
|
|
50
51
|
"""
|
|
51
|
-
|
|
52
|
+
Determines the installed TensorFlow version.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
str: The installed TensorFlow version, or `0.0` if not installed.
|
|
52
56
|
"""
|
|
53
57
|
tf_version = "0.0"
|
|
54
58
|
if tf_available():
|
|
@@ -76,7 +80,10 @@ def get_tf_version() -> str:
|
|
|
76
80
|
|
|
77
81
|
def get_tensorflow_requirement() -> Requirement:
|
|
78
82
|
"""
|
|
79
|
-
Returns
|
|
83
|
+
Returns the TensorFlow requirement.
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
80
87
|
"""
|
|
81
88
|
|
|
82
89
|
tf_requirement_satisfied = False
|
|
@@ -100,14 +107,20 @@ _TF_ADDONS_ERR_MSG = (
|
|
|
100
107
|
|
|
101
108
|
def tf_addons_available() -> bool:
|
|
102
109
|
"""
|
|
103
|
-
Returns
|
|
110
|
+
Returns whether `tensorflow_addons` is installed.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
bool: `True` if `tensorflow_addons` is installed, False otherwise.
|
|
104
114
|
"""
|
|
105
115
|
return bool(_TF_ADDONS_AVAILABLE)
|
|
106
116
|
|
|
107
117
|
|
|
108
118
|
def get_tf_addons_requirements() -> Requirement:
|
|
109
119
|
"""
|
|
110
|
-
Returns
|
|
120
|
+
Returns the `tensorflow_addons` requirement.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
111
124
|
"""
|
|
112
125
|
return "tensorflow-addons", tf_addons_available(), _TF_ADDONS_ERR_MSG
|
|
113
126
|
|
|
@@ -118,14 +131,20 @@ _TP_ERR_MSG = f"Tensorpack must be installed. {_GENERIC_ERR_MSG}"
|
|
|
118
131
|
|
|
119
132
|
def tensorpack_available() -> bool:
|
|
120
133
|
"""
|
|
121
|
-
Returns
|
|
134
|
+
Returns whether `tensorpack` is installed.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
bool: `True` if `tensorpack` is installed, False otherwise.
|
|
122
138
|
"""
|
|
123
139
|
return bool(_TP_AVAILABLE)
|
|
124
140
|
|
|
125
141
|
|
|
126
142
|
def get_tensorpack_requirement() -> Requirement:
|
|
127
143
|
"""
|
|
128
|
-
Returns
|
|
144
|
+
Returns the `tensorpack` requirement.
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
129
148
|
"""
|
|
130
149
|
return "tensorpack", tensorpack_available(), _TP_ERR_MSG
|
|
131
150
|
|
|
@@ -137,14 +156,20 @@ _PYTORCH_ERR_MSG = f"Pytorch must be installed. {_GENERIC_ERR_MSG}"
|
|
|
137
156
|
|
|
138
157
|
def pytorch_available() -> bool:
|
|
139
158
|
"""
|
|
140
|
-
Returns
|
|
159
|
+
Returns whether PyTorch is installed.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
bool: True if PyTorch is installed, False otherwise.
|
|
141
163
|
"""
|
|
142
164
|
return bool(_PYTORCH_AVAILABLE)
|
|
143
165
|
|
|
144
166
|
|
|
145
167
|
def get_pytorch_requirement() -> Requirement:
|
|
146
168
|
"""
|
|
147
|
-
Returns
|
|
169
|
+
Returns the PyTorch requirement.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
148
173
|
"""
|
|
149
174
|
return "torch", pytorch_available(), _PYTORCH_ERR_MSG
|
|
150
175
|
|
|
@@ -156,14 +181,20 @@ _LXML_ERR_MSG = f"lxml must be installed. {_GENERIC_ERR_MSG}"
|
|
|
156
181
|
|
|
157
182
|
def lxml_available() -> bool:
|
|
158
183
|
"""
|
|
159
|
-
Returns
|
|
184
|
+
Returns whether `lxml` is installed.
|
|
185
|
+
|
|
186
|
+
Returns:
|
|
187
|
+
bool: True if `lxml` is installed, `False` otherwise.
|
|
160
188
|
"""
|
|
161
189
|
return bool(_LXML_AVAILABLE)
|
|
162
190
|
|
|
163
191
|
|
|
164
192
|
def get_lxml_requirement() -> Requirement:
|
|
165
193
|
"""
|
|
166
|
-
Returns lxml requirement
|
|
194
|
+
Returns the `lxml` requirement.
|
|
195
|
+
|
|
196
|
+
Returns:
|
|
197
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
167
198
|
"""
|
|
168
199
|
return "lxml", lxml_available(), _LXML_ERR_MSG
|
|
169
200
|
|
|
@@ -175,14 +206,20 @@ _APTED_ERR_MSG = f"apted must be installed. {_GENERIC_ERR_MSG}"
|
|
|
175
206
|
|
|
176
207
|
def apted_available() -> bool:
|
|
177
208
|
"""
|
|
178
|
-
Returns
|
|
209
|
+
Returns whether `apted` is available.
|
|
210
|
+
|
|
211
|
+
Returns:
|
|
212
|
+
bool: `True` if `apted` is available, False otherwise.
|
|
179
213
|
"""
|
|
180
214
|
return bool(_APTED_AVAILABLE)
|
|
181
215
|
|
|
182
216
|
|
|
183
217
|
def get_apted_requirement() -> Requirement:
|
|
184
218
|
"""
|
|
185
|
-
Returns
|
|
219
|
+
Returns the `apted` requirement.
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
186
223
|
"""
|
|
187
224
|
return "apted", apted_available(), _TRANSFORMERS_ERR_MSG
|
|
188
225
|
|
|
@@ -194,14 +231,20 @@ _DISTANCE_ERR_MSG = f"distance must be installed. {_GENERIC_ERR_MSG}"
|
|
|
194
231
|
|
|
195
232
|
def distance_available() -> bool:
|
|
196
233
|
"""
|
|
197
|
-
Returns
|
|
234
|
+
Returns whether `distance` is available.
|
|
235
|
+
|
|
236
|
+
Returns:
|
|
237
|
+
bool: `True` if `distance` is available, False otherwise.
|
|
198
238
|
"""
|
|
199
239
|
return bool(_DISTANCE_AVAILABLE)
|
|
200
240
|
|
|
201
241
|
|
|
202
242
|
def get_distance_requirement() -> Requirement:
|
|
203
243
|
"""
|
|
204
|
-
Returns distance requirement
|
|
244
|
+
Returns the `distance` requirement.
|
|
245
|
+
|
|
246
|
+
Returns:
|
|
247
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
205
248
|
"""
|
|
206
249
|
return "distance", distance_available(), _DISTANCE_ERR_MSG
|
|
207
250
|
|
|
@@ -213,14 +256,20 @@ _TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"
|
|
|
213
256
|
|
|
214
257
|
def transformers_available() -> bool:
|
|
215
258
|
"""
|
|
216
|
-
Returns
|
|
259
|
+
Returns whether HuggingFace Transformers is installed.
|
|
260
|
+
|
|
261
|
+
Returns:
|
|
262
|
+
bool: `True` if Transformers is installed, False otherwise.
|
|
217
263
|
"""
|
|
218
264
|
return bool(_TRANSFORMERS_AVAILABLE)
|
|
219
265
|
|
|
220
266
|
|
|
221
267
|
def get_transformers_requirement() -> Requirement:
|
|
222
268
|
"""
|
|
223
|
-
Returns
|
|
269
|
+
Returns the HuggingFace Transformers requirement.
|
|
270
|
+
|
|
271
|
+
Returns:
|
|
272
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
224
273
|
"""
|
|
225
274
|
return "transformers", transformers_available(), _TRANSFORMERS_ERR_MSG
|
|
226
275
|
|
|
@@ -235,14 +284,20 @@ _DETECTRON2_ERR_MSG = (
|
|
|
235
284
|
|
|
236
285
|
def detectron2_available() -> bool:
|
|
237
286
|
"""
|
|
238
|
-
Returns
|
|
287
|
+
Returns whether `detectron2` is installed.
|
|
288
|
+
|
|
289
|
+
Returns:
|
|
290
|
+
bool: True if `detectron2` is installed, False otherwise.
|
|
239
291
|
"""
|
|
240
292
|
return bool(_DETECTRON2_AVAILABLE)
|
|
241
293
|
|
|
242
294
|
|
|
243
295
|
def get_detectron2_requirement() -> Requirement:
|
|
244
296
|
"""
|
|
245
|
-
Returns
|
|
297
|
+
Returns the `detectron2` requirement.
|
|
298
|
+
|
|
299
|
+
Returns:
|
|
300
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
246
301
|
"""
|
|
247
302
|
return "detectron2", detectron2_available(), _DETECTRON2_ERR_MSG
|
|
248
303
|
|
|
@@ -258,10 +313,14 @@ _TESS_ERR_MSG = (
|
|
|
258
313
|
|
|
259
314
|
|
|
260
315
|
def set_tesseract_path(tesseract_path: PathLikeOrStr) -> None:
|
|
261
|
-
"""
|
|
262
|
-
|
|
316
|
+
"""
|
|
317
|
+
Sets the Tesseract path.
|
|
263
318
|
|
|
264
|
-
|
|
319
|
+
Note:
|
|
320
|
+
If you have Tesseract installed in Anaconda, you can use this function to set the Tesseract path.
|
|
321
|
+
|
|
322
|
+
Args:
|
|
323
|
+
tesseract_path: The Tesseract installation path.
|
|
265
324
|
"""
|
|
266
325
|
|
|
267
326
|
global _TESS_AVAILABLE # pylint: disable=W0603
|
|
@@ -286,7 +345,13 @@ def tesseract_available() -> bool:
|
|
|
286
345
|
|
|
287
346
|
def get_tesseract_version() -> Union[int, version.Version]:
|
|
288
347
|
"""
|
|
289
|
-
Returns
|
|
348
|
+
Returns the version of the installed Tesseract.
|
|
349
|
+
|
|
350
|
+
Returns:
|
|
351
|
+
int or packaging.version.Version: The Tesseract version if installed and `>= 4.0`, otherwise `0`.
|
|
352
|
+
|
|
353
|
+
Note:
|
|
354
|
+
The minimum required version is `3.05`.
|
|
290
355
|
"""
|
|
291
356
|
try:
|
|
292
357
|
output = subprocess.check_output(
|
|
@@ -311,7 +376,13 @@ def get_tesseract_version() -> Union[int, version.Version]:
|
|
|
311
376
|
|
|
312
377
|
def get_tesseract_requirement() -> Requirement:
|
|
313
378
|
"""
|
|
314
|
-
Returns Tesseract requirement.
|
|
379
|
+
Returns the Tesseract requirement.
|
|
380
|
+
|
|
381
|
+
Returns:
|
|
382
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
383
|
+
|
|
384
|
+
Note:
|
|
385
|
+
The minimum required version is `3.05`.
|
|
315
386
|
"""
|
|
316
387
|
if get_tesseract_version():
|
|
317
388
|
return "tesseract", True, _TESS_ERR_MSG
|
|
@@ -326,21 +397,30 @@ _POPPLER_ERR_MSG = "Poppler cannot be found. Please check that Poppler is instal
|
|
|
326
397
|
|
|
327
398
|
def pdf_to_ppm_available() -> bool:
|
|
328
399
|
"""
|
|
329
|
-
Returns
|
|
400
|
+
Returns whether `pdftoppm` is installed.
|
|
401
|
+
|
|
402
|
+
Returns:
|
|
403
|
+
bool: True if `pdftoppm` is installed, False otherwise.
|
|
330
404
|
"""
|
|
331
405
|
return bool(_PDF_TO_PPM_AVAILABLE)
|
|
332
406
|
|
|
333
407
|
|
|
334
408
|
def pdf_to_cairo_available() -> bool:
|
|
335
409
|
"""
|
|
336
|
-
Returns
|
|
410
|
+
Returns whether `pdftocairo` is installed.
|
|
411
|
+
|
|
412
|
+
Returns:
|
|
413
|
+
bool: `True` if `pdftocairo` is installed, `False` otherwise.
|
|
337
414
|
"""
|
|
338
415
|
return bool(_PDF_TO_CAIRO_AVAILABLE)
|
|
339
416
|
|
|
340
417
|
|
|
341
418
|
def get_poppler_version() -> Union[int, version.Version]:
|
|
342
419
|
"""
|
|
343
|
-
Returns
|
|
420
|
+
Returns the version of the installed Poppler utility.
|
|
421
|
+
|
|
422
|
+
Returns:
|
|
423
|
+
int or packaging.version.Version: The Poppler version if installed, otherwise 0.
|
|
344
424
|
"""
|
|
345
425
|
|
|
346
426
|
if pdf_to_ppm_available():
|
|
@@ -367,7 +447,10 @@ def get_poppler_version() -> Union[int, version.Version]:
|
|
|
367
447
|
|
|
368
448
|
def get_poppler_requirement() -> Requirement:
|
|
369
449
|
"""
|
|
370
|
-
Returns Poppler requirement.
|
|
450
|
+
Returns the Poppler requirement.
|
|
451
|
+
|
|
452
|
+
Returns:
|
|
453
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
371
454
|
"""
|
|
372
455
|
if get_poppler_version():
|
|
373
456
|
return "poppler", True, _POPPLER_ERR_MSG
|
|
@@ -381,14 +464,20 @@ _PDFPLUMBER_ERR_MSG = f"pdfplumber must be installed. {_GENERIC_ERR_MSG}"
|
|
|
381
464
|
|
|
382
465
|
def pdfplumber_available() -> bool:
|
|
383
466
|
"""
|
|
384
|
-
Returns
|
|
467
|
+
Returns whether `pdfplumber` is installed.
|
|
468
|
+
|
|
469
|
+
Returns:
|
|
470
|
+
bool: `True` if `pdfplumber` is installed, False otherwise.
|
|
385
471
|
"""
|
|
386
472
|
return bool(_PDFPLUMBER_AVAILABLE)
|
|
387
473
|
|
|
388
474
|
|
|
389
475
|
def get_pdfplumber_requirement() -> Requirement:
|
|
390
476
|
"""
|
|
391
|
-
Returns pdfplumber requirement.
|
|
477
|
+
Returns the `pdfplumber` requirement.
|
|
478
|
+
|
|
479
|
+
Returns:
|
|
480
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
392
481
|
"""
|
|
393
482
|
return "pdfplumber", pdfplumber_available(), _PDFPLUMBER_ERR_MSG
|
|
394
483
|
|
|
@@ -400,14 +489,20 @@ _COCOTOOLS_ERR_MSG = f"pycocotools must be installed. {_GENERIC_ERR_MSG}"
|
|
|
400
489
|
|
|
401
490
|
def cocotools_available() -> bool:
|
|
402
491
|
"""
|
|
403
|
-
Returns
|
|
492
|
+
Returns whether `pycocotools` is installed.
|
|
493
|
+
|
|
494
|
+
Returns:
|
|
495
|
+
bool: `True` if `pycocotools` is installed, `False` otherwise.
|
|
404
496
|
"""
|
|
405
497
|
return bool(_COCOTOOLS_AVAILABLE)
|
|
406
498
|
|
|
407
499
|
|
|
408
500
|
def get_cocotools_requirement() -> Requirement:
|
|
409
501
|
"""
|
|
410
|
-
Returns
|
|
502
|
+
Returns the `pycocotools` requirement.
|
|
503
|
+
|
|
504
|
+
Returns:
|
|
505
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
411
506
|
"""
|
|
412
507
|
return "pycocotools", cocotools_available(), _COCOTOOLS_ERR_MSG
|
|
413
508
|
|
|
@@ -418,7 +513,10 @@ _SCIPY_AVAILABLE = importlib.util.find_spec("scipy") is not None
|
|
|
418
513
|
|
|
419
514
|
def scipy_available() -> bool:
|
|
420
515
|
"""
|
|
421
|
-
Returns
|
|
516
|
+
Returns whether `scipy` is installed.
|
|
517
|
+
|
|
518
|
+
Returns:
|
|
519
|
+
bool: `True` if `scipy` is installed, `False` otherwise.
|
|
422
520
|
"""
|
|
423
521
|
return bool(_SCIPY_AVAILABLE)
|
|
424
522
|
|
|
@@ -430,14 +528,20 @@ _JDESKEW_ERR_MSG = f"jdeskew must be installed. {_GENERIC_ERR_MSG}"
|
|
|
430
528
|
|
|
431
529
|
def jdeskew_available() -> bool:
|
|
432
530
|
"""
|
|
433
|
-
Returns
|
|
531
|
+
Returns whether `jdeskew` is installed.
|
|
532
|
+
|
|
533
|
+
Returns:
|
|
534
|
+
bool: `True` if `jdeskew` is installed, `False` otherwise.
|
|
434
535
|
"""
|
|
435
536
|
return bool(_JDESKEW_AVAILABLE)
|
|
436
537
|
|
|
437
538
|
|
|
438
539
|
def get_jdeskew_requirement() -> Requirement:
|
|
439
540
|
"""
|
|
440
|
-
Returns jdeskew requirement.
|
|
541
|
+
Returns the `jdeskew` requirement.
|
|
542
|
+
|
|
543
|
+
Returns:
|
|
544
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
441
545
|
"""
|
|
442
546
|
return "jdeskew", jdeskew_available(), _JDESKEW_ERR_MSG
|
|
443
547
|
|
|
@@ -449,14 +553,20 @@ _SKLEARN_ERR_MSG = f"scikit-learn must be installed. {_GENERIC_ERR_MSG}"
|
|
|
449
553
|
|
|
450
554
|
def sklearn_available() -> bool:
|
|
451
555
|
"""
|
|
452
|
-
Returns
|
|
556
|
+
Returns whether `sklearn` is installed.
|
|
557
|
+
|
|
558
|
+
Returns:
|
|
559
|
+
bool: `True` if `sklearn` is installed, `False` otherwise.
|
|
453
560
|
"""
|
|
454
561
|
return bool(_SKLEARN_AVAILABLE)
|
|
455
562
|
|
|
456
563
|
|
|
457
564
|
def get_sklearn_requirement() -> Requirement:
|
|
458
565
|
"""
|
|
459
|
-
Returns sklearn requirement.
|
|
566
|
+
Returns the `sklearn` requirement.
|
|
567
|
+
|
|
568
|
+
Returns:
|
|
569
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
460
570
|
"""
|
|
461
571
|
return "sklearn", sklearn_available(), _SKLEARN_ERR_MSG
|
|
462
572
|
|
|
@@ -467,7 +577,10 @@ _QPDF_AVAILABLE = which("qpdf") is not None
|
|
|
467
577
|
|
|
468
578
|
def qpdf_available() -> bool:
|
|
469
579
|
"""
|
|
470
|
-
Returns
|
|
580
|
+
Returns whether `qpdf` is installed.
|
|
581
|
+
|
|
582
|
+
Returns:
|
|
583
|
+
bool: `True` if `qpdf` is installed, `False` otherwise.
|
|
471
584
|
"""
|
|
472
585
|
return bool(_QPDF_AVAILABLE)
|
|
473
586
|
|
|
@@ -482,7 +595,10 @@ _AWS_ERR_MSG = "AWS CLI must be installed https://docs.aws.amazon.com/cli/latest
|
|
|
482
595
|
|
|
483
596
|
def boto3_available() -> bool:
|
|
484
597
|
"""
|
|
485
|
-
Returns
|
|
598
|
+
Returns whether `boto3` is installed.
|
|
599
|
+
|
|
600
|
+
Returns:
|
|
601
|
+
bool: `True` if `boto3` is installed, `False` otherwise.
|
|
486
602
|
"""
|
|
487
603
|
|
|
488
604
|
return bool(_BOTO3_AVAILABLE)
|
|
@@ -490,21 +606,30 @@ def boto3_available() -> bool:
|
|
|
490
606
|
|
|
491
607
|
def get_boto3_requirement() -> Requirement:
|
|
492
608
|
"""
|
|
493
|
-
|
|
609
|
+
Returns the `boto3` requirement.
|
|
610
|
+
|
|
611
|
+
Returns:
|
|
612
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
494
613
|
"""
|
|
495
614
|
return "boto3", boto3_available(), _BOTO3_ERR_MSG
|
|
496
615
|
|
|
497
616
|
|
|
498
617
|
def aws_available() -> bool:
|
|
499
618
|
"""
|
|
500
|
-
Returns
|
|
619
|
+
Returns whether AWS CLI is installed.
|
|
620
|
+
|
|
621
|
+
Returns:
|
|
622
|
+
bool: `True` if AWS CLI is installed, `False` otherwise.
|
|
501
623
|
"""
|
|
502
624
|
return bool(_AWS_CLI_AVAILABLE)
|
|
503
625
|
|
|
504
626
|
|
|
505
627
|
def get_aws_requirement() -> Requirement:
|
|
506
628
|
"""
|
|
507
|
-
|
|
629
|
+
Returns the AWS CLI requirement.
|
|
630
|
+
|
|
631
|
+
Returns:
|
|
632
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
508
633
|
"""
|
|
509
634
|
return "aws", aws_available(), _AWS_ERR_MSG
|
|
510
635
|
|
|
@@ -516,22 +641,25 @@ _DOCTR_ERR_MSG = f"DocTr must be installed. {_GENERIC_ERR_MSG}"
|
|
|
516
641
|
|
|
517
642
|
def doctr_available() -> bool:
|
|
518
643
|
"""
|
|
519
|
-
Returns
|
|
644
|
+
Returns whether `doctr` is installed.
|
|
645
|
+
|
|
646
|
+
Returns:
|
|
647
|
+
bool: `True` if `doctr` is installed, `False` otherwise.
|
|
520
648
|
"""
|
|
521
649
|
return bool(_DOCTR_AVAILABLE)
|
|
522
650
|
|
|
523
651
|
|
|
524
652
|
def get_doctr_requirement() -> Requirement:
|
|
525
653
|
"""
|
|
526
|
-
|
|
654
|
+
Returns the `doctr` requirement.
|
|
655
|
+
|
|
656
|
+
Returns:
|
|
657
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
658
|
+
|
|
659
|
+
Note:
|
|
660
|
+
On macOS, `doctr` may additionally require `poppler`, but this function does not check for it.
|
|
661
|
+
It is not yet known how to check whether `pango`, `gdk-pixbuf`, and `libffi` are installed.
|
|
527
662
|
"""
|
|
528
|
-
if sys.platform == "darwin":
|
|
529
|
-
if not get_poppler_version():
|
|
530
|
-
return get_doctr_requirement()
|
|
531
|
-
# don't know yet how to check whether pango gdk-pixbuf libffi are installed
|
|
532
|
-
logger.info(
|
|
533
|
-
LoggingRecord("package requires weasyprint. Check that poppler pango gdk-pixbuf libffi are installed")
|
|
534
|
-
)
|
|
535
663
|
return "doctr", doctr_available(), _DOCTR_ERR_MSG
|
|
536
664
|
|
|
537
665
|
|
|
@@ -542,14 +670,20 @@ _FASTTEXT_ERR_MSG = f"fasttext must be installed. {_GENERIC_ERR_MSG}"
|
|
|
542
670
|
|
|
543
671
|
def fasttext_available() -> bool:
|
|
544
672
|
"""
|
|
545
|
-
Returns
|
|
673
|
+
Returns whether `fasttext` is installed.
|
|
674
|
+
|
|
675
|
+
Returns:
|
|
676
|
+
bool: `True` if `fasttext` is installed, False otherwise.
|
|
546
677
|
"""
|
|
547
678
|
return bool(_FASTTEXT_AVAILABLE)
|
|
548
679
|
|
|
549
680
|
|
|
550
681
|
def get_fasttext_requirement() -> Requirement:
|
|
551
682
|
"""
|
|
552
|
-
|
|
683
|
+
Returns the `fasttext` requirement.
|
|
684
|
+
|
|
685
|
+
Returns:
|
|
686
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
553
687
|
"""
|
|
554
688
|
return "fasttext", fasttext_available(), _FASTTEXT_ERR_MSG
|
|
555
689
|
|
|
@@ -561,14 +695,20 @@ _WANDB_ERR_MSG = f"WandB must be installed. {_GENERIC_ERR_MSG}"
|
|
|
561
695
|
|
|
562
696
|
def wandb_available() -> bool:
|
|
563
697
|
"""
|
|
564
|
-
Returns
|
|
698
|
+
Returns whether the W&B package `wandb` is installed.
|
|
699
|
+
|
|
700
|
+
Returns:
|
|
701
|
+
bool: `True` if `wandb` is installed, `False` otherwise.
|
|
565
702
|
"""
|
|
566
703
|
return bool(_WANDB_AVAILABLE)
|
|
567
704
|
|
|
568
705
|
|
|
569
706
|
def get_wandb_requirement() -> Requirement:
|
|
570
707
|
"""
|
|
571
|
-
|
|
708
|
+
Returns the W&B requirement.
|
|
709
|
+
|
|
710
|
+
Returns:
|
|
711
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
572
712
|
"""
|
|
573
713
|
return "wandb", wandb_available(), _WANDB_ERR_MSG
|
|
574
714
|
|
|
@@ -585,14 +725,20 @@ _CV2_ERR_MSG = f"OpenCV must be installed. {_GENERIC_ERR_MSG}"
|
|
|
585
725
|
|
|
586
726
|
def opencv_available() -> bool:
|
|
587
727
|
"""
|
|
588
|
-
Returns
|
|
728
|
+
Returns whether OpenCV is installed.
|
|
729
|
+
|
|
730
|
+
Returns:
|
|
731
|
+
bool: `True` if OpenCV is installed, `False` otherwise.
|
|
589
732
|
"""
|
|
590
733
|
return bool(_CV2_AVAILABLE)
|
|
591
734
|
|
|
592
735
|
|
|
593
736
|
def get_opencv_requirement() -> Requirement:
|
|
594
737
|
"""
|
|
595
|
-
|
|
738
|
+
Returns the OpenCV requirement.
|
|
739
|
+
|
|
740
|
+
Returns:
|
|
741
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
596
742
|
"""
|
|
597
743
|
return "opencv", opencv_available(), _CV2_ERR_MSG
|
|
598
744
|
|
|
@@ -604,14 +750,20 @@ _PILLOW_ERR_MSG = f"pillow must be installed. {_GENERIC_ERR_MSG}"
|
|
|
604
750
|
|
|
605
751
|
def pillow_available() -> bool:
|
|
606
752
|
"""
|
|
607
|
-
Returns
|
|
753
|
+
Returns whether Pillow is installed.
|
|
754
|
+
|
|
755
|
+
Returns:
|
|
756
|
+
bool: `True` if Pillow is installed, False otherwise.
|
|
608
757
|
"""
|
|
609
758
|
return bool(_PILLOW_AVAILABLE)
|
|
610
759
|
|
|
611
760
|
|
|
612
761
|
def get_pillow_requirement() -> Requirement:
|
|
613
762
|
"""
|
|
614
|
-
|
|
763
|
+
Returns the Pillow requirement.
|
|
764
|
+
|
|
765
|
+
Returns:
|
|
766
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
615
767
|
"""
|
|
616
768
|
return "pillow", pillow_available(), _PILLOW_ERR_MSG
|
|
617
769
|
|
|
@@ -623,14 +775,20 @@ _PYPDFIUM2_ERR_MSG = f"pypdfium2 must be installed. {_GENERIC_ERR_MSG}"
|
|
|
623
775
|
|
|
624
776
|
def pypdfium2_available() -> bool:
|
|
625
777
|
"""
|
|
626
|
-
Returns
|
|
778
|
+
Returns whether `pypdfium2` is installed.
|
|
779
|
+
|
|
780
|
+
Returns:
|
|
781
|
+
bool: `True` if `pypdfium2` is installed, `False` otherwise.
|
|
627
782
|
"""
|
|
628
783
|
return bool(_PYPDFIUM2_AVAILABLE)
|
|
629
784
|
|
|
630
785
|
|
|
631
786
|
def get_pypdfium2_requirement() -> Requirement:
|
|
632
787
|
"""
|
|
633
|
-
|
|
788
|
+
Returns the `pypdfium2` requirement.
|
|
789
|
+
|
|
790
|
+
Returns:
|
|
791
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
634
792
|
"""
|
|
635
793
|
return "pypdfium2", pypdfium2_available(), _PYPDFIUM2_ERR_MSG
|
|
636
794
|
|
|
@@ -642,7 +800,10 @@ _SPACY_ERR_MSG = f"SpaCy must be installed. {_GENERIC_ERR_MSG}"
|
|
|
642
800
|
|
|
643
801
|
def spacy_available() -> bool:
|
|
644
802
|
"""
|
|
645
|
-
Returns
|
|
803
|
+
Returns whether SpaCy is installed.
|
|
804
|
+
|
|
805
|
+
Returns:
|
|
806
|
+
bool: True if SpaCy is installed, False otherwise.
|
|
646
807
|
"""
|
|
647
808
|
|
|
648
809
|
return bool(_SPACY_AVAILABLE)
|
|
@@ -650,20 +811,21 @@ def spacy_available() -> bool:
|
|
|
650
811
|
|
|
651
812
|
def get_spacy_requirement() -> Requirement:
|
|
652
813
|
"""
|
|
653
|
-
|
|
814
|
+
Returns the SpaCy requirement.
|
|
815
|
+
|
|
816
|
+
Returns:
|
|
817
|
+
tuple: A tuple containing the package name, whether the requirement is satisfied, and an error message.
|
|
654
818
|
"""
|
|
655
819
|
return "spacy", spacy_available(), _SPACY_ERR_MSG
|
|
656
820
|
|
|
657
821
|
|
|
658
822
|
def set_mp_spawn() -> None:
|
|
659
823
|
"""
|
|
660
|
-
Sets multiprocessing method to "spawn".
|
|
824
|
+
Sets the multiprocessing method to "spawn".
|
|
661
825
|
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
produce more deterministic behavior & memory saving
|
|
666
|
-
However its limitation is you cannot pass a lambda function to subprocesses.
|
|
826
|
+
Note:
|
|
827
|
+
"spawn/forkserver" is safer than the "fork" start method, produces more deterministic behavior, and saves
|
|
828
|
+
memory. However, its limitation is that you cannot pass a lambda function to subprocesses.
|
|
667
829
|
"""
|
|
668
830
|
|
|
669
831
|
if not _S.mp_context_set:
|
|
@@ -679,10 +841,23 @@ def set_mp_spawn() -> None:
|
|
|
679
841
|
class _LazyModule(ModuleType):
|
|
680
842
|
"""
|
|
681
843
|
Module class that surfaces all objects but only performs associated imports when the objects are requested.
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
Note:
|
|
847
|
+
This class is needed for autocompletion in an IDE.
|
|
682
848
|
"""
|
|
683
849
|
|
|
684
850
|
@no_type_check
|
|
685
851
|
def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None):
|
|
852
|
+
"""
|
|
853
|
+
Args:
|
|
854
|
+
name: The name of the module.
|
|
855
|
+
module_file: The file path of the module.
|
|
856
|
+
import_structure: The import structure dictionary.
|
|
857
|
+
module_spec: The module specification.
|
|
858
|
+
extra_objects: Additional objects to include.
|
|
859
|
+
|
|
860
|
+
"""
|
|
686
861
|
super().__init__(name)
|
|
687
862
|
self._modules = set(import_structure.keys())
|
|
688
863
|
self._class_to_module = {}
|