deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries and is provided for informational purposes only.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +38 -29
- deepdoctection/analyzer/dd.py +36 -29
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/base.py +0 -19
- deepdoctection/dataflow/custom.py +4 -3
- deepdoctection/dataflow/custom_serialize.py +14 -5
- deepdoctection/dataflow/parallel_map.py +12 -11
- deepdoctection/dataflow/serialize.py +5 -4
- deepdoctection/datapoint/annotation.py +35 -13
- deepdoctection/datapoint/box.py +3 -5
- deepdoctection/datapoint/convert.py +3 -1
- deepdoctection/datapoint/image.py +79 -36
- deepdoctection/datapoint/view.py +152 -49
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +6 -3
- deepdoctection/datasets/base.py +86 -11
- deepdoctection/datasets/dataflow_builder.py +1 -1
- deepdoctection/datasets/info.py +4 -4
- deepdoctection/datasets/instances/doclaynet.py +3 -2
- deepdoctection/datasets/instances/fintabnet.py +2 -1
- deepdoctection/datasets/instances/funsd.py +2 -1
- deepdoctection/datasets/instances/iiitar13k.py +5 -2
- deepdoctection/datasets/instances/layouttest.py +4 -8
- deepdoctection/datasets/instances/publaynet.py +2 -2
- deepdoctection/datasets/instances/pubtables1m.py +6 -3
- deepdoctection/datasets/instances/pubtabnet.py +2 -1
- deepdoctection/datasets/instances/rvlcdip.py +2 -1
- deepdoctection/datasets/instances/xfund.py +2 -1
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +1 -1
- deepdoctection/eval/base.py +5 -4
- deepdoctection/eval/cocometric.py +2 -1
- deepdoctection/eval/eval.py +19 -15
- deepdoctection/eval/tedsmetric.py +14 -11
- deepdoctection/eval/tp_eval_callback.py +14 -7
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +39 -13
- deepdoctection/extern/d2detect.py +182 -90
- deepdoctection/extern/deskew.py +36 -9
- deepdoctection/extern/doctrocr.py +265 -83
- deepdoctection/extern/fastlang.py +49 -9
- deepdoctection/extern/hfdetr.py +106 -55
- deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection/extern/hflm.py +225 -0
- deepdoctection/extern/model.py +56 -47
- deepdoctection/extern/pdftext.py +10 -5
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +27 -18
- deepdoctection/extern/tessocr.py +134 -22
- deepdoctection/extern/texocr.py +6 -2
- deepdoctection/extern/tp/tfutils.py +43 -9
- deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +54 -30
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/d2struct.py +9 -7
- deepdoctection/mapper/hfstruct.py +7 -2
- deepdoctection/mapper/laylmstruct.py +164 -21
- deepdoctection/mapper/maputils.py +16 -3
- deepdoctection/mapper/misc.py +6 -3
- deepdoctection/mapper/prodigystruct.py +1 -1
- deepdoctection/mapper/pubstruct.py +10 -10
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +35 -8
- deepdoctection/pipe/base.py +53 -19
- deepdoctection/pipe/common.py +23 -13
- deepdoctection/pipe/concurrency.py +2 -1
- deepdoctection/pipe/doctectionpipe.py +2 -2
- deepdoctection/pipe/language.py +3 -2
- deepdoctection/pipe/layout.py +6 -3
- deepdoctection/pipe/lm.py +34 -66
- deepdoctection/pipe/order.py +142 -35
- deepdoctection/pipe/refine.py +26 -24
- deepdoctection/pipe/segment.py +21 -16
- deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
- deepdoctection/pipe/text.py +14 -8
- deepdoctection/pipe/transform.py +16 -9
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +36 -28
- deepdoctection/train/hf_detr_train.py +26 -17
- deepdoctection/train/hf_layoutlm_train.py +133 -111
- deepdoctection/train/tp_frcnn_train.py +21 -19
- deepdoctection/utils/__init__.py +3 -0
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +2 -2
- deepdoctection/utils/env_info.py +41 -84
- deepdoctection/utils/error.py +84 -0
- deepdoctection/utils/file_utils.py +4 -15
- deepdoctection/utils/fs.py +7 -7
- deepdoctection/utils/logger.py +1 -0
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +5 -4
- deepdoctection/utils/settings.py +6 -1
- deepdoctection/utils/transform.py +1 -1
- deepdoctection/utils/utils.py +0 -6
- deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
- deepdoctection-0.32.dist-info/RECORD +146 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
- deepdoctection-0.30.dist-info/RECORD +0 -143
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
- {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/utils/env_info.py
CHANGED

```diff
@@ -46,7 +46,6 @@ can store an (absolute) path to a `.jsonl` file.
 
 """
 
-import ast
 import importlib
 import os
 import re
@@ -56,6 +55,7 @@ from collections import defaultdict
 from typing import List, Optional, Tuple
 
 import numpy as np
+from packaging import version
 from tabulate import tabulate
 
 from .file_utils import (
@@ -68,6 +68,7 @@ from .file_utils import (
     fasttext_available,
     get_poppler_version,
     get_tesseract_version,
+    get_tf_version,
     jdeskew_available,
     lxml_available,
     opencv_available,
@@ -84,13 +85,9 @@ from .file_utils import (
     transformers_available,
     wandb_available,
 )
-from .logger import LoggingRecord, logger
 
 __all__ = [
-    "collect_torch_env",
     "collect_env_info",
-    "get_device",
-    "auto_select_lib_and_device",
     "auto_select_viz_library",
 ]
 
@@ -270,7 +267,22 @@ def tf_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
     if tf_available():
         import tensorflow as tf  # type: ignore # pylint: disable=E0401
 
+        os.environ["TENSORFLOW_AVAILABLE"] = "1"
+
         data.append(("Tensorflow", tf.__version__))
+        if version.parse(get_tf_version()) > version.parse("2.4.1"):
+            os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+            try:
+                import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
+
+                deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
+            except Exception:  # pylint: disable=W0703
+                try:
+                    from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
+
+                    deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
+                except Exception:  # pylint: disable=W0703
+                    pass
     else:
         data.append(("Tensorflow", "None"))
     return data
@@ -279,12 +291,18 @@ def tf_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
 
         try:
             for key, value in list(build_info.build_info.items()):
-                if key == "cuda_version":
+                if key == "is_cuda_build":
+                    data.append(("TF compiled with CUDA", value))
+                    if value and len(tf.config.list_physical_devices('GPU')):
+                        os.environ["USE_CUDA"] = "1"
+                elif key == "cuda_version":
                     data.append(("TF built with CUDA", value))
                 elif key == "cudnn_version":
                     data.append(("TF built with CUDNN", value))
                 elif key == "cuda_compute_capabilities":
                     data.append(("TF compute capabilities", ",".join([k.replace("compute_", "") for k in value])))
+                elif key == "is_rocm_build":
+                    data.append(("TF compiled with ROCM", value))
             return data
         except AttributeError:
             pass
@@ -306,6 +324,13 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
 
     if pytorch_available():
         import torch
+
+        os.environ["PYTORCH_AVAILABLE"] = "1"
+
+    else:
+        data.append(("PyTorch", "None"))
+        return []
+
     has_gpu = torch.cuda.is_available()  # true for both CUDA & ROCM
     has_mps = torch.backends.mps.is_available()
 
@@ -331,12 +356,9 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
     data.append(("PyTorch", torch_version + " @" + os.path.dirname(torch.__file__)))
     data.append(("PyTorch debug build", str(torch.version.debug)))
 
-    if not has_gpu:
-        has_gpu_text = "No: torch.cuda.is_available() == False"
-    else:
-        has_gpu_text = "Yes"
-    data.append(("GPU available", has_gpu_text))
     if has_gpu:
+        os.environ["USE_CUDA"] = "1"
+        has_gpu_text = "Yes"
         devices = defaultdict(list)
         for k in range(torch.cuda.device_count()):
             cap = ".".join((str(x) for x in torch.cuda.get_device_capability(k)))
@@ -362,6 +384,10 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
         cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
         if cuda_arch_list:
             data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
+    else:
+        has_gpu_text = "No: torch.cuda.is_available() == False"
+
+    data.append(("GPU available", has_gpu_text))
 
     mps_build = "No: torch.backends.mps.is_built() == False"
     if not has_mps:
@@ -369,9 +395,11 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
     else:
         has_mps_text = "Yes"
         mps_build = str(torch.backends.mps.is_built())
+        if mps_build == "True":
+            os.environ["USE_MPS"] = "1"
 
     data.append(("MPS available", has_mps_text))
-    data.append(("MPS
+    data.append(("MPS built", mps_build))
 
     try:
         import torchvision  # type: ignore
@@ -420,7 +448,7 @@ def collect_env_info() -> str:
     try:
         import prctl  # type: ignore
 
-        _ = prctl.set_pdeathsig  #
+        _ = prctl.set_pdeathsig  # pylint: disable=E1101
     except ModuleNotFoundError:
         has_prctl = False
     data.append(("python-prctl", str(has_prctl)))
@@ -452,77 +480,6 @@ def collect_env_info() -> str:
     return env_str
 
 
-def auto_select_lib_and_device() -> None:
-    """
-    Select the DL library and subsequently the device.
-    This will set environment variable `USE_TENSORFLOW`, `USE_PYTORCH` and `USE_CUDA`
-
-    If TF is available, use TF unless a GPU is not available, in which case choose PT. If CUDA is not available and PT
-    is not installed raise ImportError.
-    """
-
-    if tf_available() and tensorpack_available():
-        from tensorpack.utils.gpu import get_num_gpu  # pylint: disable=E0401
-
-        if get_num_gpu() >= 1:
-            os.environ["USE_TENSORFLOW"] = "True"
-            os.environ["USE_PYTORCH"] = "False"
-            os.environ["USE_CUDA"] = "True"
-            os.environ["USE_MPS"] = "False"
-            return
-        if pytorch_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "False"
-            return
-        logger.warning(
-            LoggingRecord("You have Tensorflow installed but no GPU is available. All Tensorflow models require a GPU.")
-        )
-    if pytorch_available():
-        import torch
-
-        if torch.cuda.is_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "True"
-            return
-        if torch.backends.mps.is_available():
-            os.environ["USE_TENSORFLOW"] = "False"
-            os.environ["USE_PYTORCH"] = "True"
-            os.environ["USE_CUDA"] = "False"
-            os.environ["USE_MPS"] = "True"
-            return
-        os.environ["USE_TENSORFLOW"] = "False"
-        os.environ["USE_PYTORCH"] = "True"
-        os.environ["USE_CUDA"] = "False"
-        os.environ["USE_MPS"] = "False"
-        return
-    logger.warning(
-        LoggingRecord(
-            "Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
-            "model from the library."
-        )
-    )
-
-
-def get_device(ignore_cpu: bool = True) -> str:
-    """
-    Device checks for running PyTorch with CUDA, MPS or optionall CPU.
-    If nothing can be found and if `disable_cpu` is deactivated it will raise a `ValueError`
-
-    :param ignore_cpu: Will not consider `cpu` as valid return value
-    :return: Either cuda or mps
-    """
-
-    if ast.literal_eval(os.environ.get("USE_CUDA", "True")):
-        return "cuda"
-    if ast.literal_eval(os.environ.get("USE_MPS", "True")):
-        return "mps"
-    if not ignore_cpu:
-        return "cpu"
-    raise ValueError("Could not find either GPU nor MPS")
-
-
 def auto_select_viz_library() -> None:
     """Setting PIL as default image library if cv2 is not installed"""
 
```
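Note on the hunks above: 0.32 drops `get_device` and `auto_select_lib_and_device`; `env_info.py` now only records what it detects in environment variables such as `TENSORFLOW_AVAILABLE`, `PYTORCH_AVAILABLE`, `USE_CUDA` and `USE_MPS` (each set to `"1"` when found). A minimal sketch of how calling code could map those flags to a PyTorch device; `pick_device` is a hypothetical helper written for illustration, not a function of the package:

```python
# Hypothetical helper (not part of deepdoctection): maps the "USE_CUDA" / "USE_MPS"
# flags written by env_info.py in 0.32 to a torch.device.
import os

import torch


def pick_device(allow_cpu: bool = True) -> torch.device:
    if os.environ.get("USE_CUDA") == "1":
        return torch.device("cuda")
    if os.environ.get("USE_MPS") == "1":
        return torch.device("mps")
    if allow_cpu:
        return torch.device("cpu")
    raise RuntimeError("Neither CUDA nor MPS is available")


print(pick_device())  # e.g. device(type='cuda') on a CUDA machine
```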
deepdoctection/utils/error.py
ADDED

```diff
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+# File: error.py
+
+# Copyright 2024 Dr. Janis Meyer. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Module for custom exceptions
+"""
+
+
+class BoundingBoxError(BaseException):
+    """Special exception only for `datapoint.box.BoundingBox`"""
+
+
+class AnnotationError(BaseException):
+    """Special exception only for `datapoint.annotation.Annotation`"""
+
+
+class ImageError(BaseException):
+    """Special exception only for `datapoint.image.Image`"""
+
+
+class UUIDError(BaseException):
+    """Special exception only for `utils.identifier`"""
+
+
+class DependencyError(BaseException):
+    """Special exception only for missing dependencies. We do not use the internals ImportError or
+    ModuleNotFoundError."""
+
+
+class DataFlowTerminatedError(BaseException):
+    """
+    An exception indicating that the DataFlow is unable to produce any more
+    data, i.e. something wrong happened so that calling `__iter__`
+    cannot give a valid iterator anymore.
+    In most DataFlow this will never be raised.
+    """
+
+
+class DataFlowResetStateNotCalledError(BaseException):
+    """
+    An exception indicating that `reset_state()` has not been called before starting
+    iteration.
+    """
+
+    def __init__(self) -> None:
+        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
+
+
+class MalformedData(BaseException):
+    """
+    Exception class for malformed data. Use this class if something does not look right with the data
+    """
+
+
+class FileExtensionError(BaseException):
+    """
+    Exception class for wrong file extensions.
+    """
+
+
+class TesseractError(RuntimeError):
+    """
+    Tesseract Error
+    """
+
+    def __init__(self, status: int, message: str) -> None:
+        super().__init__()
+        self.status = status
+        self.message = message
+        self.args = (status, message)
```
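The new `error.py` centralises exceptions that previously lived next to their call sites (`FileExtensionError` in `utils.py`, `TesseractNotFound`/`PopplerNotFound` in `file_utils.py`). A short sketch of catching them, assuming deepdoctection 0.32 is installed; `run_some_ocr` is a placeholder, not a library function:

```python
# The exception classes and the TesseractError(status, message) signature come from
# the diff above; run_some_ocr() is a stand-in for a code path that calls Tesseract.
from deepdoctection.utils.error import DependencyError, FileExtensionError, TesseractError


def run_some_ocr() -> None:
    raise TesseractError(1, "tesseract returned a non-zero exit code")


try:
    run_some_ocr()
except DependencyError as err:
    print(f"missing system dependency: {err}")
except TesseractError as err:
    print(f"tesseract failed with status {err.status}: {err.message}")
except FileExtensionError as err:
    print(f"unsupported file type: {err}")
```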
deepdoctection/utils/file_utils.py
CHANGED

```diff
@@ -22,6 +22,7 @@ import importlib_metadata
 from packaging import version
 
 from .detection_types import Requirement
+from .error import DependencyError
 from .logger import LoggingRecord, logger
 from .metacfg import AttrDict
 
@@ -263,7 +264,7 @@ def set_tesseract_path(tesseract_path: str) -> None:
     :param tesseract_path: Tesseract installation path.
     """
     if tesseract_path is None:
-        raise
+        raise TypeError("tesseract_path cannot be None")
 
     global _TESS_AVAILABLE  # pylint: disable=W0603
     global _TESS_PATH  # pylint: disable=W0603
@@ -288,12 +289,6 @@ def tesseract_available() -> bool:
 # copy paste from https://github.com/madmaze/pytesseract/blob/master/pytesseract/pytesseract.py
 
 
-class TesseractNotFound(BaseException):
-    """
-    Exception class for Tesseract being not found
-    """
-
-
 def get_tesseract_version() -> Union[int, version.Version]:
     """
     Returns Version object of the Tesseract version. We need at least Tesseract 3.05
@@ -306,7 +301,7 @@ def get_tesseract_version() -> Union[int, version.Version]:
             stdin=subprocess.DEVNULL,
         )
     except OSError:
-        raise
+        raise DependencyError(_TESS_ERR_MSG) from OSError
 
     raw_version = output.decode("utf-8")
     str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(" ")
@@ -348,12 +343,6 @@ def pdf_to_cairo_available() -> bool:
     return bool(_PDF_TO_CAIRO_AVAILABLE)
 
 
-class PopplerNotFound(BaseException):
-    """
-    Exception class for Poppler being not found
-    """
-
-
 def get_poppler_version() -> Union[int, version.Version]:
     """
     Returns Version object of the Poppler version. We need at least Tesseract 3.05
@@ -371,7 +360,7 @@ def get_poppler_version() -> Union[int, version.Version]:
             [command, "-v"], stderr=subprocess.STDOUT, env=environ, stdin=subprocess.DEVNULL
         )
     except OSError:
-        raise
+        raise DependencyError(_POPPLER_ERR_MSG) from OSError
 
     raw_version = output.decode("utf-8")
     list_version = raw_version.split("\n", maxsplit=1)[0].split(" ")[-1].split(".")
```
deepdoctection/utils/fs.py
CHANGED

```diff
@@ -34,7 +34,7 @@ from .logger import LoggingRecord, logger
 from .pdf_utils import get_pdf_file_reader, get_pdf_file_writer
 from .settings import CONFIGS, DATASET_DIR, MODEL_DIR, PATH
 from .tqdm import get_tqdm
-from .utils import
+from .utils import is_file_extension
 from .viz import viz_handler
 
 __all__ = [
@@ -44,9 +44,7 @@ __all__ = [
     "maybe_path_or_pdf",
     "download",
     "mkdir_p",
-    "is_file_extension",
     "load_json",
-    "FileExtensionError",
     "sub_path",
     "get_package_path",
     "get_configs_dir_path",
@@ -125,8 +123,8 @@ def download(url: str, directory: Pathlike, file_name: Optional[str] = None, exp
     assert size > 0, f"Downloaded an empty file from {url}!"
 
     if expect_size is not None and size != expect_size:
-        logger.
-        logger.
+        logger.warning(LoggingRecord(f"File downloaded from {url} does not match the expected size!"))
+        logger.warning(
             LoggingRecord("You may have downloaded a broken file, or the upstream may have modified the file.")
         )
 
@@ -210,13 +208,15 @@ def get_load_image_func(
     :return: The function loading the file (and converting to its desired format)
     """
 
-    assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: {path}"
+    assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: " f"{path}"
 
     if is_file_extension(path, [".png", ".jpeg", ".jpg", ".tif"]):
         return load_image_from_file
     if is_file_extension(path, [".pdf"]):
         return load_bytes_from_pdf_file
-
+    raise NotImplementedError(
+        "File extension not supported by any loader. Please specify a file type and raise an issue"
+    )
 
 
 def maybe_path_or_pdf(path: Pathlike) -> int:
```
deepdoctection/utils/logger.py
CHANGED

```diff
@@ -134,6 +134,7 @@ class FileFormatter(logging.Formatter):
 _LOG_DIR = None
 _CONFIG_DICT: Dict[str, Any] = {
     "version": 1,
+    "disable_existing_loggers": False,
     "filters": {"customfilter": {"()": lambda: CustomFilter()}},  # pylint: disable=W0108
     "formatters": {
         "streamformatter": {"()": lambda: StreamFormatter(datefmt="%m%d %H:%M.%S")},
```
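The only change here is `"disable_existing_loggers": False`. With the `dictConfig` default of `True`, any logger created before `logging.config.dictConfig` runs (typically loggers of libraries imported earlier) is silently disabled. A stdlib-only demonstration of the flag, independent of deepdoctection:

```python
import logging
import logging.config

third_party = logging.getLogger("some.library")  # created before configuration

logging.config.dictConfig(
    {
        "version": 1,
        "disable_existing_loggers": False,  # keep pre-existing loggers enabled
        "handlers": {"console": {"class": "logging.StreamHandler"}},
        "root": {"handlers": ["console"], "level": "INFO"},
    }
)

third_party.warning("still emitted because existing loggers were not disabled")
```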
deepdoctection/utils/mocks.py
ADDED

```diff
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+# File: mocks.py
+
+# Copyright 2024 Dr. Janis Meyer. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Some classes with the purpose to mock the original classes from the Tensorpack library, if Tensorpack is not installed
+"""
+
+from deepdoctection.utils.error import DependencyError
+
+
+def layer_register(log_shape):  # pylint: disable=W0613
+    """Mock layer_register function from tensorpack."""
+
+    def inner(inputs):  # pylint: disable=W0613
+        pass
+
+    return inner
+
+
+def under_name_scope():
+    """Mock under_name_scope function from tensorpack."""
+
+    def inner(inputs):  # pylint: disable=W0613
+        pass
+
+    return inner
+
+
+def memoized(func):
+    """Mock memoized function from tensorpack."""
+    return func
+
+
+def memoized_method(func):
+    """Mock memoized_method function from tensorpack."""
+    return func
+
+
+def auto_reuse_variable_scope(inputs):  # pylint: disable=W0613
+    """Mock auto_reuse_variable_scope function from tensorpack."""
+
+
+class ModelDesc:  # pylint: disable=R0903
+    """Mock ModelDesc class from tensorpack."""
+
+    def __init__(self) -> None:
+        raise DependencyError("Tensorpack not found.")
+
+
+class ImageAugmentor:  # pylint: disable=R0903
+    """Mock ImageAugmentor class from tensorpack."""
+
+    def __init__(self) -> None:
+        raise DependencyError("Tensorpack not found.")
+
+
+class Callback:  # pylint: disable=R0903
+    """Mock Callback class from tensor"""
+
+    def __init__(self) -> None:
+        raise DependencyError("Tensorpack not found.")
+
+
+class Config:  # pylint: disable=R0903
+    """Mock class for Config"""
+
+    pass  # pylint: disable=W0107
+
+
+class Tree:  # pylint: disable=R0903
+    """Mock class for Tree"""
+
+    pass  # pylint: disable=W0107
+
+
+class IterableDataset:  # pylint: disable=R0903
+    """Mock class for IterableDataset"""
+
+    pass  # pylint: disable=W0107
```
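`mocks.py` lets Tensorpack-dependent modules import the names they need even when Tensorpack is absent; instantiating one of the mocked classes raises the new `DependencyError`. A sketch of the conditional-import pattern this enables; whether deepdoctection wires it exactly this way is an assumption, only the mock module itself comes from the diff:

```python
from deepdoctection.utils.error import DependencyError

try:
    from tensorpack import ModelDesc  # real class if Tensorpack is installed (assumed import path)
except ImportError:
    from deepdoctection.utils.mocks import ModelDesc  # placeholder that raises on use

try:
    model = ModelDesc()
except DependencyError as err:
    print(f"Tensorpack is required for this code path: {err}")
```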
deepdoctection/utils/pdf_utils.py
CHANGED

```diff
@@ -32,9 +32,10 @@ from pypdf import PdfReader, PdfWriter, errors
 
 from .context import save_tmp_file, timeout_manager
 from .detection_types import ImageType, Pathlike
-from .
+from .error import DependencyError, FileExtensionError
+from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
 from .logger import LoggingRecord, logger
-from .utils import
+from .utils import is_file_extension
 from .viz import viz_handler
 
 __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
@@ -165,7 +166,7 @@ def _input_to_cli_str(
     elif pdf_to_cairo_available():
         command = "pdftocairo"
     else:
-        raise
+        raise DependencyError("Poppler not found. Please install or add to your PATH.")
 
     if platform.system() == "Windows":
         command = command + ".exe"
@@ -201,7 +202,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
     except OSError as error:
         if error.errno != ENOENT:
             raise error from error
-        raise
+        raise DependencyError("Poppler not found. Please install or add to your PATH.") from error
 
     with timeout_manager(proc, 0):
         if proc.returncode:
```
deepdoctection/utils/settings.py
CHANGED

```diff
@@ -65,6 +65,7 @@ class PageType(ObjectTypes):
 
     document_type = "document_type"
     language = "language"
+    angle = "angle"
 
 
 @object_types_registry.register("SummaryType")
@@ -125,6 +126,7 @@ class LayoutType(ObjectTypes):
     column = "column"
     word = "word"
     line = "line"
+    background = "background"
 
 
 @object_types_registry.register("TableType")
@@ -291,6 +293,7 @@ class DatasetType(ObjectTypes):
     sequence_classification = "sequence_classification"
     token_classification = "token_classification"
    publaynet = "publaynet"
+    default = "default"
 
 
 _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
@@ -324,7 +327,9 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes
     """
     if isinstance(token, TokenClasses) and isinstance(tag, BioTag):
         return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
-    raise TypeError(
+    raise TypeError(
+        f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
+    )
 
 
 def token_class_with_tag_to_token_class_and_tag(
```
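Three new members are registered here: `PageType.angle` (presumably the page's rotation/deskew angle), `LayoutType.background` and `DatasetType.default`. A small sketch of referring to them, assuming the `ObjectTypes` enums expose string values like their existing members:

```python
from deepdoctection.utils.settings import DatasetType, LayoutType, PageType

print(PageType.angle.value)       # "angle"
print(DatasetType.default.value)  # "default"

# e.g. drop background regions from a list of (category, score) detections
detections = [(LayoutType.word, 0.97), (LayoutType.background, 0.55)]
kept = [(cat, score) for cat, score in detections if cat != LayoutType.background]
print(kept)
```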
deepdoctection/utils/transform.py
CHANGED

```diff
@@ -47,7 +47,7 @@ class BaseTransform(ABC):
     @abstractmethod
     def apply_image(self, img: ImageType) -> ImageType:
         """The transformation that should be applied to the image"""
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class ResizeTransform(BaseTransform):
```
deepdoctection/utils/utils.py
CHANGED

```diff
@@ -144,12 +144,6 @@ def get_rng(obj: Any = None) -> np.random.RandomState:
     return np.random.RandomState(seed)
 
 
-class FileExtensionError(BaseException):
-    """
-    An exception indicating that a file does not seem to have an expected type
-    """
-
-
 def is_file_extension(file_name: Pathlike, extension: Union[str, Sequence[str]]) -> bool:
     """
     Check if a given file name has a given extension
```
|