deepdoctection-0.30-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
@@ -46,7 +46,6 @@ can store an (absolute) path to a `.jsonl` file.
46
46
 
47
47
  """
48
48
 
49
- import ast
50
49
  import importlib
51
50
  import os
52
51
  import re
@@ -56,6 +55,7 @@ from collections import defaultdict
56
55
  from typing import List, Optional, Tuple
57
56
 
58
57
  import numpy as np
58
+ from packaging import version
59
59
  from tabulate import tabulate
60
60
 
61
61
  from .file_utils import (
@@ -68,6 +68,7 @@ from .file_utils import (
68
68
  fasttext_available,
69
69
  get_poppler_version,
70
70
  get_tesseract_version,
71
+ get_tf_version,
71
72
  jdeskew_available,
72
73
  lxml_available,
73
74
  opencv_available,
@@ -84,13 +85,9 @@ from .file_utils import (
84
85
  transformers_available,
85
86
  wandb_available,
86
87
  )
87
- from .logger import LoggingRecord, logger
88
88
 
89
89
  __all__ = [
90
- "collect_torch_env",
91
90
  "collect_env_info",
92
- "get_device",
93
- "auto_select_lib_and_device",
94
91
  "auto_select_viz_library",
95
92
  ]
96
93
 
@@ -270,7 +267,22 @@ def tf_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
270
267
  if tf_available():
271
268
  import tensorflow as tf # type: ignore # pylint: disable=E0401
272
269
 
270
+ os.environ["TENSORFLOW_AVAILABLE"] = "1"
271
+
273
272
  data.append(("Tensorflow", tf.__version__))
273
+ if version.parse(get_tf_version()) > version.parse("2.4.1"):
274
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
275
+ try:
276
+ import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
277
+
278
+ deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
279
+ except Exception: # pylint: disable=W0703
280
+ try:
281
+ from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
282
+
283
+ deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
284
+ except Exception: # pylint: disable=W0703
285
+ pass
274
286
  else:
275
287
  data.append(("Tensorflow", "None"))
276
288
  return data
@@ -279,12 +291,18 @@ def tf_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
279
291
 
280
292
  try:
281
293
  for key, value in list(build_info.build_info.items()):
282
- if key == "cuda_version":
294
+ if key == "is_cuda_build":
295
+ data.append(("TF compiled with CUDA", value))
296
+ if value and len(tf.config.list_physical_devices('GPU')):
297
+ os.environ["USE_CUDA"] = "1"
298
+ elif key == "cuda_version":
283
299
  data.append(("TF built with CUDA", value))
284
300
  elif key == "cudnn_version":
285
301
  data.append(("TF built with CUDNN", value))
286
302
  elif key == "cuda_compute_capabilities":
287
303
  data.append(("TF compute capabilities", ",".join([k.replace("compute_", "") for k in value])))
304
+ elif key == "is_rocm_build":
305
+ data.append(("TF compiled with ROCM", value))
288
306
  return data
289
307
  except AttributeError:
290
308
  pass
@@ -306,6 +324,13 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
306
324
 
307
325
  if pytorch_available():
308
326
  import torch
327
+
328
+ os.environ["PYTORCH_AVAILABLE"] = "1"
329
+
330
+ else:
331
+ data.append(("PyTorch", "None"))
332
+ return []
333
+
309
334
  has_gpu = torch.cuda.is_available() # true for both CUDA & ROCM
310
335
  has_mps = torch.backends.mps.is_available()
311
336
 
@@ -331,12 +356,9 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
331
356
  data.append(("PyTorch", torch_version + " @" + os.path.dirname(torch.__file__)))
332
357
  data.append(("PyTorch debug build", str(torch.version.debug)))
333
358
 
334
- if not has_gpu:
335
- has_gpu_text = "No: torch.cuda.is_available() == False"
336
- else:
337
- has_gpu_text = "Yes"
338
- data.append(("GPU available", has_gpu_text))
339
359
  if has_gpu:
360
+ os.environ["USE_CUDA"] = "1"
361
+ has_gpu_text = "Yes"
340
362
  devices = defaultdict(list)
341
363
  for k in range(torch.cuda.device_count()):
342
364
  cap = ".".join((str(x) for x in torch.cuda.get_device_capability(k)))
@@ -362,6 +384,10 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
362
384
  cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
363
385
  if cuda_arch_list:
364
386
  data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
387
+ else:
388
+ has_gpu_text = "No: torch.cuda.is_available() == False"
389
+
390
+ data.append(("GPU available", has_gpu_text))
365
391
 
366
392
  mps_build = "No: torch.backends.mps.is_built() == False"
367
393
  if not has_mps:
@@ -369,9 +395,11 @@ def pt_info(data: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
369
395
  else:
370
396
  has_mps_text = "Yes"
371
397
  mps_build = str(torch.backends.mps.is_built())
398
+ if mps_build == "True":
399
+ os.environ["USE_MPS"] = "1"
372
400
 
373
401
  data.append(("MPS available", has_mps_text))
374
- data.append(("MPS available", mps_build))
402
+ data.append(("MPS built", mps_build))
375
403
 
376
404
  try:
377
405
  import torchvision # type: ignore
@@ -420,7 +448,7 @@ def collect_env_info() -> str:
420
448
  try:
421
449
  import prctl # type: ignore
422
450
 
423
- _ = prctl.set_pdeathsig # noqa
451
+ _ = prctl.set_pdeathsig # pylint: disable=E1101
424
452
  except ModuleNotFoundError:
425
453
  has_prctl = False
426
454
  data.append(("python-prctl", str(has_prctl)))
@@ -452,77 +480,6 @@ def collect_env_info() -> str:
452
480
  return env_str
453
481
 
454
482
 
455
- def auto_select_lib_and_device() -> None:
456
- """
457
- Select the DL library and subsequently the device.
458
- This will set environment variable `USE_TENSORFLOW`, `USE_PYTORCH` and `USE_CUDA`
459
-
460
- If TF is available, use TF unless a GPU is not available, in which case choose PT. If CUDA is not available and PT
461
- is not installed raise ImportError.
462
- """
463
-
464
- if tf_available() and tensorpack_available():
465
- from tensorpack.utils.gpu import get_num_gpu # pylint: disable=E0401
466
-
467
- if get_num_gpu() >= 1:
468
- os.environ["USE_TENSORFLOW"] = "True"
469
- os.environ["USE_PYTORCH"] = "False"
470
- os.environ["USE_CUDA"] = "True"
471
- os.environ["USE_MPS"] = "False"
472
- return
473
- if pytorch_available():
474
- os.environ["USE_TENSORFLOW"] = "False"
475
- os.environ["USE_PYTORCH"] = "True"
476
- os.environ["USE_CUDA"] = "False"
477
- return
478
- logger.warning(
479
- LoggingRecord("You have Tensorflow installed but no GPU is available. All Tensorflow models require a GPU.")
480
- )
481
- if pytorch_available():
482
- import torch
483
-
484
- if torch.cuda.is_available():
485
- os.environ["USE_TENSORFLOW"] = "False"
486
- os.environ["USE_PYTORCH"] = "True"
487
- os.environ["USE_CUDA"] = "True"
488
- return
489
- if torch.backends.mps.is_available():
490
- os.environ["USE_TENSORFLOW"] = "False"
491
- os.environ["USE_PYTORCH"] = "True"
492
- os.environ["USE_CUDA"] = "False"
493
- os.environ["USE_MPS"] = "True"
494
- return
495
- os.environ["USE_TENSORFLOW"] = "False"
496
- os.environ["USE_PYTORCH"] = "True"
497
- os.environ["USE_CUDA"] = "False"
498
- os.environ["USE_MPS"] = "False"
499
- return
500
- logger.warning(
501
- LoggingRecord(
502
- "Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
503
- "model from the library."
504
- )
505
- )
506
-
507
-
508
- def get_device(ignore_cpu: bool = True) -> str:
509
- """
510
- Device checks for running PyTorch with CUDA, MPS or optionall CPU.
511
- If nothing can be found and if `disable_cpu` is deactivated it will raise a `ValueError`
512
-
513
- :param ignore_cpu: Will not consider `cpu` as valid return value
514
- :return: Either cuda or mps
515
- """
516
-
517
- if ast.literal_eval(os.environ.get("USE_CUDA", "True")):
518
- return "cuda"
519
- if ast.literal_eval(os.environ.get("USE_MPS", "True")):
520
- return "mps"
521
- if not ignore_cpu:
522
- return "cpu"
523
- raise ValueError("Could not find either GPU nor MPS")
524
-
525
-
526
483
  def auto_select_viz_library() -> None:
527
484
  """Setting PIL as default image library if cv2 is not installed"""
528
485
 
@@ -0,0 +1,84 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: error.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Module for custom exceptions
20
+ """
21
+
22
+
23
+ class BoundingBoxError(BaseException):
24
+ """Special exception only for `datapoint.box.BoundingBox`"""
25
+
26
+
27
+ class AnnotationError(BaseException):
28
+ """Special exception only for `datapoint.annotation.Annotation`"""
29
+
30
+
31
+ class ImageError(BaseException):
32
+ """Special exception only for `datapoint.image.Image`"""
33
+
34
+
35
+ class UUIDError(BaseException):
36
+ """Special exception only for `utils.identifier`"""
37
+
38
+
39
+ class DependencyError(BaseException):
40
+ """Special exception only for missing dependencies. We do not use the internals ImportError or
41
+ ModuleNotFoundError."""
42
+
43
+
44
+ class DataFlowTerminatedError(BaseException):
45
+ """
46
+ An exception indicating that the DataFlow is unable to produce any more
47
+ data, i.e. something wrong happened so that calling `__iter__`
48
+ cannot give a valid iterator anymore.
49
+ In most DataFlow this will never be raised.
50
+ """
51
+
52
+
53
+ class DataFlowResetStateNotCalledError(BaseException):
54
+ """
55
+ An exception indicating that `reset_state()` has not been called before starting
56
+ iteration.
57
+ """
58
+
59
+ def __init__(self) -> None:
60
+ super().__init__("Iterating a dataflow requires .reset_state() to be called first")
61
+
62
+
63
+ class MalformedData(BaseException):
64
+ """
65
+ Exception class for malformed data. Use this class if something does not look right with the data
66
+ """
67
+
68
+
69
+ class FileExtensionError(BaseException):
70
+ """
71
+ Exception class for wrong file extensions.
72
+ """
73
+
74
+
75
+ class TesseractError(RuntimeError):
76
+ """
77
+ Tesseract Error
78
+ """
79
+
80
+ def __init__(self, status: int, message: str) -> None:
81
+ super().__init__()
82
+ self.status = status
83
+ self.message = message
84
+ self.args = (status, message)
@@ -22,6 +22,7 @@ import importlib_metadata
22
22
  from packaging import version
23
23
 
24
24
  from .detection_types import Requirement
25
+ from .error import DependencyError
25
26
  from .logger import LoggingRecord, logger
26
27
  from .metacfg import AttrDict
27
28
 
@@ -263,7 +264,7 @@ def set_tesseract_path(tesseract_path: str) -> None:
263
264
  :param tesseract_path: Tesseract installation path.
264
265
  """
265
266
  if tesseract_path is None:
266
- raise ValueError("tesseract_path is empty.")
267
+ raise TypeError("tesseract_path cannot be None")
267
268
 
268
269
  global _TESS_AVAILABLE # pylint: disable=W0603
269
270
  global _TESS_PATH # pylint: disable=W0603
@@ -288,12 +289,6 @@ def tesseract_available() -> bool:
288
289
  # copy paste from https://github.com/madmaze/pytesseract/blob/master/pytesseract/pytesseract.py
289
290
 
290
291
 
291
- class TesseractNotFound(BaseException):
292
- """
293
- Exception class for Tesseract being not found
294
- """
295
-
296
-
297
292
  def get_tesseract_version() -> Union[int, version.Version]:
298
293
  """
299
294
  Returns Version object of the Tesseract version. We need at least Tesseract 3.05
@@ -306,7 +301,7 @@ def get_tesseract_version() -> Union[int, version.Version]:
306
301
  stdin=subprocess.DEVNULL,
307
302
  )
308
303
  except OSError:
309
- raise TesseractNotFound(_TESS_ERR_MSG) from OSError
304
+ raise DependencyError(_TESS_ERR_MSG) from OSError
310
305
 
311
306
  raw_version = output.decode("utf-8")
312
307
  str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(" ")
@@ -348,12 +343,6 @@ def pdf_to_cairo_available() -> bool:
348
343
  return bool(_PDF_TO_CAIRO_AVAILABLE)
349
344
 
350
345
 
351
- class PopplerNotFound(BaseException):
352
- """
353
- Exception class for Poppler being not found
354
- """
355
-
356
-
357
346
  def get_poppler_version() -> Union[int, version.Version]:
358
347
  """
359
348
  Returns Version object of the Poppler version. We need at least Tesseract 3.05
@@ -371,7 +360,7 @@ def get_poppler_version() -> Union[int, version.Version]:
371
360
  [command, "-v"], stderr=subprocess.STDOUT, env=environ, stdin=subprocess.DEVNULL
372
361
  )
373
362
  except OSError:
374
- raise PopplerNotFound() from OSError
363
+ raise DependencyError(_POPPLER_ERR_MSG) from OSError
375
364
 
376
365
  raw_version = output.decode("utf-8")
377
366
  list_version = raw_version.split("\n", maxsplit=1)[0].split(" ")[-1].split(".")
@@ -34,7 +34,7 @@ from .logger import LoggingRecord, logger
34
34
  from .pdf_utils import get_pdf_file_reader, get_pdf_file_writer
35
35
  from .settings import CONFIGS, DATASET_DIR, MODEL_DIR, PATH
36
36
  from .tqdm import get_tqdm
37
- from .utils import FileExtensionError, is_file_extension
37
+ from .utils import is_file_extension
38
38
  from .viz import viz_handler
39
39
 
40
40
  __all__ = [
@@ -44,9 +44,7 @@ __all__ = [
44
44
  "maybe_path_or_pdf",
45
45
  "download",
46
46
  "mkdir_p",
47
- "is_file_extension",
48
47
  "load_json",
49
- "FileExtensionError",
50
48
  "sub_path",
51
49
  "get_package_path",
52
50
  "get_configs_dir_path",
@@ -125,8 +123,8 @@ def download(url: str, directory: Pathlike, file_name: Optional[str] = None, exp
125
123
  assert size > 0, f"Downloaded an empty file from {url}!"
126
124
 
127
125
  if expect_size is not None and size != expect_size:
128
- logger.error(LoggingRecord(f"File downloaded from {url} does not match the expected size!"))
129
- logger.error(
126
+ logger.warning(LoggingRecord(f"File downloaded from {url} does not match the expected size!"))
127
+ logger.warning(
130
128
  LoggingRecord("You may have downloaded a broken file, or the upstream may have modified the file.")
131
129
  )
132
130
 
@@ -210,13 +208,15 @@ def get_load_image_func(
210
208
  :return: The function loading the file (and converting to its desired format)
211
209
  """
212
210
 
213
- assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: {path}"
211
+ assert is_file_extension(path, [".png", ".jpeg", ".jpg", ".pdf", ".tif"]), f"image type not allowed: " f"{path}"
214
212
 
215
213
  if is_file_extension(path, [".png", ".jpeg", ".jpg", ".tif"]):
216
214
  return load_image_from_file
217
215
  if is_file_extension(path, [".pdf"]):
218
216
  return load_bytes_from_pdf_file
219
- return NotImplemented
217
+ raise NotImplementedError(
218
+ "File extension not supported by any loader. Please specify a file type and raise an issue"
219
+ )
220
220
 
221
221
 
222
222
  def maybe_path_or_pdf(path: Pathlike) -> int:
@@ -134,6 +134,7 @@ class FileFormatter(logging.Formatter):
134
134
  _LOG_DIR = None
135
135
  _CONFIG_DICT: Dict[str, Any] = {
136
136
  "version": 1,
137
+ "disable_existing_loggers": False,
137
138
  "filters": {"customfilter": {"()": lambda: CustomFilter()}}, # pylint: disable=W0108
138
139
  "formatters": {
139
140
  "streamformatter": {"()": lambda: StreamFormatter(datefmt="%m%d %H:%M.%S")},
@@ -0,0 +1,93 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: mocks.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Some classes with the purpose to mock the original classes from the Tensorpack library, if Tensorpack is not installed
20
+ """
21
+
22
+ from deepdoctection.utils.error import DependencyError
23
+
24
+
25
+ def layer_register(log_shape): # pylint: disable=W0613
26
+ """Mock layer_register function from tensorpack."""
27
+
28
+ def inner(inputs): # pylint: disable=W0613
29
+ pass
30
+
31
+ return inner
32
+
33
+
34
+ def under_name_scope():
35
+ """Mock under_name_scope function from tensorpack."""
36
+
37
+ def inner(inputs): # pylint: disable=W0613
38
+ pass
39
+
40
+ return inner
41
+
42
+
43
+ def memoized(func):
44
+ """Mock memoized function from tensorpack."""
45
+ return func
46
+
47
+
48
+ def memoized_method(func):
49
+ """Mock memoized_method function from tensorpack."""
50
+ return func
51
+
52
+
53
+ def auto_reuse_variable_scope(inputs): # pylint: disable=W0613
54
+ """Mock auto_reuse_variable_scope function from tensorpack."""
55
+
56
+
57
+ class ModelDesc: # pylint: disable=R0903
58
+ """Mock ModelDesc class from tensorpack."""
59
+
60
+ def __init__(self) -> None:
61
+ raise DependencyError("Tensorpack not found.")
62
+
63
+
64
+ class ImageAugmentor: # pylint: disable=R0903
65
+ """Mock ImageAugmentor class from tensorpack."""
66
+
67
+ def __init__(self) -> None:
68
+ raise DependencyError("Tensorpack not found.")
69
+
70
+
71
+ class Callback: # pylint: disable=R0903
72
+ """Mock Callback class from tensor"""
73
+
74
+ def __init__(self) -> None:
75
+ raise DependencyError("Tensorpack not found.")
76
+
77
+
78
+ class Config: # pylint: disable=R0903
79
+ """Mock class for Config"""
80
+
81
+ pass # pylint: disable=W0107
82
+
83
+
84
+ class Tree: # pylint: disable=R0903
85
+ """Mock class for Tree"""
86
+
87
+ pass # pylint: disable=W0107
88
+
89
+
90
+ class IterableDataset: # pylint: disable=R0903
91
+ """Mock class for IterableDataset"""
92
+
93
+ pass # pylint: disable=W0107
@@ -32,9 +32,10 @@ from pypdf import PdfReader, PdfWriter, errors
32
32
 
33
33
  from .context import save_tmp_file, timeout_manager
34
34
  from .detection_types import ImageType, Pathlike
35
- from .file_utils import PopplerNotFound, pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
35
+ from .error import DependencyError, FileExtensionError
36
+ from .file_utils import pdf_to_cairo_available, pdf_to_ppm_available, qpdf_available
36
37
  from .logger import LoggingRecord, logger
37
- from .utils import FileExtensionError, is_file_extension
38
+ from .utils import is_file_extension
38
39
  from .viz import viz_handler
39
40
 
40
41
  __all__ = ["decrypt_pdf_document", "get_pdf_file_reader", "get_pdf_file_writer", "PDFStreamer", "pdf_to_np_array"]
@@ -165,7 +166,7 @@ def _input_to_cli_str(
165
166
  elif pdf_to_cairo_available():
166
167
  command = "pdftocairo"
167
168
  else:
168
- raise PopplerNotFound("Poppler not found. Please install or add to your PATH.")
169
+ raise DependencyError("Poppler not found. Please install or add to your PATH.")
169
170
 
170
171
  if platform.system() == "Windows":
171
172
  command = command + ".exe"
@@ -201,7 +202,7 @@ def _run_poppler(poppler_args: List[str]) -> None:
201
202
  except OSError as error:
202
203
  if error.errno != ENOENT:
203
204
  raise error from error
204
- raise PopplerNotFound("Poppler not found. Please install or add to your PATH.") from error
205
+ raise DependencyError("Poppler not found. Please install or add to your PATH.") from error
205
206
 
206
207
  with timeout_manager(proc, 0):
207
208
  if proc.returncode:
@@ -65,6 +65,7 @@ class PageType(ObjectTypes):
65
65
 
66
66
  document_type = "document_type"
67
67
  language = "language"
68
+ angle = "angle"
68
69
 
69
70
 
70
71
  @object_types_registry.register("SummaryType")
@@ -125,6 +126,7 @@ class LayoutType(ObjectTypes):
125
126
  column = "column"
126
127
  word = "word"
127
128
  line = "line"
129
+ background = "background"
128
130
 
129
131
 
130
132
  @object_types_registry.register("TableType")
@@ -291,6 +293,7 @@ class DatasetType(ObjectTypes):
291
293
  sequence_classification = "sequence_classification"
292
294
  token_classification = "token_classification"
293
295
  publaynet = "publaynet"
296
+ default = "default"
294
297
 
295
298
 
296
299
  _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG = {
@@ -324,7 +327,9 @@ def token_class_tag_to_token_class_with_tag(token: ObjectTypes, tag: ObjectTypes
324
327
  """
325
328
  if isinstance(token, TokenClasses) and isinstance(tag, BioTag):
326
329
  return _TOKEN_AND_TAG_TO_TOKEN_CLASS_WITH_TAG[(token, tag)]
327
- raise TypeError("Token must be of type TokenClasses and tag must be of type BioTag")
330
+ raise TypeError(
331
+ f"Token must be of type TokenClasses, is of {type(token)} and tag " f"{type(tag)} must be of type BioTag"
332
+ )
328
333
 
329
334
 
330
335
  def token_class_with_tag_to_token_class_and_tag(
@@ -47,7 +47,7 @@ class BaseTransform(ABC):
47
47
  @abstractmethod
48
48
  def apply_image(self, img: ImageType) -> ImageType:
49
49
  """The transformation that should be applied to the image"""
50
- raise NotImplementedError
50
+ raise NotImplementedError()
51
51
 
52
52
 
53
53
  class ResizeTransform(BaseTransform):
@@ -144,12 +144,6 @@ def get_rng(obj: Any = None) -> np.random.RandomState:
144
144
  return np.random.RandomState(seed)
145
145
 
146
146
 
147
- class FileExtensionError(BaseException):
148
- """
149
- An exception indicating that a file does not seem to have an expected type
150
- """
151
-
152
-
153
147
  def is_file_extension(file_name: Pathlike, extension: Union[str, Sequence[str]]) -> bool:
154
148
  """
155
149
  Check if a given file name has a given extension