deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py

@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING
 
-from packaging import version
-
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.30
+__version__ = 0.32
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "SimpleTransformPipelineComponent",
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }
 
+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))
 
-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")
 
-# Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )
 
 
 # Direct imports for type-checking
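
Note: the block above replaces the old TF warning suppression with explicit backend selection. `collect_env_info()` records which frameworks are installed, and `DD_USE_TORCH` / `DD_USE_TF` pick the backend, with PyTorch winning ties. A minimal sketch of pinning the backend yourself (the variable names come straight from the diff; the import-time logic only fills them in when they are unset):

    import os

    # Pin the backend before importing deepdoctection; any non-empty string is truthy
    # for the os.environ.get checks shown above.
    os.environ["DD_USE_TORCH"] = "1"   # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # noqa: E402  import after the env var is set
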
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *

deepdoctection/analyzer/dd.py

@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
 
-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union
 
+from lazy_imports import try_import
+
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
 
-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
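Note: the `tf_available()` / `pytorch_available()` / `boto3_available()` guards give way to the `lazy_imports` package: imports inside a `with try_import() as guard:` block no longer raise at module load. A sketch of the pattern, assuming the standard lazy-imports API in which `check()` re-raises the deferred ImportError (the helper function below is hypothetical, only the `image_guard` name appears in the diff):

    from lazy_imports import try_import

    with try_import() as boto3_guard:
        from botocore.config import Config  # swallowed if botocore is not installed

    def make_textract_config():
        # hypothetical helper: fail only when the optional dependency is actually needed
        boto3_guard.check()  # re-raises the deferred ImportError, if any
        return Config(retries={"max_attempts": 10})
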
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
-        raise ValueError(
-            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True and set the other two "
-            "to False. Only one OCR system can be activated."
-        )
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )
 
 
 def build_detector(
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(table_segmentation)
 
     if cfg.USE_TABLE_REFINEMENT:
-        table_segmentation_refinement = TableSegmentationRefinementService()
+        table_segmentation_refinement = TableSegmentationRefinementService(
+            [LayoutType.table, LayoutType.table_rotated],
+            [
+                LayoutType.cell,
+                CellType.column_header,
+                CellType.projected_row_header,
+                CellType.spanning,
+                CellType.row_header,
+            ],
+        )
         pipe_component_list.append(table_segmentation_refinement)
 
     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
 
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
 
 def get_dd_analyzer(
-    reset_config_file: bool = False,
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
    path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -429,8 +431,13 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )
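
Note: with `get_device(False)` gone, the device now follows the chosen backend via `get_tf_device()` or `get_torch_device()`, and `reset_config_file` defaults to `True`, so the cached `conf_dd_one.yaml` is refreshed on every call. A usage sketch, assuming the `dotted.key=value` convention for `config_overwrite` (the document path is a placeholder):

    import deepdoctection as dd

    # reset_config_file=True (the new default) re-copies the packaged config into the cache dir
    analyzer = dd.get_dd_analyzer(
        config_overwrite=["OCR.USE_TESSERACT=False", "OCR.USE_DOCTR=True"],
    )
    df = analyzer.analyze(path="path/to/doc.pdf")  # returns a dataflow over pages
    df.reset_state()
    for page in df:
        print(page.text)
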
deepdoctection/configs/conf_dd_one.yaml

@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-  LAYOUT:
-    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-    FILTER:
-  CELL:
-    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-    FILTER:
-  ITEM:
-    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-  LAYOUT:
-    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  ITEM:
-    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  CELL:
-    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True
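
Note: the new PDF_MINER block feeds `PdfPlumberTextDetector(x_tolerance=..., y_tolerance=...)` in `build_analyzer` above. A sketch of overriding the tolerances per run instead of editing the YAML (the key names mirror the config shown here):

    import deepdoctection as dd

    # pdfplumber tolerances control how close glyphs must be to merge into one word/line
    analyzer = dd.get_dd_analyzer(
        config_overwrite=[
            "USE_PDF_MINER=True",
            "USE_OCR=False",  # USE_PDF_MINER together with DocTR OCR is rejected by config_sanity_checks
            "PDF_MINER.X_TOLERANCE=5",
            "PDF_MINER.Y_TOLERANCE=5",
        ]
    )
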
deepdoctection/dataflow/base.py

@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng
 
 
-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.
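
Note: the two exception classes are not dropped but consolidated, renamed with an `-Error` suffix, into the new `deepdoctection/utils/error.py` (+84 lines), from which the dataflow modules now import them. A plausible sketch of the relocated definitions; only the names, the reset-state message, and the old `BaseException` base are confirmed by this diff, the rest is an assumption:

    # deepdoctection/utils/error.py (sketch, not the verbatim module)

    class DataFlowTerminatedError(BaseException):
        """The DataFlow cannot produce any more data: `__iter__` no longer yields a valid iterator."""


    class DataFlowResetStateNotCalledError(BaseException):
        """`reset_state()` was not called before iteration started."""

        def __init__(self) -> None:
            super().__init__("Iterating a dataflow requires .reset_state() to be called first")


    class FileExtensionError(ValueError):  # assumption: base class is not visible in this diff
        """A file name carries an unexpected extension."""


    class DependencyError(Exception):  # assumption: base class is not visible in this diff
        """A required library or backend is not installed."""
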
deepdoctection/dataflow/custom.py

@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional
 
 import numpy as np
 
+from ..utils.error import DataFlowResetStateNotCalledError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowResetStateNotCalled, ProxyDataFlow
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList
 
 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
         with self._guard:
             if self.buffer:
@@ -139,7 +140,7 @@ class CustomDataFromList(DataFromList):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
             logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))

deepdoctection/dataflow/custom_serialize.py

@@ -27,13 +27,16 @@ from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union
 
 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored
 
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import FileExtensionError, is_file_extension
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -223,7 +226,7 @@ class SerializerFiles:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class CocoParser:
@@ -283,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
         """
+        rows = []
         for key, value in self.dataset["info"].items():
-            print(f"{key}: {value}")
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))
 
     def get_ann_ids(
         self,
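
Note: `CocoParser.info()` switches from plain `key: value` lines to a colored `fancy_grid` table. A self-contained sketch of the same rendering (the sample `info` dict is invented for illustration):

    from tabulate import tabulate
    from termcolor import colored

    info = {"description": "sample COCO file", "version": "1.0", "year": 2023}  # invented sample
    rows = [[key, value] for key, value in info.items()]
    table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
    print(colored(table, "cyan"))  # same call chain as the new info() body
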
@@ -499,7 +508,7 @@ class SerializerCoco:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class SerializerPdfDoc:
@@ -547,7 +556,7 @@ class SerializerPdfDoc:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:

deepdoctection/dataflow/parallel_map.py

@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq
 
 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
+from ..utils.error import DataFlowTerminatedError
 from ..utils.logger import LoggingRecord, logger
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowTerminated, ProxyDataFlow
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer
@@ -49,14 +50,14 @@ def _zmq_catch_error(name):
         yield
     except zmq.ContextTerminated as exc:
         logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
-        raise DataFlowTerminated() from exc
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
             logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
-            raise DataFlowTerminated() from exc
-        raise ValueError from exc
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc
 
 
 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if not buffer_size:
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     def _recv_filter_none(self):
@@ -398,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
 
         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if not num_proc:
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict

deepdoctection/dataflow/serialize.py

@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 
-from .base import DataFlow, DataFlowResetStateNotCalled, RNGDataFlow
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow
 
 
 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
 
 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):
 
     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len
 
     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.random:
            for _ in range(self._size):
                val = []
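
Note: all the `DataFlowResetStateNotCalledError` raises enforce the same contract stated in the removed docstring: call `reset_state()` before iterating. A minimal sketch (assuming `DataFromList` is re-exported from `deepdoctection.dataflow`):

    from deepdoctection.dataflow import DataFromList  # assumption: re-exported at this path

    df = DataFromList([{"id": 1}, {"id": 2}])
    df.reset_state()  # seeds self.rng; skipping this raises DataFlowResetStateNotCalledError
    for dp in df:
        print(dp)
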