deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic; review the file-by-file changes listed below for details.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
@@ -19,15 +19,13 @@ import os
19
19
  import sys
20
20
  from typing import TYPE_CHECKING
21
21
 
22
- from packaging import version
23
-
24
- from .utils.env_info import auto_select_lib_and_device
22
+ from .utils.env_info import collect_env_info
25
23
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
26
- from .utils.logger import logger
24
+ from .utils.logger import LoggingRecord, logger
27
25
 
28
26
  # pylint: enable=wrong-import-position
29
27
 
30
- __version__ = 0.31
28
+ __version__ = 0.33
31
29
 
32
30
  _IMPORT_STRUCTURE = {
33
31
  "analyzer": [
@@ -162,6 +160,8 @@ _IMPORT_STRUCTURE = {
162
160
  "EvalCallback",
163
161
  ],
164
162
  "extern": [
163
+ "ModelCategories",
164
+ "NerModelCategories",
165
165
  "PredictorBase",
166
166
  "DetectionResult",
167
167
  "ObjectDetector",
@@ -182,6 +182,7 @@ _IMPORT_STRUCTURE = {
182
182
  "DocTrRotationTransformer",
183
183
  "FasttextLangDetector",
184
184
  "HFDetrDerivedDetector",
185
+ "get_tokenizer_from_architecture",
185
186
  "HFLayoutLmTokenClassifierBase",
186
187
  "HFLayoutLmTokenClassifier",
187
188
  "HFLayoutLmv2TokenClassifier",
@@ -189,6 +190,9 @@ _IMPORT_STRUCTURE = {
189
190
  "HFLayoutLmSequenceClassifier",
190
191
  "HFLayoutLmv2SequenceClassifier",
191
192
  "HFLayoutLmv3SequenceClassifier",
193
+ "HFLiltTokenClassifier",
194
+ "HFLiltSequenceClassifier",
195
+ "HFLmSequenceClassifier",
192
196
  "ModelProfile",
193
197
  "ModelCatalog",
194
198
  "print_model_infos",
@@ -268,11 +272,11 @@ _IMPORT_STRUCTURE = {
268
272
  "DoctectionPipe",
269
273
  "LanguageDetectionService",
270
274
  "ImageLayoutService",
271
- "get_tokenizer_from_architecture",
272
275
  "LMTokenClassifierService",
273
276
  "LMSequenceClassifierService",
274
277
  "OrderGenerator",
275
278
  "TextLineGenerator",
279
+ "TextLineService",
276
280
  "TextOrderService",
277
281
  "TableSegmentationRefinementService",
278
282
  "generate_html_string",
@@ -297,14 +301,13 @@ _IMPORT_STRUCTURE = {
297
301
  "save_tmp_file",
298
302
  "timed_operation",
299
303
  "collect_env_info",
300
- "get_device",
301
- "auto_select_lib_and_device",
302
304
  "auto_select_viz_library",
303
305
  "get_tensorflow_requirement",
304
306
  "tf_addons_available",
305
307
  "get_tf_addons_requirements",
306
308
  "tensorpack_available",
307
309
  "get_tensorpack_requirement",
310
+ "pytorch_available",
308
311
  "get_pytorch_requirement",
309
312
  "lxml_available",
310
313
  "get_lxml_requirement",
@@ -418,25 +421,9 @@ _IMPORT_STRUCTURE = {
418
421
  ],
419
422
  }
420
423
 
421
-
422
- # disable TF warnings for versions > 2.4.1
423
- if tf_available():
424
- if version.parse(get_tf_version()) > version.parse("2.4.1"):
425
- os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
426
- try:
427
- import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
428
-
429
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
430
- except Exception: # pylint: disable=W0703
431
- try:
432
- from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
433
-
434
- deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
435
- except Exception: # pylint: disable=W0703
436
- pass
437
-
438
424
  # Setting some environment variables so that standard functions can be invoked with available hardware
439
- auto_select_lib_and_device()
425
+ env_info = collect_env_info()
426
+ logger.debug(LoggingRecord(msg=env_info))
440
427
 
441
428
 
442
429
  # Direct imports for type-checking
@@ -444,10 +431,10 @@ if TYPE_CHECKING:
444
431
  from .analyzer import *
445
432
  from .dataflow import *
446
433
  from .datapoint import *
447
- from .datasets import *
434
+ from .datasets import * # type: ignore
448
435
  from .eval import *
449
- from .extern import *
450
- from .mapper import *
436
+ from .extern import * # type: ignore
437
+ from .mapper import * # type: ignore
451
438
  from .pipe import *
452
439
  from .train import *
453
440
  from .utils import *
@@ -23,51 +23,46 @@ Module for **deep**doctection analyzer.
23
23
  -user factory with a reduced config setting
24
24
  """
25
25
 
26
- import ast
26
+ from __future__ import annotations
27
+
27
28
  import os
28
29
  from os import environ
29
30
  from shutil import copyfile
30
- from typing import List, Optional, Union
31
+ from typing import Optional, Union
32
+
33
+ from lazy_imports import try_import
31
34
 
32
35
  from ..extern.base import ObjectDetector
36
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
33
37
  from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
38
+ from ..extern.hfdetr import HFDetrDerivedDetector
34
39
  from ..extern.model import ModelCatalog, ModelDownloadManager
35
40
  from ..extern.pdftext import PdfPlumberTextDetector
41
+ from ..extern.pt.ptutils import get_torch_device
36
42
  from ..extern.tessocr import TesseractOcrDetector
37
43
  from ..extern.texocr import TextractOcrDetector
44
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
45
+ from ..extern.tpdetect import TPFrcnnDetector
38
46
  from ..pipe.base import PipelineComponent
39
- from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
40
47
  from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
41
48
  from ..pipe.doctectionpipe import DoctectionPipe
42
49
  from ..pipe.layout import ImageLayoutService
43
50
  from ..pipe.order import TextOrderService
44
51
  from ..pipe.refine import TableSegmentationRefinementService
45
52
  from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
53
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
46
54
  from ..pipe.text import TextExtractionService
47
- from ..utils.detection_types import Pathlike
48
- from ..utils.env_info import get_device
49
- from ..utils.file_utils import (
50
- boto3_available,
51
- detectron2_available,
52
- pytorch_available,
53
- tensorpack_available,
54
- tf_available,
55
- )
55
+ from ..utils.env_info import ENV_VARS_TRUE
56
+ from ..utils.error import DependencyError
57
+ from ..utils.file_utils import detectron2_available, tensorpack_available
56
58
  from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
57
59
  from ..utils.logger import LoggingRecord, logger
58
60
  from ..utils.metacfg import AttrDict, set_config_by_yaml
59
61
  from ..utils.settings import CellType, LayoutType
60
62
  from ..utils.transform import PadTransform
63
+ from ..utils.types import PathLikeOrStr
61
64
 
62
- if tf_available() and tensorpack_available():
63
- from ..extern.tp.tfutils import disable_tp_layer_logging
64
- from ..extern.tpdetect import TPFrcnnDetector
65
-
66
- if pytorch_available():
67
- from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
68
- from ..extern.hfdetr import HFDetrDerivedDetector
69
-
70
- if boto3_available():
65
+ with try_import() as image_guard:
71
66
  from botocore.config import Config # type: ignore
72
67
 
73
68
 
@@ -89,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
89
84
 
90
85
 
91
86
  def maybe_copy_config_to_cache(
92
- package_path: Pathlike, configs_dir_path: Pathlike, file_name: str, force_copy: bool = True
87
+ package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
93
88
  ) -> str:
94
89
  """
95
90
  Initial copying of various files
@@ -123,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
123
118
 
124
119
  def build_detector(
125
120
  cfg: AttrDict, mode: str
126
- ) -> Union["D2FrcnnDetector", "TPFrcnnDetector", "HFDetrDerivedDetector", "D2FrcnnTracingDetector"]:
121
+ ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
127
122
  """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
128
123
  the config
129
124
 
@@ -141,8 +136,8 @@ def build_detector(
141
136
  config_path = ModelCatalog.get_full_path_configs(weights)
142
137
  weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
143
138
  profile = ModelCatalog.get_profile(weights)
144
- categories = profile.categories
145
- assert categories is not None
139
+ categories = profile.categories if profile.categories is not None else {}
140
+
146
141
  if profile.model_wrapper in ("TPFrcnnDetector",):
147
142
  return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
148
143
  if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -210,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
210
205
  padder = None
211
206
  if mode == "ITEM":
212
207
  if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
213
- exclude_category_ids.extend(["1", "3", "4", "5", "6"])
208
+ exclude_category_ids.extend([1, 3, 4, 5, 6])
214
209
  padder = build_padder(cfg, mode)
215
- detect_result_generator = DetectResultGenerator(detector.categories, exclude_category_ids=exclude_category_ids)
210
+ detect_result_generator = DetectResultGenerator(
211
+ categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
212
+ )
216
213
  return SubImageLayoutService(
217
- detector, [LayoutType.table, LayoutType.table_rotated], None, detect_result_generator, padder
214
+ detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
218
215
  )
219
216
 
220
217
 
@@ -241,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
241
238
  )
242
239
  if cfg.OCR.USE_TEXTRACT:
243
240
  credentials_kwargs = {
244
- "aws_access_key_id": environ.get("ACCESS_KEY"),
245
- "aws_secret_access_key": environ.get("SECRET_KEY"),
246
- "config": Config(region_name=environ.get("REGION")),
241
+ "aws_access_key_id": environ.get("ACCESS_KEY", None),
242
+ "aws_secret_access_key": environ.get("SECRET_KEY", None),
243
+ "config": Config(region_name=environ.get("REGION", None)),
247
244
  }
248
245
  return TextractOcrDetector(**credentials_kwargs)
249
246
  raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -268,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
268
265
  :param cfg: A configuration
269
266
  :return: Analyzer pipeline
270
267
  """
271
- pipe_component_list: List[PipelineComponent] = []
268
+ pipe_component_list: list[PipelineComponent] = []
272
269
 
273
270
  if cfg.USE_LAYOUT:
274
271
  d_layout = build_detector(cfg, "LAYOUT")
@@ -308,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
308
305
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
309
306
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
310
307
  cfg.SEGMENTATION.CELL_CATEGORY_ID,
311
- LayoutType.table,
308
+ LayoutType.TABLE,
312
309
  [
313
- CellType.spanning,
314
- CellType.row_header,
315
- CellType.column_header,
316
- CellType.projected_row_header,
317
- LayoutType.cell,
310
+ CellType.SPANNING,
311
+ CellType.ROW_HEADER,
312
+ CellType.COLUMN_HEADER,
313
+ CellType.PROJECTED_ROW_HEADER,
314
+ LayoutType.CELL,
318
315
  ],
319
316
  [
320
- CellType.spanning,
321
- CellType.row_header,
322
- CellType.column_header,
323
- CellType.projected_row_header,
317
+ CellType.SPANNING,
318
+ CellType.ROW_HEADER,
319
+ CellType.COLUMN_HEADER,
320
+ CellType.PROJECTED_ROW_HEADER,
324
321
  ],
325
- [LayoutType.row, LayoutType.column],
326
- [CellType.row_number, CellType.column_number],
322
+ [LayoutType.ROW, LayoutType.COLUMN],
323
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
327
324
  stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
328
325
  )
329
326
  pipe_component_list.append(pubtables)
@@ -335,20 +332,29 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
335
332
  cfg.SEGMENTATION.FULL_TABLE_TILING,
336
333
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
337
334
  cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
338
- LayoutType.table,
339
- [CellType.header, CellType.body, LayoutType.cell],
340
- [LayoutType.row, LayoutType.column],
341
- [CellType.row_number, CellType.column_number],
335
+ LayoutType.TABLE,
336
+ [CellType.HEADER, CellType.BODY, LayoutType.CELL],
337
+ [LayoutType.ROW, LayoutType.COLUMN],
338
+ [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
342
339
  cfg.SEGMENTATION.STRETCH_RULE,
343
340
  )
344
341
  pipe_component_list.append(table_segmentation)
345
342
 
346
343
  if cfg.USE_TABLE_REFINEMENT:
347
- table_segmentation_refinement = TableSegmentationRefinementService()
344
+ table_segmentation_refinement = TableSegmentationRefinementService(
345
+ [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
346
+ [
347
+ LayoutType.CELL,
348
+ CellType.COLUMN_HEADER,
349
+ CellType.PROJECTED_ROW_HEADER,
350
+ CellType.SPANNING,
351
+ CellType.ROW_HEADER,
352
+ ],
353
+ )
348
354
  pipe_component_list.append(table_segmentation_refinement)
349
355
 
350
356
  if cfg.USE_PDF_MINER:
351
- pdf_text = PdfPlumberTextDetector()
357
+ pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
352
358
  d_text = TextExtractionService(pdf_text)
353
359
  pipe_component_list.append(d_text)
354
360
 
@@ -362,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
362
368
 
363
369
  ocr = build_ocr(cfg)
364
370
  skip_if_text_extracted = cfg.USE_PDF_MINER
365
- extract_from_roi = LayoutType.word if cfg.OCR.USE_DOCTR else None
371
+ extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
366
372
  text = TextExtractionService(
367
373
  ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
368
374
  )
@@ -371,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
371
377
  if cfg.USE_PDF_MINER or cfg.USE_OCR:
372
378
  match = MatchingService(
373
379
  parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
374
- child_categories=LayoutType.word,
380
+ child_categories=LayoutType.WORD,
375
381
  matching_rule=cfg.WORD_MATCHING.RULE,
376
382
  threshold=cfg.WORD_MATCHING.THRESHOLD,
377
383
  max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -379,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
379
385
  pipe_component_list.append(match)
380
386
 
381
387
  order = TextOrderService(
382
- text_container=LayoutType.word,
388
+ text_container=LayoutType.WORD,
383
389
  text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
384
390
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
385
391
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -391,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
391
397
  pipe_component_list.append(order)
392
398
 
393
399
  page_parsing_service = PageParsingService(
394
- text_container=LayoutType.word,
400
+ text_container=LayoutType.WORD,
395
401
  floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
396
402
  include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
397
403
  )
@@ -401,9 +407,9 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
401
407
 
402
408
 
403
409
  def get_dd_analyzer(
404
- reset_config_file: bool = False,
405
- config_overwrite: Optional[List[str]] = None,
406
- path_config_file: Optional[Pathlike] = None,
410
+ reset_config_file: bool = True,
411
+ config_overwrite: Optional[list[str]] = None,
412
+ path_config_file: Optional[PathLikeOrStr] = None,
407
413
  ) -> DoctectionPipe:
408
414
  """
409
415
  Factory function for creating the built-in **deep**doctection analyzer.
@@ -430,8 +436,13 @@ def get_dd_analyzer(
430
436
  :return: A DoctectionPipe instance with given configs
431
437
  """
432
438
  config_overwrite = [] if config_overwrite is None else config_overwrite
433
- lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
434
- device = get_device(False)
439
+ lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
440
+ if lib == "TF":
441
+ device = get_tf_device()
442
+ elif lib == "PT":
443
+ device = get_torch_device()
444
+ else:
445
+ raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
435
446
  dd_one_config_path = maybe_copy_config_to_cache(
436
447
  get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
437
448
  )
@@ -1,38 +1,38 @@
1
1
  USE_LAYOUT: True
2
2
  USE_TABLE_SEGMENTATION: True
3
3
  TF:
4
- LAYOUT:
5
- WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
- FILTER:
7
- CELL:
8
- WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
- FILTER:
10
- ITEM:
11
- WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
- FILTER:
4
+ LAYOUT:
5
+ WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
6
+ FILTER:
7
+ CELL:
8
+ WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
9
+ FILTER:
10
+ ITEM:
11
+ WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
12
+ FILTER:
13
13
  PT:
14
- LAYOUT:
15
- WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
- WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
- FILTER:
18
- PAD:
19
- TOP: 60
20
- RIGHT: 60
21
- BOTTOM: 60
22
- LEFT: 60
23
- ITEM:
24
- WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
- WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
- FILTER:
27
- PAD:
28
- TOP: 60
29
- RIGHT: 60
30
- BOTTOM: 60
31
- LEFT: 60
32
- CELL:
33
- WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
- WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
- FILTER:
14
+ LAYOUT:
15
+ WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
16
+ WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
17
+ FILTER:
18
+ PAD:
19
+ TOP: 60
20
+ RIGHT: 60
21
+ BOTTOM: 60
22
+ LEFT: 60
23
+ ITEM:
24
+ WEIGHTS: item/d2_model_1639999_item_inf_only.pt
25
+ WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
26
+ FILTER:
27
+ PAD:
28
+ TOP: 60
29
+ RIGHT: 60
30
+ BOTTOM: 60
31
+ LEFT: 60
32
+ CELL:
33
+ WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
34
+ WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
35
+ FILTER:
36
36
  LAYOUT_NMS_PAIRS:
37
37
  COMBINATIONS:
38
38
  THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
48
48
  STRETCH_RULE: equal
49
49
  USE_TABLE_REFINEMENT: True
50
50
  USE_PDF_MINER: False
51
+ PDF_MINER:
52
+ X_TOLERANCE: 3
53
+ Y_TOLERANCE: 3
51
54
  USE_OCR: True
52
55
  OCR:
53
56
  USE_TESSERACT: True
@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
12
12
  """
13
13
  import itertools
14
14
  from copy import copy
15
- from typing import Any, Callable, Iterator, List, Union
15
+ from typing import Any, Callable, Iterator, Union
16
16
 
17
17
  import tqdm
18
18
 
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
164
164
  Set to -1 to repeat ``ds`` infinite times.
165
165
  """
166
166
  self.num = num
167
+ if self.num != -1:
168
+ self.dfs = itertools.tee(df, self.num)
169
+ else:
170
+ self.dfs = ()
167
171
  super().__init__(df)
168
172
 
169
173
  def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
180
184
  while True:
181
185
  yield from self.df
182
186
  else:
183
- for _ in range(self.num):
184
- yield from self.df
187
+ for df in self.dfs:
188
+ yield from df
185
189
 
186
190
 
187
191
  class ConcatData(DataFlow):
@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
197
201
  df = ConcatData([df_1,df_2])
198
202
  """
199
203
 
200
- def __init__(self, df_lists: List[DataFlow]) -> None:
204
+ def __init__(self, df_lists: list[DataFlow]) -> None:
201
205
  """
202
206
  :param df_lists: a list of DataFlow.
203
207
  """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
233
237
  `JoinData` will stop once the first Dataflow throws a StopIteration
234
238
  """
235
239
 
236
- def __init__(self, df_lists: List[DataFlow]) -> None:
240
+ def __init__(self, df_lists: list[DataFlow]) -> None:
237
241
  """
238
242
  :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
239
243
  of them is exhausted.
@@ -21,7 +21,7 @@ from
21
21
 
22
22
  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
23
23
  """
24
- from typing import Any, Callable, Iterable, Iterator, List, Optional
24
+ from typing import Any, Callable, Iterable, Iterator, Optional
25
25
 
26
26
  import numpy as np
27
27
 
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
54
54
  :param shuffle: whether to shuffle the cache before yielding from it.
55
55
  """
56
56
  self.shuffle = shuffle
57
- self.buffer: List[Any] = []
57
+ self.buffer: list[Any] = []
58
58
  self._guard: Optional[DataFlowReentrantGuard] = None
59
59
  self.rng = get_rng(self)
60
60
  super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
78
78
  yield dp
79
79
  self.buffer.append(dp)
80
80
 
81
- def get_cache(self) -> List[Any]:
81
+ def get_cache(self) -> list[Any]:
82
82
  """
83
83
  get the cache of the whole dataflow as a list
84
84
 
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
115
115
 
116
116
  def __init__(
117
117
  self,
118
- lst: List[Any],
118
+ lst: list[Any],
119
119
  shuffle: bool = False,
120
120
  max_datapoints: Optional[int] = None,
121
- rebalance_func: Optional[Callable[[List[Any]], List[Any]]] = None,
121
+ rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
122
122
  ):
123
123
  """
124
124
  :param lst: the input list. Each element represents a datapoint.