deepdoctection 0.32-py3-none-any.whl → 0.34-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (111)
  1. deepdoctection/__init__.py +8 -25
  2. deepdoctection/analyzer/dd.py +84 -71
  3. deepdoctection/dataflow/common.py +9 -5
  4. deepdoctection/dataflow/custom.py +5 -5
  5. deepdoctection/dataflow/custom_serialize.py +75 -18
  6. deepdoctection/dataflow/parallel_map.py +3 -3
  7. deepdoctection/dataflow/serialize.py +4 -4
  8. deepdoctection/dataflow/stats.py +3 -3
  9. deepdoctection/datapoint/annotation.py +78 -56
  10. deepdoctection/datapoint/box.py +7 -7
  11. deepdoctection/datapoint/convert.py +6 -6
  12. deepdoctection/datapoint/image.py +157 -75
  13. deepdoctection/datapoint/view.py +175 -151
  14. deepdoctection/datasets/adapter.py +30 -24
  15. deepdoctection/datasets/base.py +10 -10
  16. deepdoctection/datasets/dataflow_builder.py +3 -3
  17. deepdoctection/datasets/info.py +23 -25
  18. deepdoctection/datasets/instances/doclaynet.py +48 -49
  19. deepdoctection/datasets/instances/fintabnet.py +44 -45
  20. deepdoctection/datasets/instances/funsd.py +23 -23
  21. deepdoctection/datasets/instances/iiitar13k.py +8 -8
  22. deepdoctection/datasets/instances/layouttest.py +2 -2
  23. deepdoctection/datasets/instances/publaynet.py +3 -3
  24. deepdoctection/datasets/instances/pubtables1m.py +18 -18
  25. deepdoctection/datasets/instances/pubtabnet.py +30 -29
  26. deepdoctection/datasets/instances/rvlcdip.py +28 -29
  27. deepdoctection/datasets/instances/xfund.py +51 -30
  28. deepdoctection/datasets/save.py +6 -6
  29. deepdoctection/eval/accmetric.py +32 -33
  30. deepdoctection/eval/base.py +8 -9
  31. deepdoctection/eval/cocometric.py +13 -12
  32. deepdoctection/eval/eval.py +32 -26
  33. deepdoctection/eval/tedsmetric.py +16 -12
  34. deepdoctection/eval/tp_eval_callback.py +7 -16
  35. deepdoctection/extern/base.py +339 -134
  36. deepdoctection/extern/d2detect.py +69 -89
  37. deepdoctection/extern/deskew.py +11 -10
  38. deepdoctection/extern/doctrocr.py +81 -64
  39. deepdoctection/extern/fastlang.py +23 -16
  40. deepdoctection/extern/hfdetr.py +53 -38
  41. deepdoctection/extern/hflayoutlm.py +216 -155
  42. deepdoctection/extern/hflm.py +35 -30
  43. deepdoctection/extern/model.py +433 -255
  44. deepdoctection/extern/pdftext.py +15 -15
  45. deepdoctection/extern/pt/ptutils.py +4 -2
  46. deepdoctection/extern/tessocr.py +39 -38
  47. deepdoctection/extern/texocr.py +14 -16
  48. deepdoctection/extern/tp/tfutils.py +16 -2
  49. deepdoctection/extern/tp/tpcompat.py +11 -7
  50. deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
  56. deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
  57. deepdoctection/extern/tpdetect.py +40 -45
  58. deepdoctection/mapper/cats.py +36 -40
  59. deepdoctection/mapper/cocostruct.py +16 -12
  60. deepdoctection/mapper/d2struct.py +22 -22
  61. deepdoctection/mapper/hfstruct.py +7 -7
  62. deepdoctection/mapper/laylmstruct.py +22 -24
  63. deepdoctection/mapper/maputils.py +9 -10
  64. deepdoctection/mapper/match.py +33 -2
  65. deepdoctection/mapper/misc.py +6 -7
  66. deepdoctection/mapper/pascalstruct.py +4 -4
  67. deepdoctection/mapper/prodigystruct.py +6 -6
  68. deepdoctection/mapper/pubstruct.py +84 -92
  69. deepdoctection/mapper/tpstruct.py +3 -3
  70. deepdoctection/mapper/xfundstruct.py +33 -33
  71. deepdoctection/pipe/anngen.py +39 -14
  72. deepdoctection/pipe/base.py +68 -99
  73. deepdoctection/pipe/common.py +181 -85
  74. deepdoctection/pipe/concurrency.py +14 -10
  75. deepdoctection/pipe/doctectionpipe.py +24 -21
  76. deepdoctection/pipe/language.py +20 -25
  77. deepdoctection/pipe/layout.py +18 -16
  78. deepdoctection/pipe/lm.py +49 -47
  79. deepdoctection/pipe/order.py +63 -65
  80. deepdoctection/pipe/refine.py +102 -109
  81. deepdoctection/pipe/segment.py +157 -162
  82. deepdoctection/pipe/sub_layout.py +50 -40
  83. deepdoctection/pipe/text.py +37 -36
  84. deepdoctection/pipe/transform.py +19 -16
  85. deepdoctection/train/d2_frcnn_train.py +27 -25
  86. deepdoctection/train/hf_detr_train.py +22 -18
  87. deepdoctection/train/hf_layoutlm_train.py +49 -48
  88. deepdoctection/train/tp_frcnn_train.py +10 -11
  89. deepdoctection/utils/concurrency.py +1 -1
  90. deepdoctection/utils/context.py +13 -6
  91. deepdoctection/utils/develop.py +4 -4
  92. deepdoctection/utils/env_info.py +52 -14
  93. deepdoctection/utils/file_utils.py +6 -11
  94. deepdoctection/utils/fs.py +41 -14
  95. deepdoctection/utils/identifier.py +2 -2
  96. deepdoctection/utils/logger.py +15 -15
  97. deepdoctection/utils/metacfg.py +7 -7
  98. deepdoctection/utils/pdf_utils.py +39 -14
  99. deepdoctection/utils/settings.py +188 -182
  100. deepdoctection/utils/tqdm.py +1 -1
  101. deepdoctection/utils/transform.py +14 -9
  102. deepdoctection/utils/types.py +104 -0
  103. deepdoctection/utils/utils.py +7 -7
  104. deepdoctection/utils/viz.py +70 -69
  105. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
  106. deepdoctection-0.34.dist-info/RECORD +146 -0
  107. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
  108. deepdoctection/utils/detection_types.py +0 -68
  109. deepdoctection-0.32.dist-info/RECORD +0 -146
  110. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
  111. {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
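
Two renames in this list ripple through most of the other files: deepdoctection/utils/detection_types.py is removed and deepdoctection/utils/types.py takes over (items 102 and 108), and maybe_copy_config_to_cache moves from the analyzer factory into deepdoctection/utils/fs.py (items 2 and 94). A hedged import-migration sketch for downstream code, inferred from the hunks shown below:

    # Import-migration sketch, inferred from the 0.34 hunks below.
    # before (0.32):
    #   from deepdoctection.utils.detection_types import Pathlike
    #   from deepdoctection.analyzer.dd import maybe_copy_config_to_cache
    # after (0.34):
    from deepdoctection.utils.types import PathLikeOrStr  # replaces Pathlike
    from deepdoctection.utils.fs import maybe_copy_config_to_cache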

deepdoctection/__init__.py

@@ -15,7 +15,6 @@ if importlib.util.find_spec("dotenv") is not None:
 
 
 # pylint: disable=wrong-import-position
-import os
 import sys
 from typing import TYPE_CHECKING
 
@@ -25,11 +24,10 @@ from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.32
+__version__ = 0.34
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
-        "maybe_copy_config_to_cache",
         "config_sanity_checks",
         "build_detector",
         "build_padder",
@@ -76,6 +74,7 @@ _IMPORT_STRUCTURE = {
     ],
     "datapoint": [
         "ann_from_dict",
+        "AnnotationMap",
         "Annotation",
         "CategoryAnnotation",
         "ImageAnnotation",
@@ -160,6 +159,8 @@ _IMPORT_STRUCTURE = {
         "EvalCallback",
     ],
     "extern": [
+        "ModelCategories",
+        "NerModelCategories",
         "PredictorBase",
         "DetectionResult",
         "ObjectDetector",
@@ -235,6 +236,7 @@ _IMPORT_STRUCTURE = {
         "LabelSummarizer",
         "curry",
         "match_anns_by_intersection",
+        "match_anns_by_distance",
         "to_image",
         "maybe_load_image",
         "maybe_remove_image",
@@ -263,6 +265,8 @@ _IMPORT_STRUCTURE = {
         "DetectResultGenerator",
         "SubImageLayoutService",
         "ImageCroppingService",
+        "IntersectionMatcher",
+        "NeighbourMatcher",
         "MatchingService",
         "PageParsingService",
         "AnnotationNmsService",
@@ -362,6 +366,7 @@ _IMPORT_STRUCTURE = {
         "get_configs_dir_path",
         "get_weights_dir_path",
         "get_dataset_dir_path",
+        "maybe_copy_config_to_cache",
         "is_uuid_like",
         "get_uuid_from_str",
         "get_uuid",
@@ -423,28 +428,6 @@ _IMPORT_STRUCTURE = {
 env_info = collect_env_info()
 logger.debug(LoggingRecord(msg=env_info))
 
-if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
-    os.environ["DD_USE_TORCH"] = "1"
-    os.environ["USE_TORCH"] = "1"
-if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
-    os.environ["DD_USE_TF"] = "1"
-    os.environ["USE_TF"] = "1"
-if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
-    logger.warning(
-        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
-        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
-    )
-    os.environ.pop("DD_USE_TF")
-    os.environ.pop("USE_TF")
-
-if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
-    logger.warning(
-        LoggingRecord(
-            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
-            "model from the library."
-        )
-    )
-
 
 # Direct imports for type-checking
 if TYPE_CHECKING:
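
The last hunk means importing deepdoctection no longer auto-populates DD_USE_TORCH/DD_USE_TF from PYTORCH_AVAILABLE/TENSORFLOW_AVAILABLE; as the analyzer diff below shows, get_dd_analyzer now compares DD_USE_TF against ENV_VARS_TRUE instead. A minimal usage sketch, assuming "1" is among the truthy values in ENV_VARS_TRUE (the set itself is not shown in this diff):

    # Sketch: pin the backend explicitly before importing deepdoctection 0.34.
    # Assumption: "1" is accepted by ENV_VARS_TRUE; the diff only shows that
    # DD_USE_TF is checked against that set in get_dd_analyzer.
    import os

    os.environ["DD_USE_TORCH"] = "1"  # or os.environ["DD_USE_TF"] = "1" for TensorFlow

    import deepdoctection as dd  # import after env setup is intentional

    analyzer = dd.get_dd_analyzer()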

deepdoctection/analyzer/dd.py

@@ -23,10 +23,11 @@ Module for **deep**doctection analyzer.
     -user factory with a reduced config setting
 """
 
+from __future__ import annotations
+
 import os
 from os import environ
-from shutil import copyfile
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 from lazy_imports import try_import
 
@@ -42,7 +43,7 @@ from ..extern.texocr import TextractOcrDetector
 from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
 from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
+from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
@@ -50,21 +51,21 @@ from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
 from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
-from ..utils.detection_types import Pathlike
+from ..utils.env_info import ENV_VARS_TRUE
 from ..utils.error import DependencyError
 from ..utils.file_utils import detectron2_available, tensorpack_available
-from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
+from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
-from ..utils.settings import CellType, LayoutType
+from ..utils.settings import CellType, LayoutType, Relationships
 from ..utils.transform import PadTransform
+from ..utils.types import PathLikeOrStr
 
 with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
 
 
 __all__ = [
-    "maybe_copy_config_to_cache",
     "config_sanity_checks",
     "build_detector",
     "build_padder",
@@ -74,31 +75,37 @@ __all__ = [
     "build_doctr_word",
     "get_dd_analyzer",
     "build_analyzer",
+    "set_config_by_yaml",
 ]
 
 _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
 _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
-
-
-def maybe_copy_config_to_cache(
-    package_path: Pathlike, configs_dir_path: Pathlike, file_name: str, force_copy: bool = True
-) -> str:
-    """
-    Initial copying of various files
-    :param package_path: base path to directory of source file `file_name`
-    :param configs_dir_path: base path to target directory
-    :param file_name: file to copy
-    :param force_copy: If file is already in target directory, will re-copy the file
-
-    :return: path to the copied file_name
-    """
-
-    absolute_path_source = os.path.join(package_path, file_name)
-    absolute_path = os.path.join(configs_dir_path, os.path.join("dd", os.path.split(file_name)[1]))
-    mkdir_p(os.path.split(absolute_path)[0])
-    if not os.path.isfile(absolute_path) or force_copy:
-        copyfile(absolute_path_source, absolute_path)
-    return absolute_path
+_MODEL_CHOICES = {
+    "layout": [
+        "layout/d2_model_0829999_layout_inf_only.pt",
+        "xrf_layout/model_final_inf_only.pt",
+        "microsoft/table-transformer-detection/pytorch_model.bin",
+    ],
+    "segmentation": [
+        "item/model-1620000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
+    ],
+    "ocr": ["Tesseract", "DocTr", "Textract"],
+    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
+    "doctr_recognition": [
+        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
+        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
+    ],
+    "llm": ["gpt-3.5-turbo", "gpt-4"],
+    "segmentation_choices": {
+        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
+    },
+}
 
 
 def config_sanity_checks(cfg: AttrDict) -> None:
@@ -115,7 +122,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
 
 def build_detector(
     cfg: AttrDict, mode: str
-) -> Union["D2FrcnnDetector", "TPFrcnnDetector", "HFDetrDerivedDetector", "D2FrcnnTracingDetector"]:
+) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
     """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
     the config
 
@@ -133,8 +140,8 @@ def build_detector(
     config_path = ModelCatalog.get_full_path_configs(weights)
     weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
     profile = ModelCatalog.get_profile(weights)
-    categories = profile.categories
-    assert categories is not None
+    categories = profile.categories if profile.categories is not None else {}
+
     if profile.model_wrapper in ("TPFrcnnDetector",):
         return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
     if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -202,11 +209,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
     padder = None
     if mode == "ITEM":
         if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-            exclude_category_ids.extend(["1", "3", "4", "5", "6"])
+            exclude_category_ids.extend([1, 3, 4, 5, 6])
            padder = build_padder(cfg, mode)
-    detect_result_generator = DetectResultGenerator(detector.categories, exclude_category_ids=exclude_category_ids)
+    detect_result_generator = DetectResultGenerator(
+        categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
+    )
     return SubImageLayoutService(
-        detector, [LayoutType.table, LayoutType.table_rotated], None, detect_result_generator, padder
+        detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
     )
 
 
@@ -233,9 +242,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
-            "aws_access_key_id": environ.get("ACCESS_KEY"),
-            "aws_secret_access_key": environ.get("SECRET_KEY"),
-            "config": Config(region_name=environ.get("REGION")),
+            "aws_access_key_id": environ.get("ACCESS_KEY", None),
+            "aws_secret_access_key": environ.get("SECRET_KEY", None),
+            "config": Config(region_name=environ.get("REGION", None)),
         }
         return TextractOcrDetector(**credentials_kwargs)
     raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -260,7 +269,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     :param cfg: A configuration
     :return: Analyzer pipeline
     """
-    pipe_component_list: List[PipelineComponent] = []
+    pipe_component_list: list[PipelineComponent] = []
 
     if cfg.USE_LAYOUT:
         d_layout = build_detector(cfg, "LAYOUT")
@@ -300,22 +309,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
             cfg.SEGMENTATION.CELL_CATEGORY_ID,
-            LayoutType.table,
+            LayoutType.TABLE,
             [
-                CellType.spanning,
-                CellType.row_header,
-                CellType.column_header,
-                CellType.projected_row_header,
-                LayoutType.cell,
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                LayoutType.CELL,
             ],
             [
-                CellType.spanning,
-                CellType.row_header,
-                CellType.column_header,
-                CellType.projected_row_header,
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
             ],
-            [LayoutType.row, LayoutType.column],
-            [CellType.row_number, CellType.column_number],
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(pubtables)
@@ -327,23 +336,23 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
             cfg.SEGMENTATION.FULL_TABLE_TILING,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
             cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-            LayoutType.table,
-            [CellType.header, CellType.body, LayoutType.cell],
-            [LayoutType.row, LayoutType.column],
-            [CellType.row_number, CellType.column_number],
+            LayoutType.TABLE,
+            [CellType.HEADER, CellType.BODY, LayoutType.CELL],
+            [LayoutType.ROW, LayoutType.COLUMN],
+            [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
             cfg.SEGMENTATION.STRETCH_RULE,
         )
         pipe_component_list.append(table_segmentation)
 
     if cfg.USE_TABLE_REFINEMENT:
         table_segmentation_refinement = TableSegmentationRefinementService(
-            [LayoutType.table, LayoutType.table_rotated],
+            [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
             [
-                LayoutType.cell,
-                CellType.column_header,
-                CellType.projected_row_header,
-                CellType.spanning,
-                CellType.row_header,
+                LayoutType.CELL,
+                CellType.COLUMN_HEADER,
+                CellType.PROJECTED_ROW_HEADER,
+                CellType.SPANNING,
+                CellType.ROW_HEADER,
             ],
         )
         pipe_component_list.append(table_segmentation_refinement)
@@ -363,24 +372,28 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
         ocr = build_ocr(cfg)
         skip_if_text_extracted = cfg.USE_PDF_MINER
-        extract_from_roi = LayoutType.word if cfg.OCR.USE_DOCTR else None
+        extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
         text = TextExtractionService(
             ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
         )
         pipe_component_list.append(text)
 
     if cfg.USE_PDF_MINER or cfg.USE_OCR:
-        match = MatchingService(
-            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.word,
+        matcher = IntersectionMatcher(
             matching_rule=cfg.WORD_MATCHING.RULE,
             threshold=cfg.WORD_MATCHING.THRESHOLD,
             max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
         )
+        match = MatchingService(
+            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
+            child_categories=LayoutType.WORD,
+            matcher=matcher,
+            relationship_key=Relationships.CHILD,
+        )
         pipe_component_list.append(match)
 
         order = TextOrderService(
-            text_container=LayoutType.word,
+            text_container=LayoutType.WORD,
             text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
             floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
             include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -392,7 +405,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(order)
 
     page_parsing_service = PageParsingService(
-        text_container=LayoutType.word,
+        text_container=LayoutType.WORD,
         floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
         include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
     )
@@ -403,8 +416,8 @@
 
 def get_dd_analyzer(
     reset_config_file: bool = True,
-    config_overwrite: Optional[List[str]] = None,
-    path_config_file: Optional[Pathlike] = None,
+    config_overwrite: Optional[list[str]] = None,
+    path_config_file: Optional[PathLikeOrStr] = None,
 ) -> DoctectionPipe:
     """
     Factory function for creating the built-in **deep**doctection analyzer.
@@ -431,7 +444,7 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
     if lib == "TF":
         device = get_tf_device()
     elif lib == "PT":
@@ -439,9 +452,9 @@
     else:
         raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
-        get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
+        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
     )
-    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
+    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
 
     # Set up of the configuration and logging
     cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
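
The word-matching hunk above is the visible API change for custom pipelines: the matching parameters move out of MatchingService into a dedicated IntersectionMatcher, and the service now takes matcher and relationship_key arguments. A hedged sketch of the new wiring outside the analyzer factory; the rule and threshold values are illustrative stand-ins for cfg.WORD_MATCHING.*, and LayoutType.TEXT/TITLE are assumed enum members not shown in this diff:

    # Sketch of the 0.34 matcher wiring, mirroring build_analyzer above.
    from deepdoctection.pipe.common import IntersectionMatcher, MatchingService
    from deepdoctection.utils.settings import LayoutType, Relationships

    matcher = IntersectionMatcher(
        matching_rule="ioa",  # illustrative; the analyzer passes cfg.WORD_MATCHING.RULE
        threshold=0.6,        # illustrative; cfg.WORD_MATCHING.THRESHOLD
        max_parent_only=False,
    )
    match = MatchingService(
        parent_categories=[LayoutType.TEXT, LayoutType.TITLE, LayoutType.TABLE],  # assumed members
        child_categories=LayoutType.WORD,
        matcher=matcher,
        relationship_key=Relationships.CHILD,
    )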

deepdoctection/dataflow/common.py

@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
 """
 import itertools
 from copy import copy
-from typing import Any, Callable, Iterator, List, Union
+from typing import Any, Callable, Iterator, Union
 
 import tqdm
 
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
                     Set to -1 to repeat ``ds`` infinite times.
         """
         self.num = num
+        if self.num != -1:
+            self.dfs = itertools.tee(df, self.num)
+        else:
+            self.dfs = ()
         super().__init__(df)
 
     def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
             while True:
                 yield from self.df
         else:
-            for _ in range(self.num):
-                yield from self.df
+            for df in self.dfs:
+                yield from df
 
 
 class ConcatData(DataFlow):
@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
         df = ConcatData([df_1,df_2])
     """
 
-    def __init__(self, df_lists: List[DataFlow]) -> None:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow.
         """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
    `JoinData` will stop once the first Dataflow throws a StopIteration
     """
 
-    def __init__(self, df_lists: List[DataFlow]) -> None:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
                          of them is exhausted.
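
The RepeatedData change is behavioural, not cosmetic: re-iterating a plain generator after it is exhausted yields nothing, so the old range()-based loop could silently produce fewer passes for one-shot streams, whereas itertools.tee buffers the stream so each repetition replays it (at the cost of caching items in memory). A standalone illustration of the difference, independent of deepdoctection:

    # Why itertools.tee matters for repeating a one-shot stream.
    import itertools

    def one_shot():
        yield from (1, 2, 3)

    gen = one_shot()
    naive = [x for _ in range(2) for x in gen]  # [1, 2, 3] - second pass sees an exhausted generator
    copies = itertools.tee(one_shot(), 2)       # two independent, buffered iterators
    teed = [x for c in copies for x in c]       # [1, 2, 3, 1, 2, 3]
    print(naive, teed)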

deepdoctection/dataflow/custom.py

@@ -21,7 +21,7 @@ from
 
 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
-from typing import Any, Callable, Iterable, Iterator, List, Optional
+from typing import Any, Callable, Iterable, Iterator, Optional
 
 import numpy as np
 
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
         :param shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
-        self.buffer: List[Any] = []
+        self.buffer: list[Any] = []
         self._guard: Optional[DataFlowReentrantGuard] = None
         self.rng = get_rng(self)
         super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
             yield dp
             self.buffer.append(dp)
 
-    def get_cache(self) -> List[Any]:
+    def get_cache(self) -> list[Any]:
         """
         get the cache of the whole dataflow as a list
 
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
 
     def __init__(
         self,
-        lst: List[Any],
+        lst: list[Any],
         shuffle: bool = False,
         max_datapoints: Optional[int] = None,
-        rebalance_func: Optional[Callable[[List[Any]], List[Any]]] = None,
+        rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
         :param lst: the input list. Each element represents a datapoint.
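
These dataflow hunks, like most files in this release, swap typing.List for the builtin generic list (PEP 585). On Python versions before 3.9 that spelling is only valid inside annotations when postponed evaluation is active, which is presumably why the analyzer module gains from __future__ import annotations in the diff above. A minimal sketch of the pattern:

    # Annotation style this release migrates to (PEP 585). With postponed
    # evaluation, list[Any] in annotations also parses on Python 3.7/3.8.
    from __future__ import annotations

    from typing import Any, Callable, Optional

    def rebalance(lst: list[Any], func: Optional[Callable[[list[Any]], list[Any]]] = None) -> list[Any]:
        # Mirrors the shape of CustomDataFromList's rebalance_func parameter.
        return func(lst) if func is not None else lst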