deepdoctection 0.32__py3-none-any.whl → 0.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +8 -25
- deepdoctection/analyzer/dd.py +84 -71
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +78 -56
- deepdoctection/datapoint/box.py +7 -7
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +157 -75
- deepdoctection/datapoint/view.py +175 -151
- deepdoctection/datasets/adapter.py +30 -24
- deepdoctection/datasets/base.py +10 -10
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +23 -25
- deepdoctection/datasets/instances/doclaynet.py +48 -49
- deepdoctection/datasets/instances/fintabnet.py +44 -45
- deepdoctection/datasets/instances/funsd.py +23 -23
- deepdoctection/datasets/instances/iiitar13k.py +8 -8
- deepdoctection/datasets/instances/layouttest.py +2 -2
- deepdoctection/datasets/instances/publaynet.py +3 -3
- deepdoctection/datasets/instances/pubtables1m.py +18 -18
- deepdoctection/datasets/instances/pubtabnet.py +30 -29
- deepdoctection/datasets/instances/rvlcdip.py +28 -29
- deepdoctection/datasets/instances/xfund.py +51 -30
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +13 -12
- deepdoctection/eval/eval.py +32 -26
- deepdoctection/eval/tedsmetric.py +16 -12
- deepdoctection/eval/tp_eval_callback.py +7 -16
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +69 -89
- deepdoctection/extern/deskew.py +11 -10
- deepdoctection/extern/doctrocr.py +81 -64
- deepdoctection/extern/fastlang.py +23 -16
- deepdoctection/extern/hfdetr.py +53 -38
- deepdoctection/extern/hflayoutlm.py +216 -155
- deepdoctection/extern/hflm.py +35 -30
- deepdoctection/extern/model.py +433 -255
- deepdoctection/extern/pdftext.py +15 -15
- deepdoctection/extern/pt/ptutils.py +4 -2
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +14 -16
- deepdoctection/extern/tp/tfutils.py +16 -2
- deepdoctection/extern/tp/tpcompat.py +11 -7
- deepdoctection/extern/tp/tpfrcnn/config/config.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +1 -1
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +5 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +6 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +4 -4
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +5 -3
- deepdoctection/extern/tp/tpfrcnn/preproc.py +5 -5
- deepdoctection/extern/tpdetect.py +40 -45
- deepdoctection/mapper/cats.py +36 -40
- deepdoctection/mapper/cocostruct.py +16 -12
- deepdoctection/mapper/d2struct.py +22 -22
- deepdoctection/mapper/hfstruct.py +7 -7
- deepdoctection/mapper/laylmstruct.py +22 -24
- deepdoctection/mapper/maputils.py +9 -10
- deepdoctection/mapper/match.py +33 -2
- deepdoctection/mapper/misc.py +6 -7
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +6 -6
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +3 -3
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/anngen.py +39 -14
- deepdoctection/pipe/base.py +68 -99
- deepdoctection/pipe/common.py +181 -85
- deepdoctection/pipe/concurrency.py +14 -10
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +18 -16
- deepdoctection/pipe/lm.py +49 -47
- deepdoctection/pipe/order.py +63 -65
- deepdoctection/pipe/refine.py +102 -109
- deepdoctection/pipe/segment.py +157 -162
- deepdoctection/pipe/sub_layout.py +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/d2_frcnn_train.py +27 -25
- deepdoctection/train/hf_detr_train.py +22 -18
- deepdoctection/train/hf_layoutlm_train.py +49 -48
- deepdoctection/train/tp_frcnn_train.py +10 -11
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +52 -14
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +41 -14
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +15 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/pdf_utils.py +39 -14
- deepdoctection/utils/settings.py +188 -182
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +70 -69
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/METADATA +7 -4
- deepdoctection-0.34.dist-info/RECORD +146 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.32.dist-info/RECORD +0 -146
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/LICENSE +0 -0
- {deepdoctection-0.32.dist-info → deepdoctection-0.34.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
|
@@ -15,7 +15,6 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
# pylint: disable=wrong-import-position
|
|
18
|
-
import os
|
|
19
18
|
import sys
|
|
20
19
|
from typing import TYPE_CHECKING
|
|
21
20
|
|
|
@@ -25,11 +24,10 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
24
|
|
|
26
25
|
# pylint: enable=wrong-import-position
|
|
27
26
|
|
|
28
|
-
__version__ = 0.
|
|
27
|
+
__version__ = 0.34
|
|
29
28
|
|
|
30
29
|
_IMPORT_STRUCTURE = {
|
|
31
30
|
"analyzer": [
|
|
32
|
-
"maybe_copy_config_to_cache",
|
|
33
31
|
"config_sanity_checks",
|
|
34
32
|
"build_detector",
|
|
35
33
|
"build_padder",
|
|
@@ -76,6 +74,7 @@ _IMPORT_STRUCTURE = {
|
|
|
76
74
|
],
|
|
77
75
|
"datapoint": [
|
|
78
76
|
"ann_from_dict",
|
|
77
|
+
"AnnotationMap",
|
|
79
78
|
"Annotation",
|
|
80
79
|
"CategoryAnnotation",
|
|
81
80
|
"ImageAnnotation",
|
|
@@ -160,6 +159,8 @@ _IMPORT_STRUCTURE = {
|
|
|
160
159
|
"EvalCallback",
|
|
161
160
|
],
|
|
162
161
|
"extern": [
|
|
162
|
+
"ModelCategories",
|
|
163
|
+
"NerModelCategories",
|
|
163
164
|
"PredictorBase",
|
|
164
165
|
"DetectionResult",
|
|
165
166
|
"ObjectDetector",
|
|
@@ -235,6 +236,7 @@ _IMPORT_STRUCTURE = {
|
|
|
235
236
|
"LabelSummarizer",
|
|
236
237
|
"curry",
|
|
237
238
|
"match_anns_by_intersection",
|
|
239
|
+
"match_anns_by_distance",
|
|
238
240
|
"to_image",
|
|
239
241
|
"maybe_load_image",
|
|
240
242
|
"maybe_remove_image",
|
|
@@ -263,6 +265,8 @@ _IMPORT_STRUCTURE = {
|
|
|
263
265
|
"DetectResultGenerator",
|
|
264
266
|
"SubImageLayoutService",
|
|
265
267
|
"ImageCroppingService",
|
|
268
|
+
"IntersectionMatcher",
|
|
269
|
+
"NeighbourMatcher",
|
|
266
270
|
"MatchingService",
|
|
267
271
|
"PageParsingService",
|
|
268
272
|
"AnnotationNmsService",
|
|
@@ -362,6 +366,7 @@ _IMPORT_STRUCTURE = {
|
|
|
362
366
|
"get_configs_dir_path",
|
|
363
367
|
"get_weights_dir_path",
|
|
364
368
|
"get_dataset_dir_path",
|
|
369
|
+
"maybe_copy_config_to_cache",
|
|
365
370
|
"is_uuid_like",
|
|
366
371
|
"get_uuid_from_str",
|
|
367
372
|
"get_uuid",
|
|
@@ -423,28 +428,6 @@ _IMPORT_STRUCTURE = {
|
|
|
423
428
|
env_info = collect_env_info()
|
|
424
429
|
logger.debug(LoggingRecord(msg=env_info))
|
|
425
430
|
|
|
426
|
-
if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
|
|
427
|
-
os.environ["DD_USE_TORCH"] = "1"
|
|
428
|
-
os.environ["USE_TORCH"] = "1"
|
|
429
|
-
if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
|
|
430
|
-
os.environ["DD_USE_TF"] = "1"
|
|
431
|
-
os.environ["USE_TF"] = "1"
|
|
432
|
-
if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
|
|
433
|
-
logger.warning(
|
|
434
|
-
"Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
|
|
435
|
-
"behaviour, set DD_USE_TORCH to None before importing deepdoctection."
|
|
436
|
-
)
|
|
437
|
-
os.environ.pop("DD_USE_TF")
|
|
438
|
-
os.environ.pop("USE_TF")
|
|
439
|
-
|
|
440
|
-
if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
|
|
441
|
-
logger.warning(
|
|
442
|
-
LoggingRecord(
|
|
443
|
-
msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
|
|
444
|
-
"model from the library."
|
|
445
|
-
)
|
|
446
|
-
)
|
|
447
|
-
|
|
448
431
|
|
|
449
432
|
# Direct imports for type-checking
|
|
450
433
|
if TYPE_CHECKING:
|
deepdoctection/analyzer/dd.py
CHANGED
|
@@ -23,10 +23,11 @@ Module for **deep**doctection analyzer.
|
|
|
23
23
|
-user factory with a reduced config setting
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
26
28
|
import os
|
|
27
29
|
from os import environ
|
|
28
|
-
from
|
|
29
|
-
from typing import List, Optional, Union
|
|
30
|
+
from typing import Optional, Union
|
|
30
31
|
|
|
31
32
|
from lazy_imports import try_import
|
|
32
33
|
|
|
@@ -42,7 +43,7 @@ from ..extern.texocr import TextractOcrDetector
|
|
|
42
43
|
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
43
44
|
from ..extern.tpdetect import TPFrcnnDetector
|
|
44
45
|
from ..pipe.base import PipelineComponent
|
|
45
|
-
from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
|
|
46
|
+
from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
|
|
46
47
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
47
48
|
from ..pipe.layout import ImageLayoutService
|
|
48
49
|
from ..pipe.order import TextOrderService
|
|
@@ -50,21 +51,21 @@ from ..pipe.refine import TableSegmentationRefinementService
|
|
|
50
51
|
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
51
52
|
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
52
53
|
from ..pipe.text import TextExtractionService
|
|
53
|
-
from ..utils.
|
|
54
|
+
from ..utils.env_info import ENV_VARS_TRUE
|
|
54
55
|
from ..utils.error import DependencyError
|
|
55
56
|
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
56
|
-
from ..utils.fs import get_configs_dir_path, get_package_path,
|
|
57
|
+
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
57
58
|
from ..utils.logger import LoggingRecord, logger
|
|
58
59
|
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
59
|
-
from ..utils.settings import CellType, LayoutType
|
|
60
|
+
from ..utils.settings import CellType, LayoutType, Relationships
|
|
60
61
|
from ..utils.transform import PadTransform
|
|
62
|
+
from ..utils.types import PathLikeOrStr
|
|
61
63
|
|
|
62
64
|
with try_import() as image_guard:
|
|
63
65
|
from botocore.config import Config # type: ignore
|
|
64
66
|
|
|
65
67
|
|
|
66
68
|
__all__ = [
|
|
67
|
-
"maybe_copy_config_to_cache",
|
|
68
69
|
"config_sanity_checks",
|
|
69
70
|
"build_detector",
|
|
70
71
|
"build_padder",
|
|
@@ -74,31 +75,37 @@ __all__ = [
|
|
|
74
75
|
"build_doctr_word",
|
|
75
76
|
"get_dd_analyzer",
|
|
76
77
|
"build_analyzer",
|
|
78
|
+
"set_config_by_yaml",
|
|
77
79
|
]
|
|
78
80
|
|
|
79
81
|
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
|
|
80
82
|
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
:
|
|
94
|
-
"""
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
83
|
+
_MODEL_CHOICES = {
|
|
84
|
+
"layout": [
|
|
85
|
+
"layout/d2_model_0829999_layout_inf_only.pt",
|
|
86
|
+
"xrf_layout/model_final_inf_only.pt",
|
|
87
|
+
"microsoft/table-transformer-detection/pytorch_model.bin",
|
|
88
|
+
],
|
|
89
|
+
"segmentation": [
|
|
90
|
+
"item/model-1620000_inf_only.data-00000-of-00001",
|
|
91
|
+
"xrf_item/model_final_inf_only.pt",
|
|
92
|
+
"microsoft/table-transformer-structure-recognition/pytorch_model.bin",
|
|
93
|
+
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
|
|
94
|
+
],
|
|
95
|
+
"ocr": ["Tesseract", "DocTr", "Textract"],
|
|
96
|
+
"doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
|
|
97
|
+
"doctr_recognition": [
|
|
98
|
+
"doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
|
|
99
|
+
"doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
|
|
100
|
+
],
|
|
101
|
+
"llm": ["gpt-3.5-turbo", "gpt-4"],
|
|
102
|
+
"segmentation_choices": {
|
|
103
|
+
"item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
|
|
104
|
+
"xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
|
|
105
|
+
"microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
|
|
106
|
+
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
|
|
107
|
+
},
|
|
108
|
+
}
|
|
102
109
|
|
|
103
110
|
|
|
104
111
|
def config_sanity_checks(cfg: AttrDict) -> None:
|
|
@@ -115,7 +122,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
|
|
|
115
122
|
|
|
116
123
|
def build_detector(
|
|
117
124
|
cfg: AttrDict, mode: str
|
|
118
|
-
) -> Union[
|
|
125
|
+
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
119
126
|
"""Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
|
|
120
127
|
the config
|
|
121
128
|
|
|
@@ -133,8 +140,8 @@ def build_detector(
|
|
|
133
140
|
config_path = ModelCatalog.get_full_path_configs(weights)
|
|
134
141
|
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
135
142
|
profile = ModelCatalog.get_profile(weights)
|
|
136
|
-
categories = profile.categories
|
|
137
|
-
|
|
143
|
+
categories = profile.categories if profile.categories is not None else {}
|
|
144
|
+
|
|
138
145
|
if profile.model_wrapper in ("TPFrcnnDetector",):
|
|
139
146
|
return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
|
|
140
147
|
if profile.model_wrapper in ("D2FrcnnDetector",):
|
|
@@ -202,11 +209,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
|
|
|
202
209
|
padder = None
|
|
203
210
|
if mode == "ITEM":
|
|
204
211
|
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
205
|
-
exclude_category_ids.extend([
|
|
212
|
+
exclude_category_ids.extend([1, 3, 4, 5, 6])
|
|
206
213
|
padder = build_padder(cfg, mode)
|
|
207
|
-
detect_result_generator = DetectResultGenerator(
|
|
214
|
+
detect_result_generator = DetectResultGenerator(
|
|
215
|
+
categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
|
|
216
|
+
)
|
|
208
217
|
return SubImageLayoutService(
|
|
209
|
-
detector, [LayoutType.
|
|
218
|
+
detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
|
|
210
219
|
)
|
|
211
220
|
|
|
212
221
|
|
|
@@ -233,9 +242,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
|
|
|
233
242
|
)
|
|
234
243
|
if cfg.OCR.USE_TEXTRACT:
|
|
235
244
|
credentials_kwargs = {
|
|
236
|
-
"aws_access_key_id": environ.get("ACCESS_KEY"),
|
|
237
|
-
"aws_secret_access_key": environ.get("SECRET_KEY"),
|
|
238
|
-
"config": Config(region_name=environ.get("REGION")),
|
|
245
|
+
"aws_access_key_id": environ.get("ACCESS_KEY", None),
|
|
246
|
+
"aws_secret_access_key": environ.get("SECRET_KEY", None),
|
|
247
|
+
"config": Config(region_name=environ.get("REGION", None)),
|
|
239
248
|
}
|
|
240
249
|
return TextractOcrDetector(**credentials_kwargs)
|
|
241
250
|
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
@@ -260,7 +269,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
260
269
|
:param cfg: A configuration
|
|
261
270
|
:return: Analyzer pipeline
|
|
262
271
|
"""
|
|
263
|
-
pipe_component_list:
|
|
272
|
+
pipe_component_list: list[PipelineComponent] = []
|
|
264
273
|
|
|
265
274
|
if cfg.USE_LAYOUT:
|
|
266
275
|
d_layout = build_detector(cfg, "LAYOUT")
|
|
@@ -300,22 +309,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
300
309
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
301
310
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
302
311
|
cfg.SEGMENTATION.CELL_CATEGORY_ID,
|
|
303
|
-
LayoutType.
|
|
312
|
+
LayoutType.TABLE,
|
|
304
313
|
[
|
|
305
|
-
CellType.
|
|
306
|
-
CellType.
|
|
307
|
-
CellType.
|
|
308
|
-
CellType.
|
|
309
|
-
LayoutType.
|
|
314
|
+
CellType.SPANNING,
|
|
315
|
+
CellType.ROW_HEADER,
|
|
316
|
+
CellType.COLUMN_HEADER,
|
|
317
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
318
|
+
LayoutType.CELL,
|
|
310
319
|
],
|
|
311
320
|
[
|
|
312
|
-
CellType.
|
|
313
|
-
CellType.
|
|
314
|
-
CellType.
|
|
315
|
-
CellType.
|
|
321
|
+
CellType.SPANNING,
|
|
322
|
+
CellType.ROW_HEADER,
|
|
323
|
+
CellType.COLUMN_HEADER,
|
|
324
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
316
325
|
],
|
|
317
|
-
[LayoutType.
|
|
318
|
-
[CellType.
|
|
326
|
+
[LayoutType.ROW, LayoutType.COLUMN],
|
|
327
|
+
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
319
328
|
stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
|
|
320
329
|
)
|
|
321
330
|
pipe_component_list.append(pubtables)
|
|
@@ -327,23 +336,23 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
327
336
|
cfg.SEGMENTATION.FULL_TABLE_TILING,
|
|
328
337
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
329
338
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
330
|
-
LayoutType.
|
|
331
|
-
[CellType.
|
|
332
|
-
[LayoutType.
|
|
333
|
-
[CellType.
|
|
339
|
+
LayoutType.TABLE,
|
|
340
|
+
[CellType.HEADER, CellType.BODY, LayoutType.CELL],
|
|
341
|
+
[LayoutType.ROW, LayoutType.COLUMN],
|
|
342
|
+
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
334
343
|
cfg.SEGMENTATION.STRETCH_RULE,
|
|
335
344
|
)
|
|
336
345
|
pipe_component_list.append(table_segmentation)
|
|
337
346
|
|
|
338
347
|
if cfg.USE_TABLE_REFINEMENT:
|
|
339
348
|
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
340
|
-
[LayoutType.
|
|
349
|
+
[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
|
|
341
350
|
[
|
|
342
|
-
LayoutType.
|
|
343
|
-
CellType.
|
|
344
|
-
CellType.
|
|
345
|
-
CellType.
|
|
346
|
-
CellType.
|
|
351
|
+
LayoutType.CELL,
|
|
352
|
+
CellType.COLUMN_HEADER,
|
|
353
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
354
|
+
CellType.SPANNING,
|
|
355
|
+
CellType.ROW_HEADER,
|
|
347
356
|
],
|
|
348
357
|
)
|
|
349
358
|
pipe_component_list.append(table_segmentation_refinement)
|
|
@@ -363,24 +372,28 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
363
372
|
|
|
364
373
|
ocr = build_ocr(cfg)
|
|
365
374
|
skip_if_text_extracted = cfg.USE_PDF_MINER
|
|
366
|
-
extract_from_roi = LayoutType.
|
|
375
|
+
extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
|
|
367
376
|
text = TextExtractionService(
|
|
368
377
|
ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
|
|
369
378
|
)
|
|
370
379
|
pipe_component_list.append(text)
|
|
371
380
|
|
|
372
381
|
if cfg.USE_PDF_MINER or cfg.USE_OCR:
|
|
373
|
-
|
|
374
|
-
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
375
|
-
child_categories=LayoutType.word,
|
|
382
|
+
matcher = IntersectionMatcher(
|
|
376
383
|
matching_rule=cfg.WORD_MATCHING.RULE,
|
|
377
384
|
threshold=cfg.WORD_MATCHING.THRESHOLD,
|
|
378
385
|
max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
|
|
379
386
|
)
|
|
387
|
+
match = MatchingService(
|
|
388
|
+
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
389
|
+
child_categories=LayoutType.WORD,
|
|
390
|
+
matcher=matcher,
|
|
391
|
+
relationship_key=Relationships.CHILD,
|
|
392
|
+
)
|
|
380
393
|
pipe_component_list.append(match)
|
|
381
394
|
|
|
382
395
|
order = TextOrderService(
|
|
383
|
-
text_container=LayoutType.
|
|
396
|
+
text_container=LayoutType.WORD,
|
|
384
397
|
text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
|
|
385
398
|
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
386
399
|
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
@@ -392,7 +405,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
392
405
|
pipe_component_list.append(order)
|
|
393
406
|
|
|
394
407
|
page_parsing_service = PageParsingService(
|
|
395
|
-
text_container=LayoutType.
|
|
408
|
+
text_container=LayoutType.WORD,
|
|
396
409
|
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
397
410
|
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
398
411
|
)
|
|
@@ -403,8 +416,8 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
403
416
|
|
|
404
417
|
def get_dd_analyzer(
|
|
405
418
|
reset_config_file: bool = True,
|
|
406
|
-
config_overwrite: Optional[
|
|
407
|
-
path_config_file: Optional[
|
|
419
|
+
config_overwrite: Optional[list[str]] = None,
|
|
420
|
+
path_config_file: Optional[PathLikeOrStr] = None,
|
|
408
421
|
) -> DoctectionPipe:
|
|
409
422
|
"""
|
|
410
423
|
Factory function for creating the built-in **deep**doctection analyzer.
|
|
@@ -431,7 +444,7 @@ def get_dd_analyzer(
|
|
|
431
444
|
:return: A DoctectionPipe instance with given configs
|
|
432
445
|
"""
|
|
433
446
|
config_overwrite = [] if config_overwrite is None else config_overwrite
|
|
434
|
-
lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
|
|
447
|
+
lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
|
|
435
448
|
if lib == "TF":
|
|
436
449
|
device = get_tf_device()
|
|
437
450
|
elif lib == "PT":
|
|
@@ -439,9 +452,9 @@ def get_dd_analyzer(
|
|
|
439
452
|
else:
|
|
440
453
|
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
441
454
|
dd_one_config_path = maybe_copy_config_to_cache(
|
|
442
|
-
get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
|
|
455
|
+
get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
|
|
443
456
|
)
|
|
444
|
-
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
|
|
457
|
+
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
|
|
445
458
|
|
|
446
459
|
# Set up of the configuration and logging
|
|
447
460
|
cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
|
|
@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
|
|
|
12
12
|
"""
|
|
13
13
|
import itertools
|
|
14
14
|
from copy import copy
|
|
15
|
-
from typing import Any, Callable, Iterator,
|
|
15
|
+
from typing import Any, Callable, Iterator, Union
|
|
16
16
|
|
|
17
17
|
import tqdm
|
|
18
18
|
|
|
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
|
|
|
164
164
|
Set to -1 to repeat ``ds`` infinite times.
|
|
165
165
|
"""
|
|
166
166
|
self.num = num
|
|
167
|
+
if self.num != -1:
|
|
168
|
+
self.dfs = itertools.tee(df, self.num)
|
|
169
|
+
else:
|
|
170
|
+
self.dfs = ()
|
|
167
171
|
super().__init__(df)
|
|
168
172
|
|
|
169
173
|
def __len__(self) -> int:
|
|
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
|
|
|
180
184
|
while True:
|
|
181
185
|
yield from self.df
|
|
182
186
|
else:
|
|
183
|
-
for
|
|
184
|
-
yield from
|
|
187
|
+
for df in self.dfs:
|
|
188
|
+
yield from df
|
|
185
189
|
|
|
186
190
|
|
|
187
191
|
class ConcatData(DataFlow):
|
|
@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
|
|
|
197
201
|
df = ConcatData([df_1,df_2])
|
|
198
202
|
"""
|
|
199
203
|
|
|
200
|
-
def __init__(self, df_lists:
|
|
204
|
+
def __init__(self, df_lists: list[DataFlow]) -> None:
|
|
201
205
|
"""
|
|
202
206
|
:param df_lists: a list of DataFlow.
|
|
203
207
|
"""
|
|
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
|
|
|
233
237
|
`JoinData` will stop once the first Dataflow throws a StopIteration
|
|
234
238
|
"""
|
|
235
239
|
|
|
236
|
-
def __init__(self, df_lists:
|
|
240
|
+
def __init__(self, df_lists: list[DataFlow]) -> None:
|
|
237
241
|
"""
|
|
238
242
|
:param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
|
|
239
243
|
of them is exhausted.
|
|
@@ -21,7 +21,7 @@ from
|
|
|
21
21
|
|
|
22
22
|
<https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
|
|
23
23
|
"""
|
|
24
|
-
from typing import Any, Callable, Iterable, Iterator,
|
|
24
|
+
from typing import Any, Callable, Iterable, Iterator, Optional
|
|
25
25
|
|
|
26
26
|
import numpy as np
|
|
27
27
|
|
|
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
|
|
|
54
54
|
:param shuffle: whether to shuffle the cache before yielding from it.
|
|
55
55
|
"""
|
|
56
56
|
self.shuffle = shuffle
|
|
57
|
-
self.buffer:
|
|
57
|
+
self.buffer: list[Any] = []
|
|
58
58
|
self._guard: Optional[DataFlowReentrantGuard] = None
|
|
59
59
|
self.rng = get_rng(self)
|
|
60
60
|
super().__init__(df)
|
|
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
|
|
|
78
78
|
yield dp
|
|
79
79
|
self.buffer.append(dp)
|
|
80
80
|
|
|
81
|
-
def get_cache(self) ->
|
|
81
|
+
def get_cache(self) -> list[Any]:
|
|
82
82
|
"""
|
|
83
83
|
get the cache of the whole dataflow as a list
|
|
84
84
|
|
|
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
|
|
|
115
115
|
|
|
116
116
|
def __init__(
|
|
117
117
|
self,
|
|
118
|
-
lst:
|
|
118
|
+
lst: list[Any],
|
|
119
119
|
shuffle: bool = False,
|
|
120
120
|
max_datapoints: Optional[int] = None,
|
|
121
|
-
rebalance_func: Optional[Callable[[
|
|
121
|
+
rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
|
|
122
122
|
):
|
|
123
123
|
"""
|
|
124
124
|
:param lst: the input list. Each element represents a datapoint.
|