deepdoctection 0.34__py3-none-any.whl → 0.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +6 -10
- deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection/analyzer/_config.py +150 -0
- deepdoctection/analyzer/dd.py +11 -335
- deepdoctection/analyzer/factory.py +522 -0
- deepdoctection/configs/conf_dd_one.yaml +1 -0
- deepdoctection/datapoint/annotation.py +1 -1
- deepdoctection/datapoint/convert.py +6 -4
- deepdoctection/datapoint/image.py +16 -6
- deepdoctection/datapoint/view.py +1 -0
- deepdoctection/extern/pdftext.py +96 -5
- deepdoctection/extern/tessocr.py +1 -0
- deepdoctection/utils/env_info.py +30 -1
- deepdoctection/utils/file_utils.py +19 -0
- deepdoctection/utils/metacfg.py +12 -0
- deepdoctection/utils/pdf_utils.py +86 -3
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/METADATA +17 -11
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/RECORD +21 -19
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
- {deepdoctection-0.34.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
|
@@ -18,25 +18,19 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
18
18
|
import sys
|
|
19
19
|
from typing import TYPE_CHECKING
|
|
20
20
|
|
|
21
|
-
from .utils.env_info import collect_env_info
|
|
21
|
+
from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
|
|
22
22
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
23
23
|
from .utils.logger import LoggingRecord, logger
|
|
24
24
|
|
|
25
25
|
# pylint: enable=wrong-import-position
|
|
26
26
|
|
|
27
|
-
__version__ = 0.34
|
|
27
|
+
__version__ = 0.35
|
|
28
28
|
|
|
29
29
|
_IMPORT_STRUCTURE = {
|
|
30
30
|
"analyzer": [
|
|
31
31
|
"config_sanity_checks",
|
|
32
|
-
"build_detector",
|
|
33
|
-
"build_padder",
|
|
34
|
-
"build_service",
|
|
35
|
-
"build_sub_image_service",
|
|
36
|
-
"build_ocr",
|
|
37
|
-
"build_doctr_word",
|
|
38
32
|
"get_dd_analyzer",
|
|
39
|
-
"build_analyzer",
|
|
33
|
+
"ServiceFactory"
|
|
40
34
|
],
|
|
41
35
|
"configs": [],
|
|
42
36
|
"dataflow": [
|
|
@@ -197,6 +191,7 @@ _IMPORT_STRUCTURE = {
|
|
|
197
191
|
"print_model_infos",
|
|
198
192
|
"ModelDownloadManager",
|
|
199
193
|
"PdfPlumberTextDetector",
|
|
194
|
+
"Pdfmium2TextDetector",
|
|
200
195
|
"TesseractOcrDetector",
|
|
201
196
|
"TesseractRotationTransformer",
|
|
202
197
|
"TextractOcrDetector",
|
|
@@ -304,6 +299,7 @@ _IMPORT_STRUCTURE = {
|
|
|
304
299
|
"timed_operation",
|
|
305
300
|
"collect_env_info",
|
|
306
301
|
"auto_select_viz_library",
|
|
302
|
+
"auto_select_pdf_render_framework",
|
|
307
303
|
"get_tensorflow_requirement",
|
|
308
304
|
"tf_addons_available",
|
|
309
305
|
"get_tf_addons_requirements",
|
|
@@ -427,7 +423,7 @@ _IMPORT_STRUCTURE = {
|
|
|
427
423
|
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
428
424
|
env_info = collect_env_info()
|
|
429
425
|
logger.debug(LoggingRecord(msg=env_info))
|
|
430
|
-
|
|
426
|
+
auto_select_pdf_render_framework()
|
|
431
427
|
|
|
432
428
|
# Direct imports for type-checking
|
|
433
429
|
if TYPE_CHECKING:
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: config.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
|
|
19
|
+
|
|
20
|
+
from ..utils.metacfg import AttrDict
|
|
21
|
+
from ..utils.settings import CellType, LayoutType
|
|
22
|
+
|
|
23
|
+
cfg = AttrDict()
|
|
24
|
+
|
|
25
|
+
cfg.LANGUAGE = None
|
|
26
|
+
cfg.LIB = None
|
|
27
|
+
cfg.DEVICE = None
|
|
28
|
+
cfg.USE_ROTATOR = False
|
|
29
|
+
cfg.USE_LAYOUT = True
|
|
30
|
+
cfg.USE_TABLE_SEGMENTATION = True
|
|
31
|
+
|
|
32
|
+
cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
|
|
33
|
+
cfg.TF.LAYOUT.FILTER = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
|
|
37
|
+
cfg.TF.CELL.FILTER = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
|
|
41
|
+
cfg.TF.ITEM.FILTER = None
|
|
42
|
+
|
|
43
|
+
cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
|
|
44
|
+
cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
|
|
45
|
+
cfg.PT.LAYOUT.FILTER = None
|
|
46
|
+
cfg.PT.LAYOUT.PAD.TOP = 60
|
|
47
|
+
cfg.PT.LAYOUT.PAD.RIGHT = 60
|
|
48
|
+
cfg.PT.LAYOUT.PAD.BOTTOM = 60
|
|
49
|
+
cfg.PT.LAYOUT.PAD.LEFT = 60
|
|
50
|
+
|
|
51
|
+
cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
|
|
52
|
+
cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
|
|
53
|
+
cfg.PT.ITEM.FILTER = None
|
|
54
|
+
cfg.PT.ITEM.PAD.TOP = 60
|
|
55
|
+
cfg.PT.ITEM.PAD.RIGHT = 60
|
|
56
|
+
cfg.PT.ITEM.PAD.BOTTOM = 60
|
|
57
|
+
cfg.PT.ITEM.PAD.LEFT = 60
|
|
58
|
+
|
|
59
|
+
cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
|
|
60
|
+
cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
|
|
61
|
+
cfg.PT.CELL.FILTER = None
|
|
62
|
+
|
|
63
|
+
cfg.USE_LAYOUT_NMS = False
|
|
64
|
+
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
|
|
65
|
+
cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
|
|
66
|
+
cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
|
|
67
|
+
|
|
68
|
+
cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
|
|
69
|
+
cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
|
|
70
|
+
cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
|
|
71
|
+
cfg.SEGMENTATION.FULL_TABLE_TILING = True
|
|
72
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
|
|
73
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
|
|
74
|
+
cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
|
|
75
|
+
cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
|
|
76
|
+
cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
|
|
77
|
+
CellType.SPANNING,
|
|
78
|
+
CellType.ROW_HEADER,
|
|
79
|
+
CellType.COLUMN_HEADER,
|
|
80
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
81
|
+
LayoutType.CELL,
|
|
82
|
+
]
|
|
83
|
+
cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
|
|
84
|
+
CellType.SPANNING,
|
|
85
|
+
CellType.ROW_HEADER,
|
|
86
|
+
CellType.COLUMN_HEADER,
|
|
87
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
88
|
+
]
|
|
89
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
90
|
+
cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
91
|
+
cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
|
|
92
|
+
cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
93
|
+
cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
94
|
+
|
|
95
|
+
cfg.SEGMENTATION.STRETCH_RULE = "equal"
|
|
96
|
+
|
|
97
|
+
cfg.USE_TABLE_REFINEMENT = True
|
|
98
|
+
cfg.USE_PDF_MINER = False
|
|
99
|
+
|
|
100
|
+
cfg.PDF_MINER.X_TOLERANCE = 3
|
|
101
|
+
cfg.PDF_MINER.Y_TOLERANCE = 3
|
|
102
|
+
|
|
103
|
+
cfg.USE_OCR = True
|
|
104
|
+
|
|
105
|
+
cfg.OCR.USE_TESSERACT = True
|
|
106
|
+
cfg.OCR.USE_DOCTR = False
|
|
107
|
+
cfg.OCR.USE_TEXTRACT = False
|
|
108
|
+
cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
|
|
109
|
+
|
|
110
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
|
|
111
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
|
|
112
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
|
|
113
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
|
|
114
|
+
|
|
115
|
+
cfg.TEXT_CONTAINER = LayoutType.WORD
|
|
116
|
+
cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
|
|
117
|
+
LayoutType.TEXT,
|
|
118
|
+
LayoutType.TITLE,
|
|
119
|
+
LayoutType.LIST,
|
|
120
|
+
LayoutType.CELL,
|
|
121
|
+
CellType.COLUMN_HEADER,
|
|
122
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
123
|
+
CellType.SPANNING,
|
|
124
|
+
CellType.ROW_HEADER,
|
|
125
|
+
]
|
|
126
|
+
cfg.WORD_MATCHING.RULE = "ioa"
|
|
127
|
+
cfg.WORD_MATCHING.THRESHOLD = 0.6
|
|
128
|
+
cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
|
|
129
|
+
|
|
130
|
+
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
|
|
131
|
+
LayoutType.TEXT,
|
|
132
|
+
LayoutType.TITLE,
|
|
133
|
+
LayoutType.LIST,
|
|
134
|
+
LayoutType.CELL,
|
|
135
|
+
CellType.COLUMN_HEADER,
|
|
136
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
137
|
+
CellType.SPANNING,
|
|
138
|
+
CellType.ROW_HEADER,
|
|
139
|
+
]
|
|
140
|
+
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
|
|
141
|
+
LayoutType.TEXT,
|
|
142
|
+
LayoutType.TITLE,
|
|
143
|
+
LayoutType.LIST,
|
|
144
|
+
]
|
|
145
|
+
cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
|
|
146
|
+
cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
|
|
147
|
+
cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
|
|
148
|
+
cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
|
|
149
|
+
cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
|
|
150
|
+
cfg.freeze()
|
deepdoctection/analyzer/dd.py
CHANGED
|
@@ -26,56 +26,24 @@ Module for **deep**doctection analyzer.
|
|
|
26
26
|
from __future__ import annotations
|
|
27
27
|
|
|
28
28
|
import os
|
|
29
|
-
from os import environ
|
|
30
|
-
from typing import Optional, Union
|
|
29
|
+
from typing import Optional
|
|
31
30
|
|
|
32
|
-
from lazy_imports import try_import
|
|
33
|
-
|
|
34
|
-
from ..extern.base import ObjectDetector
|
|
35
|
-
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
36
|
-
from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
|
|
37
|
-
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
38
|
-
from ..extern.model import ModelCatalog, ModelDownloadManager
|
|
39
|
-
from ..extern.pdftext import PdfPlumberTextDetector
|
|
40
31
|
from ..extern.pt.ptutils import get_torch_device
|
|
41
|
-
from ..extern.tessocr import TesseractOcrDetector
|
|
42
|
-
from ..extern.texocr import TextractOcrDetector
|
|
43
32
|
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
44
|
-
from ..extern.tpdetect import TPFrcnnDetector
|
|
45
|
-
from ..pipe.base import PipelineComponent
|
|
46
|
-
from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
|
|
47
33
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
48
|
-
from ..pipe.layout import ImageLayoutService
|
|
49
|
-
from ..pipe.order import TextOrderService
|
|
50
|
-
from ..pipe.refine import TableSegmentationRefinementService
|
|
51
|
-
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
52
|
-
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
53
|
-
from ..pipe.text import TextExtractionService
|
|
54
34
|
from ..utils.env_info import ENV_VARS_TRUE
|
|
55
35
|
from ..utils.error import DependencyError
|
|
56
|
-
from ..utils.file_utils import
|
|
36
|
+
from ..utils.file_utils import tensorpack_available
|
|
57
37
|
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
58
38
|
from ..utils.logger import LoggingRecord, logger
|
|
59
|
-
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
60
|
-
from ..utils.settings import CellType, LayoutType, Relationships
|
|
61
|
-
from ..utils.transform import PadTransform
|
|
39
|
+
from ..utils.metacfg import set_config_by_yaml
|
|
62
40
|
from ..utils.types import PathLikeOrStr
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
from botocore.config import Config # type: ignore
|
|
66
|
-
|
|
41
|
+
from ._config import cfg
|
|
42
|
+
from .factory import ServiceFactory
|
|
67
43
|
|
|
68
44
|
__all__ = [
|
|
69
45
|
"config_sanity_checks",
|
|
70
|
-
"build_detector",
|
|
71
|
-
"build_padder",
|
|
72
|
-
"build_service",
|
|
73
|
-
"build_sub_image_service",
|
|
74
|
-
"build_ocr",
|
|
75
|
-
"build_doctr_word",
|
|
76
46
|
"get_dd_analyzer",
|
|
77
|
-
"build_analyzer",
|
|
78
|
-
"set_config_by_yaml",
|
|
79
47
|
]
|
|
80
48
|
|
|
81
49
|
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
|
|
@@ -108,7 +76,7 @@ _MODEL_CHOICES = {
|
|
|
108
76
|
}
|
|
109
77
|
|
|
110
78
|
|
|
111
|
-
def config_sanity_checks(cfg: AttrDict) -> None:
|
|
79
|
+
def config_sanity_checks() -> None:
|
|
112
80
|
"""Some config sanity checks"""
|
|
113
81
|
if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
|
|
114
82
|
raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
|
|
@@ -120,300 +88,6 @@ def config_sanity_checks(cfg: AttrDict) -> None:
|
|
|
120
88
|
)
|
|
121
89
|
|
|
122
90
|
|
|
123
|
-
def build_detector(
|
|
124
|
-
cfg: AttrDict, mode: str
|
|
125
|
-
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
126
|
-
"""Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
|
|
127
|
-
the config
|
|
128
|
-
|
|
129
|
-
:param cfg: Config
|
|
130
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
131
|
-
"""
|
|
132
|
-
weights = (
|
|
133
|
-
getattr(cfg.TF, mode).WEIGHTS
|
|
134
|
-
if cfg.LIB == "TF"
|
|
135
|
-
else (getattr(cfg.PT, mode).WEIGHTS if detectron2_available() else getattr(cfg.PT, mode).WEIGHTS_TS)
|
|
136
|
-
)
|
|
137
|
-
filter_categories = (
|
|
138
|
-
getattr(getattr(cfg.TF, mode), "FILTER") if cfg.LIB == "TF" else getattr(getattr(cfg.PT, mode), "FILTER")
|
|
139
|
-
)
|
|
140
|
-
config_path = ModelCatalog.get_full_path_configs(weights)
|
|
141
|
-
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
142
|
-
profile = ModelCatalog.get_profile(weights)
|
|
143
|
-
categories = profile.categories if profile.categories is not None else {}
|
|
144
|
-
|
|
145
|
-
if profile.model_wrapper in ("TPFrcnnDetector",):
|
|
146
|
-
return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
|
|
147
|
-
if profile.model_wrapper in ("D2FrcnnDetector",):
|
|
148
|
-
return D2FrcnnDetector(
|
|
149
|
-
config_path, weights_path, categories, device=cfg.DEVICE, filter_categories=filter_categories
|
|
150
|
-
)
|
|
151
|
-
if profile.model_wrapper in ("D2FrcnnTracingDetector",):
|
|
152
|
-
return D2FrcnnTracingDetector(config_path, weights_path, categories, filter_categories=filter_categories)
|
|
153
|
-
if profile.model_wrapper in ("HFDetrDerivedDetector",):
|
|
154
|
-
preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
|
|
155
|
-
return HFDetrDerivedDetector(
|
|
156
|
-
config_path,
|
|
157
|
-
weights_path,
|
|
158
|
-
preprocessor_config,
|
|
159
|
-
categories,
|
|
160
|
-
device=cfg.DEVICE,
|
|
161
|
-
filter_categories=filter_categories,
|
|
162
|
-
)
|
|
163
|
-
raise TypeError(
|
|
164
|
-
f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
|
|
165
|
-
f"compatability with your deep learning framework"
|
|
166
|
-
)
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
def build_padder(cfg: AttrDict, mode: str) -> PadTransform:
|
|
170
|
-
"""Building a padder according to the config
|
|
171
|
-
|
|
172
|
-
:param cfg: Config
|
|
173
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
174
|
-
:return `PadTransform` instance
|
|
175
|
-
"""
|
|
176
|
-
top, right, bottom, left = (
|
|
177
|
-
getattr(cfg.PT, mode).PAD.TOP,
|
|
178
|
-
getattr(cfg.PT, mode).PAD.RIGHT,
|
|
179
|
-
getattr(cfg.PT, mode).PAD.BOTTOM,
|
|
180
|
-
getattr(cfg.PT, mode).PAD.LEFT,
|
|
181
|
-
)
|
|
182
|
-
return PadTransform(top=top, right=right, bottom=bottom, left=left)
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def build_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> ImageLayoutService:
|
|
186
|
-
"""Building a layout service with a given detector
|
|
187
|
-
|
|
188
|
-
:param detector: will be passed to the `ImageLayoutService`
|
|
189
|
-
:param cfg: Configuration
|
|
190
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
191
|
-
:return `ImageLayoutService` instance
|
|
192
|
-
"""
|
|
193
|
-
padder = None
|
|
194
|
-
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
195
|
-
padder = build_padder(cfg, mode)
|
|
196
|
-
return ImageLayoutService(detector, to_image=True, crop_image=True, padder=padder)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> SubImageLayoutService:
|
|
200
|
-
"""
|
|
201
|
-
Building a sub image layout service with a given detector
|
|
202
|
-
|
|
203
|
-
:param detector: will be passed to the `SubImageLayoutService`
|
|
204
|
-
:param cfg: Configuration
|
|
205
|
-
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
206
|
-
:return: `SubImageLayoutService` instance
|
|
207
|
-
"""
|
|
208
|
-
exclude_category_ids = []
|
|
209
|
-
padder = None
|
|
210
|
-
if mode == "ITEM":
|
|
211
|
-
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
212
|
-
exclude_category_ids.extend([1, 3, 4, 5, 6])
|
|
213
|
-
padder = build_padder(cfg, mode)
|
|
214
|
-
detect_result_generator = DetectResultGenerator(
|
|
215
|
-
categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
|
|
216
|
-
)
|
|
217
|
-
return SubImageLayoutService(
|
|
218
|
-
detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
223
|
-
"""
|
|
224
|
-
Building OCR predictor
|
|
225
|
-
:param cfg: Config
|
|
226
|
-
"""
|
|
227
|
-
if cfg.OCR.USE_TESSERACT:
|
|
228
|
-
ocr_config_path = get_configs_dir_path() / cfg.OCR.CONFIG.TESSERACT
|
|
229
|
-
return TesseractOcrDetector(
|
|
230
|
-
ocr_config_path, config_overwrite=[f"LANGUAGES={cfg.LANGUAGE}"] if cfg.LANGUAGE is not None else None
|
|
231
|
-
)
|
|
232
|
-
if cfg.OCR.USE_DOCTR:
|
|
233
|
-
weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
|
|
234
|
-
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
235
|
-
profile = ModelCatalog.get_profile(weights)
|
|
236
|
-
# get_full_path_configs will complete the path even if the model is not registered
|
|
237
|
-
config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
|
|
238
|
-
if profile.architecture is None:
|
|
239
|
-
raise ValueError("model profile.architecture must be specified")
|
|
240
|
-
return DoctrTextRecognizer(
|
|
241
|
-
profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
|
|
242
|
-
)
|
|
243
|
-
if cfg.OCR.USE_TEXTRACT:
|
|
244
|
-
credentials_kwargs = {
|
|
245
|
-
"aws_access_key_id": environ.get("ACCESS_KEY", None),
|
|
246
|
-
"aws_secret_access_key": environ.get("SECRET_KEY", None),
|
|
247
|
-
"config": Config(region_name=environ.get("REGION", None)),
|
|
248
|
-
}
|
|
249
|
-
return TextractOcrDetector(**credentials_kwargs)
|
|
250
|
-
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
def build_doctr_word(cfg: AttrDict) -> DoctrTextlineDetector:
|
|
254
|
-
"""Building `DoctrTextlineDetector` instance"""
|
|
255
|
-
weights = cfg.OCR.WEIGHTS.DOCTR_WORD.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_WORD.PT
|
|
256
|
-
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
257
|
-
profile = ModelCatalog.get_profile(weights)
|
|
258
|
-
if profile.architecture is None:
|
|
259
|
-
raise ValueError("model profile.architecture must be specified")
|
|
260
|
-
if profile.categories is None:
|
|
261
|
-
raise ValueError("model profile.categories must be specified")
|
|
262
|
-
return DoctrTextlineDetector(profile.architecture, weights_path, profile.categories, cfg.DEVICE, lib=cfg.LIB)
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
266
|
-
"""
|
|
267
|
-
Builds the analyzer with a given config
|
|
268
|
-
|
|
269
|
-
:param cfg: A configuration
|
|
270
|
-
:return: Analyzer pipeline
|
|
271
|
-
"""
|
|
272
|
-
pipe_component_list: list[PipelineComponent] = []
|
|
273
|
-
|
|
274
|
-
if cfg.USE_LAYOUT:
|
|
275
|
-
d_layout = build_detector(cfg, "LAYOUT")
|
|
276
|
-
layout = build_service(d_layout, cfg, "LAYOUT")
|
|
277
|
-
pipe_component_list.append(layout)
|
|
278
|
-
|
|
279
|
-
# setup layout nms service
|
|
280
|
-
if cfg.LAYOUT_NMS_PAIRS.COMBINATIONS and cfg.USE_LAYOUT:
|
|
281
|
-
if not detectron2_available() and cfg.LIB == "PT":
|
|
282
|
-
raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
|
|
283
|
-
if not isinstance(cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
|
|
284
|
-
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
|
|
285
|
-
):
|
|
286
|
-
raise ValueError("LAYOUT_NMS_PAIRS mus be a list of lists")
|
|
287
|
-
layout_nms_serivce = AnnotationNmsService(
|
|
288
|
-
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, cfg.LAYOUT_NMS_PAIRS.THRESHOLDS, cfg.LAYOUT_NMS_PAIRS.PRIORITY
|
|
289
|
-
)
|
|
290
|
-
pipe_component_list.append(layout_nms_serivce)
|
|
291
|
-
|
|
292
|
-
# setup tables service
|
|
293
|
-
if cfg.USE_TABLE_SEGMENTATION:
|
|
294
|
-
d_item = build_detector(cfg, "ITEM")
|
|
295
|
-
item = build_sub_image_service(d_item, cfg, "ITEM")
|
|
296
|
-
pipe_component_list.append(item)
|
|
297
|
-
|
|
298
|
-
if d_item.__class__.__name__ not in ("HFDetrDerivedDetector",):
|
|
299
|
-
d_cell = build_detector(cfg, "CELL")
|
|
300
|
-
cell = build_sub_image_service(d_cell, cfg, "CELL")
|
|
301
|
-
pipe_component_list.append(cell)
|
|
302
|
-
|
|
303
|
-
if d_item.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
304
|
-
pubtables = PubtablesSegmentationService(
|
|
305
|
-
cfg.SEGMENTATION.ASSIGNMENT_RULE,
|
|
306
|
-
cfg.SEGMENTATION.THRESHOLD_ROWS,
|
|
307
|
-
cfg.SEGMENTATION.THRESHOLD_COLS,
|
|
308
|
-
cfg.SEGMENTATION.FULL_TABLE_TILING,
|
|
309
|
-
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
310
|
-
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
311
|
-
cfg.SEGMENTATION.CELL_CATEGORY_ID,
|
|
312
|
-
LayoutType.TABLE,
|
|
313
|
-
[
|
|
314
|
-
CellType.SPANNING,
|
|
315
|
-
CellType.ROW_HEADER,
|
|
316
|
-
CellType.COLUMN_HEADER,
|
|
317
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
318
|
-
LayoutType.CELL,
|
|
319
|
-
],
|
|
320
|
-
[
|
|
321
|
-
CellType.SPANNING,
|
|
322
|
-
CellType.ROW_HEADER,
|
|
323
|
-
CellType.COLUMN_HEADER,
|
|
324
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
325
|
-
],
|
|
326
|
-
[LayoutType.ROW, LayoutType.COLUMN],
|
|
327
|
-
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
328
|
-
stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
|
|
329
|
-
)
|
|
330
|
-
pipe_component_list.append(pubtables)
|
|
331
|
-
else:
|
|
332
|
-
table_segmentation = TableSegmentationService(
|
|
333
|
-
cfg.SEGMENTATION.ASSIGNMENT_RULE,
|
|
334
|
-
cfg.SEGMENTATION.THRESHOLD_ROWS,
|
|
335
|
-
cfg.SEGMENTATION.THRESHOLD_COLS,
|
|
336
|
-
cfg.SEGMENTATION.FULL_TABLE_TILING,
|
|
337
|
-
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
338
|
-
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
339
|
-
LayoutType.TABLE,
|
|
340
|
-
[CellType.HEADER, CellType.BODY, LayoutType.CELL],
|
|
341
|
-
[LayoutType.ROW, LayoutType.COLUMN],
|
|
342
|
-
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
343
|
-
cfg.SEGMENTATION.STRETCH_RULE,
|
|
344
|
-
)
|
|
345
|
-
pipe_component_list.append(table_segmentation)
|
|
346
|
-
|
|
347
|
-
if cfg.USE_TABLE_REFINEMENT:
|
|
348
|
-
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
349
|
-
[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
|
|
350
|
-
[
|
|
351
|
-
LayoutType.CELL,
|
|
352
|
-
CellType.COLUMN_HEADER,
|
|
353
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
354
|
-
CellType.SPANNING,
|
|
355
|
-
CellType.ROW_HEADER,
|
|
356
|
-
],
|
|
357
|
-
)
|
|
358
|
-
pipe_component_list.append(table_segmentation_refinement)
|
|
359
|
-
|
|
360
|
-
if cfg.USE_PDF_MINER:
|
|
361
|
-
pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
|
|
362
|
-
d_text = TextExtractionService(pdf_text)
|
|
363
|
-
pipe_component_list.append(d_text)
|
|
364
|
-
|
|
365
|
-
# setup ocr
|
|
366
|
-
if cfg.USE_OCR:
|
|
367
|
-
# the extra mile for DocTr
|
|
368
|
-
if cfg.OCR.USE_DOCTR:
|
|
369
|
-
d_word = build_doctr_word(cfg)
|
|
370
|
-
word = ImageLayoutService(d_word, to_image=True, crop_image=True, skip_if_layout_extracted=True)
|
|
371
|
-
pipe_component_list.append(word)
|
|
372
|
-
|
|
373
|
-
ocr = build_ocr(cfg)
|
|
374
|
-
skip_if_text_extracted = cfg.USE_PDF_MINER
|
|
375
|
-
extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
|
|
376
|
-
text = TextExtractionService(
|
|
377
|
-
ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
|
|
378
|
-
)
|
|
379
|
-
pipe_component_list.append(text)
|
|
380
|
-
|
|
381
|
-
if cfg.USE_PDF_MINER or cfg.USE_OCR:
|
|
382
|
-
matcher = IntersectionMatcher(
|
|
383
|
-
matching_rule=cfg.WORD_MATCHING.RULE,
|
|
384
|
-
threshold=cfg.WORD_MATCHING.THRESHOLD,
|
|
385
|
-
max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
|
|
386
|
-
)
|
|
387
|
-
match = MatchingService(
|
|
388
|
-
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
389
|
-
child_categories=LayoutType.WORD,
|
|
390
|
-
matcher=matcher,
|
|
391
|
-
relationship_key=Relationships.CHILD,
|
|
392
|
-
)
|
|
393
|
-
pipe_component_list.append(match)
|
|
394
|
-
|
|
395
|
-
order = TextOrderService(
|
|
396
|
-
text_container=LayoutType.WORD,
|
|
397
|
-
text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
|
|
398
|
-
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
399
|
-
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
400
|
-
starting_point_tolerance=cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE,
|
|
401
|
-
broken_line_tolerance=cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE,
|
|
402
|
-
height_tolerance=cfg.TEXT_ORDERING.HEIGHT_TOLERANCE,
|
|
403
|
-
paragraph_break=cfg.TEXT_ORDERING.PARAGRAPH_BREAK,
|
|
404
|
-
)
|
|
405
|
-
pipe_component_list.append(order)
|
|
406
|
-
|
|
407
|
-
page_parsing_service = PageParsingService(
|
|
408
|
-
text_container=LayoutType.WORD,
|
|
409
|
-
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
410
|
-
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
411
|
-
)
|
|
412
|
-
pipe = DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
|
|
413
|
-
|
|
414
|
-
return pipe
|
|
415
|
-
|
|
416
|
-
|
|
417
91
|
def get_dd_analyzer(
|
|
418
92
|
reset_config_file: bool = True,
|
|
419
93
|
config_overwrite: Optional[list[str]] = None,
|
|
@@ -457,7 +131,9 @@ def get_dd_analyzer(
|
|
|
457
131
|
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
|
|
458
132
|
|
|
459
133
|
# Set up of the configuration and logging
|
|
460
|
-
|
|
134
|
+
file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
|
|
135
|
+
cfg.freeze(freezed=False)
|
|
136
|
+
cfg.overwrite_config(file_cfg)
|
|
461
137
|
|
|
462
138
|
cfg.freeze(freezed=False)
|
|
463
139
|
cfg.LANGUAGE = None
|
|
@@ -468,11 +144,11 @@ def get_dd_analyzer(
|
|
|
468
144
|
if config_overwrite:
|
|
469
145
|
cfg.update_args(config_overwrite)
|
|
470
146
|
|
|
471
|
-
config_sanity_checks(cfg)
|
|
147
|
+
config_sanity_checks()
|
|
472
148
|
logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict())) # type: ignore
|
|
473
149
|
|
|
474
150
|
# will silent all TP logging while building the tower
|
|
475
151
|
if tensorpack_available():
|
|
476
152
|
disable_tp_layer_logging()
|
|
477
153
|
|
|
478
|
-
return build_analyzer(cfg)
|
|
154
|
+
return ServiceFactory.build_analyzer(cfg)
|