deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +7 -14
- deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection/analyzer/_config.py +142 -0
- deepdoctection/analyzer/dd.py +11 -335
- deepdoctection/analyzer/factory.py +718 -0
- deepdoctection/configs/conf_dd_one.yaml +5 -0
- deepdoctection/datapoint/annotation.py +1 -1
- deepdoctection/datapoint/convert.py +6 -4
- deepdoctection/datapoint/image.py +16 -6
- deepdoctection/datapoint/view.py +91 -15
- deepdoctection/eval/cocometric.py +59 -13
- deepdoctection/extern/pdftext.py +96 -5
- deepdoctection/extern/tessocr.py +1 -0
- deepdoctection/mapper/match.py +4 -2
- deepdoctection/utils/env_info.py +30 -1
- deepdoctection/utils/file_utils.py +19 -0
- deepdoctection/utils/metacfg.py +12 -0
- deepdoctection/utils/pdf_utils.py +86 -3
- deepdoctection/utils/utils.py +39 -0
- deepdoctection/utils/viz.py +16 -13
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/METADATA +126 -116
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/RECORD +25 -23
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/WHEEL +1 -1
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/LICENSE +0 -0
- {deepdoctection-0.34.dist-info → deepdoctection-0.36.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,718 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: factory.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""Factory for building the deepdoctection analyzer pipeline"""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
from os import environ
|
|
22
|
+
from typing import Union
|
|
23
|
+
|
|
24
|
+
from lazy_imports import try_import
|
|
25
|
+
|
|
26
|
+
from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
|
|
27
|
+
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
28
|
+
from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
|
|
29
|
+
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
30
|
+
from ..extern.model import ModelCatalog, ModelDownloadManager
|
|
31
|
+
from ..extern.pdftext import PdfPlumberTextDetector
|
|
32
|
+
from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
|
|
33
|
+
from ..extern.texocr import TextractOcrDetector
|
|
34
|
+
from ..extern.tpdetect import TPFrcnnDetector
|
|
35
|
+
from ..pipe.base import PipelineComponent
|
|
36
|
+
from ..pipe.common import (
|
|
37
|
+
AnnotationNmsService,
|
|
38
|
+
IntersectionMatcher,
|
|
39
|
+
MatchingService,
|
|
40
|
+
NeighbourMatcher,
|
|
41
|
+
PageParsingService,
|
|
42
|
+
)
|
|
43
|
+
from ..pipe.doctectionpipe import DoctectionPipe
|
|
44
|
+
from ..pipe.layout import ImageLayoutService
|
|
45
|
+
from ..pipe.order import TextOrderService
|
|
46
|
+
from ..pipe.refine import TableSegmentationRefinementService
|
|
47
|
+
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
48
|
+
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
49
|
+
from ..pipe.text import TextExtractionService
|
|
50
|
+
from ..pipe.transform import SimpleTransformService
|
|
51
|
+
from ..utils.file_utils import detectron2_available
|
|
52
|
+
from ..utils.fs import get_configs_dir_path
|
|
53
|
+
from ..utils.metacfg import AttrDict
|
|
54
|
+
from ..utils.settings import LayoutType, Relationships
|
|
55
|
+
from ..utils.transform import PadTransform
|
|
56
|
+
|
|
57
|
+
with try_import() as image_guard:
|
|
58
|
+
from botocore.config import Config # type: ignore
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
__all__ = [
|
|
62
|
+
"ServiceFactory",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
# from ._config import cfg
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ServiceFactory:
|
|
69
|
+
"""
|
|
70
|
+
Factory class for building various components of the deepdoctection analyzer pipeline.
|
|
71
|
+
|
|
72
|
+
This class uses the `cfg` configuration object from `_config.py`, which is an instance of the `AttrDict` class.
|
|
73
|
+
The configuration is not passed explicitly in an `__init__` method but is accessed directly within the methods.
|
|
74
|
+
|
|
75
|
+
The class provides static methods to build different services and detectors required for the pipeline, such as
|
|
76
|
+
layout detectors, OCR detectors, table segmentation services, and more. The methods disentangle the creation
|
|
77
|
+
of predictors (e.g., `ObjectDetector`, `TextRecognizer`) from the configuration, allowing for flexible and
|
|
78
|
+
modular construction of the pipeline components.
|
|
79
|
+
|
|
80
|
+
Extending the Class:
|
|
81
|
+
This class can be extended by using inheritance and adding new methods or overriding existing ones.
|
|
82
|
+
To extend the configuration attributes, you can modify the `cfg` object in `_config.py` to include new
|
|
83
|
+
settings or parameters required for the new methods.
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
@staticmethod
|
|
87
|
+
def _build_layout_detector(
    config: AttrDict,
    mode: str,
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
    """Create the object detector that belongs to the weights configured for `mode`.

    The weights name is read from `config.TF.<mode>.WEIGHTS` if `config.LIB == "TF"`. Otherwise it is read from
    `config.PT.<mode>.WEIGHTS` when Detectron2 is available and from `config.PT.<mode>.WEIGHTS_TS` when it is not.
    The detector class is selected by the `model_wrapper` entry of the model profile registered for these weights.

    :param config: configuration object
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return: `TPFrcnnDetector`, `D2FrcnnDetector`, `D2FrcnnTracingDetector` or `HFDetrDerivedDetector` instance
    :raises TypeError: if `profile.model_wrapper` is none of the four supported wrapper names
    """
    # Resolve the framework specific sub-config once; weights and category filter are both read from it.
    if config.LIB == "TF":
        mode_config = getattr(config.TF, mode)
        weights = mode_config.WEIGHTS
    else:
        mode_config = getattr(config.PT, mode)
        weights = mode_config.WEIGHTS if detectron2_available() else mode_config.WEIGHTS_TS
    filter_categories = mode_config.FILTER

    config_path = ModelCatalog.get_full_path_configs(weights)
    weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
    profile = ModelCatalog.get_profile(weights)
    categories = {} if profile.categories is None else profile.categories

    wrapper_name = profile.model_wrapper
    if wrapper_name == "TPFrcnnDetector":
        return TPFrcnnDetector(
            path_yaml=config_path,
            path_weights=weights_path,
            categories=categories,
            filter_categories=filter_categories,
        )
    if wrapper_name == "D2FrcnnDetector":
        return D2FrcnnDetector(
            path_yaml=config_path,
            path_weights=weights_path,
            categories=categories,
            device=config.DEVICE,
            filter_categories=filter_categories,
        )
    if wrapper_name == "D2FrcnnTracingDetector":
        return D2FrcnnTracingDetector(
            path_yaml=config_path,
            path_weights=weights_path,
            categories=categories,
            filter_categories=filter_categories,
        )
    if wrapper_name == "HFDetrDerivedDetector":
        # Only this wrapper additionally needs the pre-processor config of the model.
        preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
        return HFDetrDerivedDetector(
            path_config_json=config_path,
            path_weights=weights_path,
            path_feature_extractor_config_json=preprocessor_config,
            categories=categories,
            device=config.DEVICE,
            filter_categories=filter_categories,
        )
    raise TypeError(
        f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
        f"compatability with your deep learning framework"
    )
|
|
148
|
+
|
|
149
|
+
@staticmethod
|
|
150
|
+
def build_layout_detector(
    config: AttrDict, mode: str
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
    """Public entry point for creating a layout detector.

    Delegates to `ServiceFactory._build_layout_detector`.

    :param config: configuration object
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return: the detector created by `ServiceFactory._build_layout_detector`
    """
    layout_detector = ServiceFactory._build_layout_detector(config, mode)
    return layout_detector
|
|
159
|
+
|
|
160
|
+
@staticmethod
|
|
161
|
+
def _build_rotation_detector() -> TesseractRotationTransformer:
    """Create a `TesseractRotationTransformer` with its default arguments

    :return: `TesseractRotationTransformer` instance
    """
    rotation_transformer = TesseractRotationTransformer()
    return rotation_transformer
|
|
164
|
+
|
|
165
|
+
@staticmethod
|
|
166
|
+
def build_rotation_detector() -> TesseractRotationTransformer:
    """Public entry point for creating a rotation detector.

    Delegates to `ServiceFactory._build_rotation_detector`.

    :return: `TesseractRotationTransformer` instance
    """
    rotation_transformer = ServiceFactory._build_rotation_detector()
    return rotation_transformer
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
    """Wrap an image transformer into a `SimpleTransformService`

    :param transform_predictor: predictor that is passed to the `SimpleTransformService`
    :return: `SimpleTransformService` instance
    """
    transform_service = SimpleTransformService(transform_predictor)
    return transform_service
|
|
174
|
+
|
|
175
|
+
@staticmethod
|
|
176
|
+
def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
    """Public entry point for creating a transform service.

    Delegates to `ServiceFactory._build_transform_service`.

    :param transform_predictor: predictor that is passed to the `SimpleTransformService`
    :return: `SimpleTransformService` instance
    """
    transform_service = ServiceFactory._build_transform_service(transform_predictor)
    return transform_service
|
|
179
|
+
|
|
180
|
+
@staticmethod
|
|
181
|
+
def _build_padder(config: AttrDict, mode: str) -> PadTransform:
    """Create a `PadTransform` from the padding values stored under `config.PT.<mode>.PAD`

    Note that the values are always taken from the `PT` section of the config, independent of `config.LIB`.

    :param config: configuration object
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return `PadTransform` instance
    """
    pad_config = getattr(config.PT, mode).PAD
    return PadTransform(
        top=pad_config.TOP,
        right=pad_config.RIGHT,
        bottom=pad_config.BOTTOM,
        left=pad_config.LEFT,
    )
|
|
195
|
+
|
|
196
|
+
@staticmethod
|
|
197
|
+
def build_padder(config: AttrDict, mode: str) -> PadTransform:
    """Public entry point for creating a padder.

    Delegates to `ServiceFactory._build_padder`.

    :param config: configuration object
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return `PadTransform` instance
    """
    pad_transform = ServiceFactory._build_padder(config, mode)
    return pad_transform
|
|
205
|
+
|
|
206
|
+
@staticmethod
|
|
207
|
+
def _build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
    """Wrap a detector into an `ImageLayoutService`

    A padder is only created when the class name of the detector is `HFDetrDerivedDetector`. For all other
    detectors the service is built with `padder=None`.

    :param config: configuration object
    :param detector: will be passed to the `ImageLayoutService`
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return `ImageLayoutService` instance
    """
    # NOTE(review): the padder is built through `ServiceFactory.build_padder` explicitly, so an override of
    # `build_padder` in a subclass is not picked up here.
    is_detr_detector = detector.__class__.__name__ in ("HFDetrDerivedDetector",)
    padder = ServiceFactory.build_padder(config, mode=mode) if is_detr_detector else None
    return ImageLayoutService(
        layout_detector=detector,
        to_image=True,
        crop_image=True,
        padder=padder,
    )
|
|
219
|
+
|
|
220
|
+
@staticmethod
|
|
221
|
+
def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
    """Public entry point for creating a layout service.

    Delegates to `ServiceFactory._build_layout_service`.

    :param config: configuration object
    :param detector: will be passed to the `ImageLayoutService`
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return `ImageLayoutService` instance
    """
    layout_service = ServiceFactory._build_layout_service(config, detector, mode)
    return layout_service
|
|
230
|
+
|
|
231
|
+
@staticmethod
|
|
232
|
+
def _build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
|
|
233
|
+
"""Building a NMS service for layout annotations
|
|
234
|
+
|
|
235
|
+
:param config: configuration object
|
|
236
|
+
"""
|
|
237
|
+
if not detectron2_available() and config.LIB == "PT":
|
|
238
|
+
raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
|
|
239
|
+
if not isinstance(config.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
|
|
240
|
+
config.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
|
|
241
|
+
):
|
|
242
|
+
raise ValueError("LAYOUT_NMS_PAIRS must be a list of lists")
|
|
243
|
+
return AnnotationNmsService(
|
|
244
|
+
nms_pairs=config.LAYOUT_NMS_PAIRS.COMBINATIONS,
|
|
245
|
+
thresholds=config.LAYOUT_NMS_PAIRS.THRESHOLDS,
|
|
246
|
+
priority=config.LAYOUT_NMS_PAIRS.PRIORITY,
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
@staticmethod
|
|
250
|
+
def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
    """Public entry point for creating the NMS service for layout annotations.

    Delegates to `ServiceFactory._build_layout_nms_service`.

    :param config: configuration object
    :return: `AnnotationNmsService` instance
    """
    nms_service = ServiceFactory._build_layout_nms_service(config)
    return nms_service
|
|
256
|
+
|
|
257
|
+
@staticmethod
|
|
258
|
+
def _build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
    """
    Wrap a detector into a `SubImageLayoutService` that runs on table and rotated table regions

    If `mode` is `ITEM` and the class name of the detector is `HFDetrDerivedDetector`, the category ids
    1, 3, 4, 5 and 6 are excluded from the `DetectResultGenerator` and a padder is built for the service.
    In all other cases nothing is excluded and no padder is used.

    :param config: configuration object
    :param detector: will be passed to the `SubImageLayoutService`
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return: `SubImageLayoutService` instance
    """
    is_detr_item_detector = mode == "ITEM" and detector.__class__.__name__ in ("HFDetrDerivedDetector",)
    if is_detr_item_detector:
        # NOTE(review): presumably the ids of categories that are neither row nor column - confirm against the
        # category list of the item model.
        exclude_category_ids = [1, 3, 4, 5, 6]
        padder = ServiceFactory.build_padder(config, mode)
    else:
        exclude_category_ids = []
        padder = None

    result_generator = DetectResultGenerator(
        categories=detector.categories.categories,
        exclude_category_ids=exclude_category_ids,
    )
    return SubImageLayoutService(
        sub_image_detector=detector,
        sub_image_names=[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
        category_id_mapping=None,
        detect_result_generator=result_generator,
        padder=padder,
    )
|
|
283
|
+
|
|
284
|
+
@staticmethod
|
|
285
|
+
def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
    """
    Public entry point for creating a sub image layout service.

    Delegates to `ServiceFactory._build_sub_image_service`.

    :param config: configuration object
    :param detector: will be passed to the `SubImageLayoutService`
    :param mode: either `LAYOUT`,`CELL` or `ITEM`
    :return: `SubImageLayoutService` instance
    """
    sub_image_service = ServiceFactory._build_sub_image_service(config, detector, mode)
    return sub_image_service
|
|
295
|
+
|
|
296
|
+
@staticmethod
|
|
297
|
+
def _build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
298
|
+
"""
|
|
299
|
+
Building OCR predictor
|
|
300
|
+
|
|
301
|
+
:param config: configuration object
|
|
302
|
+
"""
|
|
303
|
+
if config.OCR.USE_TESSERACT:
|
|
304
|
+
ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
|
|
305
|
+
return TesseractOcrDetector(
|
|
306
|
+
ocr_config_path,
|
|
307
|
+
config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
|
|
308
|
+
)
|
|
309
|
+
if config.OCR.USE_DOCTR:
|
|
310
|
+
weights = (
|
|
311
|
+
config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
|
|
312
|
+
if config.LIB == "TF"
|
|
313
|
+
else (config.OCR.WEIGHTS.DOCTR_RECOGNITION.PT)
|
|
314
|
+
)
|
|
315
|
+
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
316
|
+
profile = ModelCatalog.get_profile(weights)
|
|
317
|
+
# get_full_path_configs will complete the path even if the model is not registered
|
|
318
|
+
config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
|
|
319
|
+
if profile.architecture is None:
|
|
320
|
+
raise ValueError("model profile.architecture must be specified")
|
|
321
|
+
return DoctrTextRecognizer(
|
|
322
|
+
architecture=profile.architecture,
|
|
323
|
+
path_weights=weights_path,
|
|
324
|
+
device=config.DEVICE,
|
|
325
|
+
lib=config.LIB,
|
|
326
|
+
path_config_json=config_path,
|
|
327
|
+
)
|
|
328
|
+
if config.OCR.USE_TEXTRACT:
|
|
329
|
+
credentials_kwargs = {
|
|
330
|
+
"aws_access_key_id": environ.get("ACCESS_KEY", None),
|
|
331
|
+
"aws_secret_access_key": environ.get("SECRET_KEY", None),
|
|
332
|
+
"config": Config(region_name=environ.get("REGION", None)),
|
|
333
|
+
}
|
|
334
|
+
return TextractOcrDetector(**credentials_kwargs)
|
|
335
|
+
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
336
|
+
|
|
337
|
+
@staticmethod
|
|
338
|
+
def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
    """
    Public entry point for creating the OCR predictor.

    Delegates to `ServiceFactory._build_ocr_detector`.

    :param config: configuration object
    :return: `TesseractOcrDetector`, `DoctrTextRecognizer` or `TextractOcrDetector` instance
    """
    ocr_detector = ServiceFactory._build_ocr_detector(config)
    return ocr_detector
|
|
345
|
+
|
|
346
|
+
@staticmethod
|
|
347
|
+
def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
    """Create a `DoctrTextlineDetector` from the word detection weights in the config

    The weights name is read from `config.OCR.WEIGHTS.DOCTR_WORD.TF` if `config.LIB == "TF"` and from
    `config.OCR.WEIGHTS.DOCTR_WORD.PT` otherwise.

    :param config: configuration object
    :return: DoctrTextlineDetector
    :raises ValueError: if the model profile of the weights has no architecture or no categories
    """
    word_weights = config.OCR.WEIGHTS.DOCTR_WORD
    if config.LIB == "TF":
        weights = word_weights.TF
    else:
        weights = word_weights.PT
    weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
    profile = ModelCatalog.get_profile(weights)

    architecture, categories = profile.architecture, profile.categories
    if architecture is None:
        raise ValueError("model profile.architecture must be specified")
    if categories is None:
        raise ValueError("model profile.categories must be specified")
    return DoctrTextlineDetector(architecture, weights_path, categories, config.DEVICE, lib=config.LIB)
|
|
363
|
+
|
|
364
|
+
@staticmethod
|
|
365
|
+
def _build_table_segmentation_service(
    config: AttrDict,
    detector: ObjectDetector,
) -> Union[PubtablesSegmentationService, TableSegmentationService]:
    """
    Create the table segmentation service that matches the given detector.

    The service type depends on the class name of the detector:

    - `HFDetrDerivedDetector`: a `PubtablesSegmentationService` is returned. Besides the shared settings it reads
      `CELL_CATEGORY_ID` and the `PUBTABLES_*` name lists from `config.SEGMENTATION`.
    - any other detector: a `TableSegmentationService` is returned. Besides the shared settings it reads
      `CELL_NAMES`, `ITEM_NAMES` and `SUB_ITEM_NAMES` from `config.SEGMENTATION`.

    The shared settings are the assignment rule, the row/column thresholds, the tiling flag, the row/column
    IoU removal thresholds, the table name and the stretch rule.

    :param config: configuration object
    :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
                     service to build.
    :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
             detector type.
    """
    segmentation = config.SEGMENTATION
    # Arguments that both service types receive.
    shared_kwargs = {
        "segment_rule": segmentation.ASSIGNMENT_RULE,
        "threshold_rows": segmentation.THRESHOLD_ROWS,
        "threshold_cols": segmentation.THRESHOLD_COLS,
        "tile_table_with_items": segmentation.FULL_TABLE_TILING,
        "remove_iou_threshold_rows": segmentation.REMOVE_IOU_THRESHOLD_ROWS,
        "remove_iou_threshold_cols": segmentation.REMOVE_IOU_THRESHOLD_COLS,
        "table_name": segmentation.TABLE_NAME,
        "stretch_rule": segmentation.STRETCH_RULE,
    }

    if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
        return PubtablesSegmentationService(
            cell_class_id=segmentation.CELL_CATEGORY_ID,
            cell_names=segmentation.PUBTABLES_CELL_NAMES,
            spanning_cell_names=segmentation.PUBTABLES_SPANNING_CELL_NAMES,
            item_names=segmentation.PUBTABLES_ITEM_NAMES,
            sub_item_names=segmentation.PUBTABLES_SUB_ITEM_NAMES,
            **shared_kwargs,
        )
    return TableSegmentationService(
        cell_names=segmentation.CELL_NAMES,
        item_names=segmentation.ITEM_NAMES,
        sub_item_names=segmentation.SUB_ITEM_NAMES,
        **shared_kwargs,
    )
|
|
420
|
+
|
|
421
|
+
@staticmethod
|
|
422
|
+
def build_table_segmentation_service(
    config: AttrDict,
    detector: ObjectDetector,
) -> Union[PubtablesSegmentationService, TableSegmentationService]:
    """
    Public entry point for creating a table segmentation service.

    Delegates to `ServiceFactory._build_table_segmentation_service`, which selects the service type by the
    class name of the detector.

    :param config: configuration object
    :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
                     service to build.
    :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
             detector type.
    """
    segmentation_service = ServiceFactory._build_table_segmentation_service(config, detector)
    return segmentation_service
|
|
445
|
+
|
|
446
|
+
@staticmethod
|
|
447
|
+
def _build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
    """Create a `TableSegmentationRefinementService`

    The service receives `config.SEGMENTATION.TABLE_NAME` wrapped in a list as first argument and
    `config.SEGMENTATION.PUBTABLES_CELL_NAMES` as second argument.

    :param config: configuration object
    :return: TableSegmentationRefinementService
    """
    segmentation = config.SEGMENTATION
    table_names = [segmentation.TABLE_NAME]
    return TableSegmentationRefinementService(table_names, segmentation.PUBTABLES_CELL_NAMES)
|
|
457
|
+
|
|
458
|
+
@staticmethod
|
|
459
|
+
def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
    """Public entry point for creating a table segmentation refinement service.

    Delegates to `ServiceFactory._build_table_refinement_service`.

    :param config: configuration object
    :return: TableSegmentationRefinementService
    """
    refinement_service = ServiceFactory._build_table_refinement_service(config)
    return refinement_service
|
|
466
|
+
|
|
467
|
+
@staticmethod
|
|
468
|
+
def _build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
    """Create a `PdfPlumberTextDetector` with the tolerances from `config.PDF_MINER`

    :param config: configuration object
    :return: PdfPlumberTextDetector
    """
    miner_config = config.PDF_MINER
    return PdfPlumberTextDetector(
        x_tolerance=miner_config.X_TOLERANCE,
        y_tolerance=miner_config.Y_TOLERANCE,
    )
|
|
477
|
+
|
|
478
|
+
@staticmethod
|
|
479
|
+
def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
    """Public entry point for creating a PDF text detector.

    Delegates to `ServiceFactory._build_pdf_text_detector`.

    :param config: configuration object
    :return: PdfPlumberTextDetector
    """
    pdf_text_detector = ServiceFactory._build_pdf_text_detector(config)
    return pdf_text_detector
|
|
486
|
+
|
|
487
|
+
@staticmethod
|
|
488
|
+
def _build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
    """Wrap a PDF miner into a `TextExtractionService` with default settings

    :param detector: PdfMiner
    :return: TextExtractionService
    """
    pdf_text_service = TextExtractionService(detector)
    return pdf_text_service
|
|
495
|
+
|
|
496
|
+
@staticmethod
|
|
497
|
+
def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
    """Public entry point for creating a PDF miner text extraction service.

    Delegates to `ServiceFactory._build_pdf_miner_text_service`.

    :param detector: PdfMiner
    :return: TextExtractionService
    """
    pdf_text_service = ServiceFactory._build_pdf_miner_text_service(detector)
    return pdf_text_service
|
|
504
|
+
|
|
505
|
+
@staticmethod
|
|
506
|
+
def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
    """Wrap a DocTR word detector into an `ImageLayoutService`

    The service is created with `to_image=True`, `crop_image=True` and `skip_if_layout_extracted=True`.

    :param detector: DoctrTextlineDetector
    :return: ImageLayoutService
    """
    service_options = {"to_image": True, "crop_image": True, "skip_if_layout_extracted": True}
    return ImageLayoutService(layout_detector=detector, **service_options)
|
|
515
|
+
|
|
516
|
+
@staticmethod
|
|
517
|
+
def _build_text_extraction_service(
    config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
) -> TextExtractionService:
    """Wrap an OCR detector into a `TextExtractionService`

    `skip_if_text_extracted` is set to `config.USE_PDF_MINER`. `extract_from_roi` is set to
    `config.TEXT_CONTAINER` when `config.OCR.USE_DOCTR` is set and to `None` otherwise.

    :param config: configuration object
    :param detector: OCR detector
    :return: TextExtractionService
    """
    if config.OCR.USE_DOCTR:
        extract_from_roi = config.TEXT_CONTAINER
    else:
        extract_from_roi = None
    return TextExtractionService(
        detector,
        skip_if_text_extracted=config.USE_PDF_MINER,
        extract_from_roi=extract_from_roi,
    )
|
|
531
|
+
|
|
532
|
+
@staticmethod
|
|
533
|
+
def build_text_extraction_service(
    config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
) -> TextExtractionService:
    """Public entry point for creating a text extraction service.

    Delegates to `ServiceFactory._build_text_extraction_service`.

    :param config: configuration object
    :param detector: OCR detector
    :return: TextExtractionService
    """
    text_service = ServiceFactory._build_text_extraction_service(config, detector)
    return text_service
|
|
543
|
+
|
|
544
|
+
@staticmethod
|
|
545
|
+
def _build_word_matching_service(config: AttrDict) -> MatchingService:
    """Create the `MatchingService` that relates text containers to their parent layout categories

    An `IntersectionMatcher` is configured from `config.WORD_MATCHING` (rule, threshold, max-parent-only).
    Parent categories are `config.WORD_MATCHING.PARENTAL_CATEGORIES`, child categories are
    `config.TEXT_CONTAINER` and matches are stored under `Relationships.CHILD`.

    :param config: configuration object
    :return: MatchingService
    """
    word_matching = config.WORD_MATCHING
    intersection_matcher = IntersectionMatcher(
        matching_rule=word_matching.RULE,
        threshold=word_matching.THRESHOLD,
        max_parent_only=word_matching.MAX_PARENT_ONLY,
    )
    return MatchingService(
        parent_categories=word_matching.PARENTAL_CATEGORIES,
        child_categories=config.TEXT_CONTAINER,
        matcher=intersection_matcher,
        relationship_key=Relationships.CHILD,
    )
|
|
562
|
+
|
|
563
|
+
@staticmethod
|
|
564
|
+
def build_word_matching_service(config: AttrDict) -> MatchingService:
    """Public entry point for creating a word matching service.

    Delegates to `ServiceFactory._build_word_matching_service`.

    :param config: configuration object
    :return: MatchingService
    """
    matching_service = ServiceFactory._build_word_matching_service(config)
    return matching_service
|
|
571
|
+
|
|
572
|
+
@staticmethod
|
|
573
|
+
def _build_layout_link_matching_service(config: AttrDict) -> MatchingService:
    """Create the `MatchingService` that links layout sections with each other

    A `NeighbourMatcher` with default arguments is used. Parent and child categories are read from
    `config.LAYOUT_LINK.PARENTAL_CATEGORIES` and `config.LAYOUT_LINK.CHILD_CATEGORIES` and matches are stored
    under `Relationships.LAYOUT_LINK`.

    :param config: configuration object
    :return: MatchingService
    """
    layout_link = config.LAYOUT_LINK
    return MatchingService(
        parent_categories=layout_link.PARENTAL_CATEGORIES,
        child_categories=layout_link.CHILD_CATEGORIES,
        matcher=NeighbourMatcher(),
        relationship_key=Relationships.LAYOUT_LINK,
    )
|
|
586
|
+
|
|
587
|
+
@staticmethod
|
|
588
|
+
def build_layout_link_matching_service(config: AttrDict) -> MatchingService:
    """Public entry point for creating a layout link matching service.

    Delegates to `ServiceFactory._build_layout_link_matching_service`.

    :param config: configuration object
    :return: MatchingService
    """
    matching_service = ServiceFactory._build_layout_link_matching_service(config)
    return matching_service
|
|
595
|
+
|
|
596
|
+
@staticmethod
|
|
597
|
+
def _build_text_order_service(config: AttrDict) -> TextOrderService:
    """Create a `TextOrderService` from `config.TEXT_CONTAINER` and the `config.TEXT_ORDERING` settings

    :param config: configuration object
    :return: TextOrderService instance
    """
    ordering = config.TEXT_ORDERING
    order_kwargs = {
        "text_container": config.TEXT_CONTAINER,
        "text_block_categories": ordering.TEXT_BLOCK_CATEGORIES,
        "floating_text_block_categories": ordering.FLOATING_TEXT_BLOCK_CATEGORIES,
        "include_residual_text_container": ordering.INCLUDE_RESIDUAL_TEXT_CONTAINER,
        "starting_point_tolerance": ordering.STARTING_POINT_TOLERANCE,
        "broken_line_tolerance": ordering.BROKEN_LINE_TOLERANCE,
        "height_tolerance": ordering.HEIGHT_TOLERANCE,
        "paragraph_break": ordering.PARAGRAPH_BREAK,
    }
    return TextOrderService(**order_kwargs)
|
|
613
|
+
|
|
614
|
+
@staticmethod
|
|
615
|
+
def build_text_order_service(config: AttrDict) -> TextOrderService:
    """Public entry point for creating a text order service.

    Delegates to `ServiceFactory._build_text_order_service`.

    :param config: configuration object
    :return: TextOrderService instance
    """
    order_service = ServiceFactory._build_text_order_service(config)
    return order_service
|
|
622
|
+
|
|
623
|
+
@staticmethod
|
|
624
|
+
def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
    """Create a `PageParsingService`

    The text container is read from `config.TEXT_CONTAINER`; the floating text block categories and the
    residual text container flag are read from `config.TEXT_ORDERING`.

    :param config: configuration object
    :return: PageParsingService instance
    """
    ordering = config.TEXT_ORDERING
    return PageParsingService(
        text_container=config.TEXT_CONTAINER,
        floating_text_block_categories=ordering.FLOATING_TEXT_BLOCK_CATEGORIES,
        include_residual_text_container=ordering.INCLUDE_RESIDUAL_TEXT_CONTAINER,
    )
|
|
635
|
+
|
|
636
|
+
@staticmethod
def build_page_parsing_service(config: AttrDict) -> PageParsingService:
    """Public entry point for creating the page parsing component.

    Delegates to `ServiceFactory._build_page_parsing_service` without modifying the configuration.

    :param config: configuration object
    :return: PageParsingService instance
    """
    page_parsing_service = ServiceFactory._build_page_parsing_service(config)
    return page_parsing_service
|
|
644
|
+
|
|
645
|
+
@staticmethod
def build_analyzer(config: AttrDict) -> DoctectionPipe:
    """
    Assemble the analyzer pipeline from the feature switches of the configuration.

    Each enabled `USE_*` switch adds its component(s) to the pipeline, in this order: rotation, layout,
    layout NMS, table items / cells / segmentation / refinement, PDF text mining, OCR, word matching and
    text ordering, layout linking. The page parsing service is always created and handed to the pipeline
    separately from the component list.

    :param config: configuration object
    :return: Analyzer pipeline
    """
    components: list[PipelineComponent] = []

    # setup rotation
    if config.USE_ROTATOR:
        rotator = ServiceFactory.build_rotation_detector()
        components.append(ServiceFactory.build_transform_service(transform_predictor=rotator))

    # setup layout
    if config.USE_LAYOUT:
        layout_det = ServiceFactory.build_layout_detector(config, mode="LAYOUT")
        components.append(ServiceFactory.build_layout_service(config, detector=layout_det, mode="LAYOUT"))

    # setup layout nms service
    if config.USE_LAYOUT_NMS:
        components.append(ServiceFactory.build_layout_nms_service(config))

    # setup tables service
    if config.USE_TABLE_SEGMENTATION:
        item_det = ServiceFactory.build_layout_detector(config, mode="ITEM")
        components.append(ServiceFactory.build_sub_image_service(config, detector=item_det, mode="ITEM"))

        # The separate cell detection stage is skipped when the item detector's class is named
        # HFDetrDerivedDetector. The class is compared by name, not by isinstance.
        if item_det.__class__.__name__ != "HFDetrDerivedDetector":
            cell_det = ServiceFactory.build_layout_detector(config, mode="CELL")
            components.append(ServiceFactory.build_sub_image_service(config, detector=cell_det, mode="CELL"))

        components.append(ServiceFactory.build_table_segmentation_service(config, detector=item_det))

        if config.USE_TABLE_REFINEMENT:
            components.append(ServiceFactory.build_table_refinement_service(config))

    # setup pdf text mining
    if config.USE_PDF_MINER:
        pdf_text_detector = ServiceFactory.build_pdf_text_detector(config)
        components.append(ServiceFactory.build_pdf_miner_text_service(pdf_text_detector))

    # setup ocr
    if config.USE_OCR:
        # the extra mile for DocTr
        if config.OCR.USE_DOCTR:
            doctr_word_det = ServiceFactory.build_doctr_word_detector(config)
            components.append(ServiceFactory.build_doctr_word_detector_service(doctr_word_det))

        ocr_det = ServiceFactory.build_ocr_detector(config)
        components.append(ServiceFactory.build_text_extraction_service(config, ocr_det))

    # word matching and text ordering are only added when a text source is configured
    if config.USE_PDF_MINER or config.USE_OCR:
        components.append(ServiceFactory.build_word_matching_service(config))
        components.append(ServiceFactory.build_text_order_service(config))

    # setup layout linking
    if config.USE_LAYOUT_LINK:
        components.append(ServiceFactory.build_layout_link_matching_service(config))

    page_parser = ServiceFactory.build_page_parsing_service(config)

    return DoctectionPipe(pipeline_component_list=components, page_parsing_service=page_parser)
|