deepdoctection-0.34-py3-none-any.whl → deepdoctection-0.35-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. (The original page linked to further details here; the link target was not preserved in this text extract.)

@@ -18,25 +18,19 @@ if importlib.util.find_spec("dotenv") is not None:
18
18
  import sys
19
19
  from typing import TYPE_CHECKING
20
20
 
21
- from .utils.env_info import collect_env_info
21
+ from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
22
22
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
23
23
  from .utils.logger import LoggingRecord, logger
24
24
 
25
25
  # pylint: enable=wrong-import-position
26
26
 
27
- __version__ = 0.34
27
+ __version__ = 0.35
28
28
 
29
29
  _IMPORT_STRUCTURE = {
30
30
  "analyzer": [
31
31
  "config_sanity_checks",
32
- "build_detector",
33
- "build_padder",
34
- "build_service",
35
- "build_sub_image_service",
36
- "build_ocr",
37
- "build_doctr_word",
38
32
  "get_dd_analyzer",
39
- "build_analyzer",
33
+ "ServiceFactory"
40
34
  ],
41
35
  "configs": [],
42
36
  "dataflow": [
@@ -197,6 +191,7 @@ _IMPORT_STRUCTURE = {
197
191
  "print_model_infos",
198
192
  "ModelDownloadManager",
199
193
  "PdfPlumberTextDetector",
194
+ "Pdfmium2TextDetector",
200
195
  "TesseractOcrDetector",
201
196
  "TesseractRotationTransformer",
202
197
  "TextractOcrDetector",
@@ -304,6 +299,7 @@ _IMPORT_STRUCTURE = {
304
299
  "timed_operation",
305
300
  "collect_env_info",
306
301
  "auto_select_viz_library",
302
+ "auto_select_pdf_render_framework",
307
303
  "get_tensorflow_requirement",
308
304
  "tf_addons_available",
309
305
  "get_tf_addons_requirements",
@@ -427,7 +423,7 @@ _IMPORT_STRUCTURE = {
427
423
  # Setting some environment variables so that standard functions can be invoked with available hardware
428
424
  env_info = collect_env_info()
429
425
  logger.debug(LoggingRecord(msg=env_info))
430
-
426
+ auto_select_pdf_render_framework()
431
427
 
432
428
  # Direct imports for type-checking
433
429
  if TYPE_CHECKING:
@@ -20,3 +20,4 @@ Package for pre-built pipelines
20
20
  """
21
21
 
22
22
  from .dd import *
23
+ from .factory import *
@@ -0,0 +1,150 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: config.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
19
+
20
+ from ..utils.metacfg import AttrDict
21
+ from ..utils.settings import CellType, LayoutType
22
+
23
+ cfg = AttrDict()
24
+
25
+ cfg.LANGUAGE = None
26
+ cfg.LIB = None
27
+ cfg.DEVICE = None
28
+ cfg.USE_ROTATOR = False
29
+ cfg.USE_LAYOUT = True
30
+ cfg.USE_TABLE_SEGMENTATION = True
31
+
32
+ cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
33
+ cfg.TF.LAYOUT.FILTER = None
34
+
35
+
36
+ cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
37
+ cfg.TF.CELL.FILTER = None
38
+
39
+
40
+ cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
41
+ cfg.TF.ITEM.FILTER = None
42
+
43
+ cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
44
+ cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
45
+ cfg.PT.LAYOUT.FILTER = None
46
+ cfg.PT.LAYOUT.PAD.TOP = 60
47
+ cfg.PT.LAYOUT.PAD.RIGHT = 60
48
+ cfg.PT.LAYOUT.PAD.BOTTOM = 60
49
+ cfg.PT.LAYOUT.PAD.LEFT = 60
50
+
51
+ cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
52
+ cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
53
+ cfg.PT.ITEM.FILTER = None
54
+ cfg.PT.ITEM.PAD.TOP = 60
55
+ cfg.PT.ITEM.PAD.RIGHT = 60
56
+ cfg.PT.ITEM.PAD.BOTTOM = 60
57
+ cfg.PT.ITEM.PAD.LEFT = 60
58
+
59
+ cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
60
+ cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
61
+ cfg.PT.CELL.FILTER = None
62
+
63
+ cfg.USE_LAYOUT_NMS = False
64
+ cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
65
+ cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
66
+ cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
67
+
68
+ cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
69
+ cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
70
+ cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
71
+ cfg.SEGMENTATION.FULL_TABLE_TILING = True
72
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
73
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
74
+ cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
75
+ cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
76
+ cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
77
+ CellType.SPANNING,
78
+ CellType.ROW_HEADER,
79
+ CellType.COLUMN_HEADER,
80
+ CellType.PROJECTED_ROW_HEADER,
81
+ LayoutType.CELL,
82
+ ]
83
+ cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
84
+ CellType.SPANNING,
85
+ CellType.ROW_HEADER,
86
+ CellType.COLUMN_HEADER,
87
+ CellType.PROJECTED_ROW_HEADER,
88
+ ]
89
+ cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
90
+ cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
91
+ cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
92
+ cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
93
+ cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
94
+
95
+ cfg.SEGMENTATION.STRETCH_RULE = "equal"
96
+
97
+ cfg.USE_TABLE_REFINEMENT = True
98
+ cfg.USE_PDF_MINER = False
99
+
100
+ cfg.PDF_MINER.X_TOLERANCE = 3
101
+ cfg.PDF_MINER.Y_TOLERANCE = 3
102
+
103
+ cfg.USE_OCR = True
104
+
105
+ cfg.OCR.USE_TESSERACT = True
106
+ cfg.OCR.USE_DOCTR = False
107
+ cfg.OCR.USE_TEXTRACT = False
108
+ cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
109
+
110
+ cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
111
+ cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
112
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
113
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
114
+
115
+ cfg.TEXT_CONTAINER = LayoutType.WORD
116
+ cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
117
+ LayoutType.TEXT,
118
+ LayoutType.TITLE,
119
+ LayoutType.LIST,
120
+ LayoutType.CELL,
121
+ CellType.COLUMN_HEADER,
122
+ CellType.PROJECTED_ROW_HEADER,
123
+ CellType.SPANNING,
124
+ CellType.ROW_HEADER,
125
+ ]
126
+ cfg.WORD_MATCHING.RULE = "ioa"
127
+ cfg.WORD_MATCHING.THRESHOLD = 0.6
128
+ cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
129
+
130
+ cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
131
+ LayoutType.TEXT,
132
+ LayoutType.TITLE,
133
+ LayoutType.LIST,
134
+ LayoutType.CELL,
135
+ CellType.COLUMN_HEADER,
136
+ CellType.PROJECTED_ROW_HEADER,
137
+ CellType.SPANNING,
138
+ CellType.ROW_HEADER,
139
+ ]
140
+ cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
141
+ LayoutType.TEXT,
142
+ LayoutType.TITLE,
143
+ LayoutType.LIST,
144
+ ]
145
+ cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
146
+ cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
147
+ cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
148
+ cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
149
+ cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
150
+ cfg.freeze()
@@ -26,56 +26,24 @@ Module for **deep**doctection analyzer.
26
26
  from __future__ import annotations
27
27
 
28
28
  import os
29
- from os import environ
30
- from typing import Optional, Union
29
+ from typing import Optional
31
30
 
32
- from lazy_imports import try_import
33
-
34
- from ..extern.base import ObjectDetector
35
- from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
36
- from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
37
- from ..extern.hfdetr import HFDetrDerivedDetector
38
- from ..extern.model import ModelCatalog, ModelDownloadManager
39
- from ..extern.pdftext import PdfPlumberTextDetector
40
31
  from ..extern.pt.ptutils import get_torch_device
41
- from ..extern.tessocr import TesseractOcrDetector
42
- from ..extern.texocr import TextractOcrDetector
43
32
  from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
44
- from ..extern.tpdetect import TPFrcnnDetector
45
- from ..pipe.base import PipelineComponent
46
- from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
47
33
  from ..pipe.doctectionpipe import DoctectionPipe
48
- from ..pipe.layout import ImageLayoutService
49
- from ..pipe.order import TextOrderService
50
- from ..pipe.refine import TableSegmentationRefinementService
51
- from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
52
- from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
53
- from ..pipe.text import TextExtractionService
54
34
  from ..utils.env_info import ENV_VARS_TRUE
55
35
  from ..utils.error import DependencyError
56
- from ..utils.file_utils import detectron2_available, tensorpack_available
36
+ from ..utils.file_utils import tensorpack_available
57
37
  from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
58
38
  from ..utils.logger import LoggingRecord, logger
59
- from ..utils.metacfg import AttrDict, set_config_by_yaml
60
- from ..utils.settings import CellType, LayoutType, Relationships
61
- from ..utils.transform import PadTransform
39
+ from ..utils.metacfg import set_config_by_yaml
62
40
  from ..utils.types import PathLikeOrStr
63
-
64
- with try_import() as image_guard:
65
- from botocore.config import Config # type: ignore
66
-
41
+ from ._config import cfg
42
+ from .factory import ServiceFactory
67
43
 
68
44
  __all__ = [
69
45
  "config_sanity_checks",
70
- "build_detector",
71
- "build_padder",
72
- "build_service",
73
- "build_sub_image_service",
74
- "build_ocr",
75
- "build_doctr_word",
76
46
  "get_dd_analyzer",
77
- "build_analyzer",
78
- "set_config_by_yaml",
79
47
  ]
80
48
 
81
49
  _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
@@ -108,7 +76,7 @@ _MODEL_CHOICES = {
108
76
  }
109
77
 
110
78
 
111
- def config_sanity_checks(cfg: AttrDict) -> None:
79
+ def config_sanity_checks() -> None:
112
80
  """Some config sanity checks"""
113
81
  if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
114
82
  raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
@@ -120,300 +88,6 @@ def config_sanity_checks(cfg: AttrDict) -> None:
120
88
  )
121
89
 
122
90
 
123
- def build_detector(
124
- cfg: AttrDict, mode: str
125
- ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
126
- """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
127
- the config
128
-
129
- :param cfg: Config
130
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
131
- """
132
- weights = (
133
- getattr(cfg.TF, mode).WEIGHTS
134
- if cfg.LIB == "TF"
135
- else (getattr(cfg.PT, mode).WEIGHTS if detectron2_available() else getattr(cfg.PT, mode).WEIGHTS_TS)
136
- )
137
- filter_categories = (
138
- getattr(getattr(cfg.TF, mode), "FILTER") if cfg.LIB == "TF" else getattr(getattr(cfg.PT, mode), "FILTER")
139
- )
140
- config_path = ModelCatalog.get_full_path_configs(weights)
141
- weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
142
- profile = ModelCatalog.get_profile(weights)
143
- categories = profile.categories if profile.categories is not None else {}
144
-
145
- if profile.model_wrapper in ("TPFrcnnDetector",):
146
- return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
147
- if profile.model_wrapper in ("D2FrcnnDetector",):
148
- return D2FrcnnDetector(
149
- config_path, weights_path, categories, device=cfg.DEVICE, filter_categories=filter_categories
150
- )
151
- if profile.model_wrapper in ("D2FrcnnTracingDetector",):
152
- return D2FrcnnTracingDetector(config_path, weights_path, categories, filter_categories=filter_categories)
153
- if profile.model_wrapper in ("HFDetrDerivedDetector",):
154
- preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
155
- return HFDetrDerivedDetector(
156
- config_path,
157
- weights_path,
158
- preprocessor_config,
159
- categories,
160
- device=cfg.DEVICE,
161
- filter_categories=filter_categories,
162
- )
163
- raise TypeError(
164
- f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
165
- f"compatability with your deep learning framework"
166
- )
167
-
168
-
169
- def build_padder(cfg: AttrDict, mode: str) -> PadTransform:
170
- """Building a padder according to the config
171
-
172
- :param cfg: Config
173
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
174
- :return `PadTransform` instance
175
- """
176
- top, right, bottom, left = (
177
- getattr(cfg.PT, mode).PAD.TOP,
178
- getattr(cfg.PT, mode).PAD.RIGHT,
179
- getattr(cfg.PT, mode).PAD.BOTTOM,
180
- getattr(cfg.PT, mode).PAD.LEFT,
181
- )
182
- return PadTransform(top=top, right=right, bottom=bottom, left=left)
183
-
184
-
185
- def build_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> ImageLayoutService:
186
- """Building a layout service with a given detector
187
-
188
- :param detector: will be passed to the `ImageLayoutService`
189
- :param cfg: Configuration
190
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
191
- :return `ImageLayoutService` instance
192
- """
193
- padder = None
194
- if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
195
- padder = build_padder(cfg, mode)
196
- return ImageLayoutService(detector, to_image=True, crop_image=True, padder=padder)
197
-
198
-
199
- def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> SubImageLayoutService:
200
- """
201
- Building a sub image layout service with a given detector
202
-
203
- :param detector: will be passed to the `SubImageLayoutService`
204
- :param cfg: Configuration
205
- :param mode: either `LAYOUT`,`CELL` or `ITEM`
206
- :return: `SubImageLayoutService` instance
207
- """
208
- exclude_category_ids = []
209
- padder = None
210
- if mode == "ITEM":
211
- if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
212
- exclude_category_ids.extend([1, 3, 4, 5, 6])
213
- padder = build_padder(cfg, mode)
214
- detect_result_generator = DetectResultGenerator(
215
- categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
216
- )
217
- return SubImageLayoutService(
218
- detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
219
- )
220
-
221
-
222
- def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
223
- """
224
- Building OCR predictor
225
- :param cfg: Config
226
- """
227
- if cfg.OCR.USE_TESSERACT:
228
- ocr_config_path = get_configs_dir_path() / cfg.OCR.CONFIG.TESSERACT
229
- return TesseractOcrDetector(
230
- ocr_config_path, config_overwrite=[f"LANGUAGES={cfg.LANGUAGE}"] if cfg.LANGUAGE is not None else None
231
- )
232
- if cfg.OCR.USE_DOCTR:
233
- weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
234
- weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
235
- profile = ModelCatalog.get_profile(weights)
236
- # get_full_path_configs will complete the path even if the model is not registered
237
- config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
238
- if profile.architecture is None:
239
- raise ValueError("model profile.architecture must be specified")
240
- return DoctrTextRecognizer(
241
- profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
242
- )
243
- if cfg.OCR.USE_TEXTRACT:
244
- credentials_kwargs = {
245
- "aws_access_key_id": environ.get("ACCESS_KEY", None),
246
- "aws_secret_access_key": environ.get("SECRET_KEY", None),
247
- "config": Config(region_name=environ.get("REGION", None)),
248
- }
249
- return TextractOcrDetector(**credentials_kwargs)
250
- raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
251
-
252
-
253
- def build_doctr_word(cfg: AttrDict) -> DoctrTextlineDetector:
254
- """Building `DoctrTextlineDetector` instance"""
255
- weights = cfg.OCR.WEIGHTS.DOCTR_WORD.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_WORD.PT
256
- weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
257
- profile = ModelCatalog.get_profile(weights)
258
- if profile.architecture is None:
259
- raise ValueError("model profile.architecture must be specified")
260
- if profile.categories is None:
261
- raise ValueError("model profile.categories must be specified")
262
- return DoctrTextlineDetector(profile.architecture, weights_path, profile.categories, cfg.DEVICE, lib=cfg.LIB)
263
-
264
-
265
- def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
266
- """
267
- Builds the analyzer with a given config
268
-
269
- :param cfg: A configuration
270
- :return: Analyzer pipeline
271
- """
272
- pipe_component_list: list[PipelineComponent] = []
273
-
274
- if cfg.USE_LAYOUT:
275
- d_layout = build_detector(cfg, "LAYOUT")
276
- layout = build_service(d_layout, cfg, "LAYOUT")
277
- pipe_component_list.append(layout)
278
-
279
- # setup layout nms service
280
- if cfg.LAYOUT_NMS_PAIRS.COMBINATIONS and cfg.USE_LAYOUT:
281
- if not detectron2_available() and cfg.LIB == "PT":
282
- raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
283
- if not isinstance(cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
284
- cfg.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
285
- ):
286
- raise ValueError("LAYOUT_NMS_PAIRS mus be a list of lists")
287
- layout_nms_serivce = AnnotationNmsService(
288
- cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, cfg.LAYOUT_NMS_PAIRS.THRESHOLDS, cfg.LAYOUT_NMS_PAIRS.PRIORITY
289
- )
290
- pipe_component_list.append(layout_nms_serivce)
291
-
292
- # setup tables service
293
- if cfg.USE_TABLE_SEGMENTATION:
294
- d_item = build_detector(cfg, "ITEM")
295
- item = build_sub_image_service(d_item, cfg, "ITEM")
296
- pipe_component_list.append(item)
297
-
298
- if d_item.__class__.__name__ not in ("HFDetrDerivedDetector",):
299
- d_cell = build_detector(cfg, "CELL")
300
- cell = build_sub_image_service(d_cell, cfg, "CELL")
301
- pipe_component_list.append(cell)
302
-
303
- if d_item.__class__.__name__ in ("HFDetrDerivedDetector",):
304
- pubtables = PubtablesSegmentationService(
305
- cfg.SEGMENTATION.ASSIGNMENT_RULE,
306
- cfg.SEGMENTATION.THRESHOLD_ROWS,
307
- cfg.SEGMENTATION.THRESHOLD_COLS,
308
- cfg.SEGMENTATION.FULL_TABLE_TILING,
309
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
310
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
311
- cfg.SEGMENTATION.CELL_CATEGORY_ID,
312
- LayoutType.TABLE,
313
- [
314
- CellType.SPANNING,
315
- CellType.ROW_HEADER,
316
- CellType.COLUMN_HEADER,
317
- CellType.PROJECTED_ROW_HEADER,
318
- LayoutType.CELL,
319
- ],
320
- [
321
- CellType.SPANNING,
322
- CellType.ROW_HEADER,
323
- CellType.COLUMN_HEADER,
324
- CellType.PROJECTED_ROW_HEADER,
325
- ],
326
- [LayoutType.ROW, LayoutType.COLUMN],
327
- [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
328
- stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
329
- )
330
- pipe_component_list.append(pubtables)
331
- else:
332
- table_segmentation = TableSegmentationService(
333
- cfg.SEGMENTATION.ASSIGNMENT_RULE,
334
- cfg.SEGMENTATION.THRESHOLD_ROWS,
335
- cfg.SEGMENTATION.THRESHOLD_COLS,
336
- cfg.SEGMENTATION.FULL_TABLE_TILING,
337
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
338
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
339
- LayoutType.TABLE,
340
- [CellType.HEADER, CellType.BODY, LayoutType.CELL],
341
- [LayoutType.ROW, LayoutType.COLUMN],
342
- [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
343
- cfg.SEGMENTATION.STRETCH_RULE,
344
- )
345
- pipe_component_list.append(table_segmentation)
346
-
347
- if cfg.USE_TABLE_REFINEMENT:
348
- table_segmentation_refinement = TableSegmentationRefinementService(
349
- [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
350
- [
351
- LayoutType.CELL,
352
- CellType.COLUMN_HEADER,
353
- CellType.PROJECTED_ROW_HEADER,
354
- CellType.SPANNING,
355
- CellType.ROW_HEADER,
356
- ],
357
- )
358
- pipe_component_list.append(table_segmentation_refinement)
359
-
360
- if cfg.USE_PDF_MINER:
361
- pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
362
- d_text = TextExtractionService(pdf_text)
363
- pipe_component_list.append(d_text)
364
-
365
- # setup ocr
366
- if cfg.USE_OCR:
367
- # the extra mile for DocTr
368
- if cfg.OCR.USE_DOCTR:
369
- d_word = build_doctr_word(cfg)
370
- word = ImageLayoutService(d_word, to_image=True, crop_image=True, skip_if_layout_extracted=True)
371
- pipe_component_list.append(word)
372
-
373
- ocr = build_ocr(cfg)
374
- skip_if_text_extracted = cfg.USE_PDF_MINER
375
- extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
376
- text = TextExtractionService(
377
- ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
378
- )
379
- pipe_component_list.append(text)
380
-
381
- if cfg.USE_PDF_MINER or cfg.USE_OCR:
382
- matcher = IntersectionMatcher(
383
- matching_rule=cfg.WORD_MATCHING.RULE,
384
- threshold=cfg.WORD_MATCHING.THRESHOLD,
385
- max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
386
- )
387
- match = MatchingService(
388
- parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
389
- child_categories=LayoutType.WORD,
390
- matcher=matcher,
391
- relationship_key=Relationships.CHILD,
392
- )
393
- pipe_component_list.append(match)
394
-
395
- order = TextOrderService(
396
- text_container=LayoutType.WORD,
397
- text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
398
- floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
399
- include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
400
- starting_point_tolerance=cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE,
401
- broken_line_tolerance=cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE,
402
- height_tolerance=cfg.TEXT_ORDERING.HEIGHT_TOLERANCE,
403
- paragraph_break=cfg.TEXT_ORDERING.PARAGRAPH_BREAK,
404
- )
405
- pipe_component_list.append(order)
406
-
407
- page_parsing_service = PageParsingService(
408
- text_container=LayoutType.WORD,
409
- floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
410
- include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
411
- )
412
- pipe = DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
413
-
414
- return pipe
415
-
416
-
417
91
  def get_dd_analyzer(
418
92
  reset_config_file: bool = True,
419
93
  config_overwrite: Optional[list[str]] = None,
@@ -457,7 +131,9 @@ def get_dd_analyzer(
457
131
  maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
458
132
 
459
133
  # Set up of the configuration and logging
460
- cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
134
+ file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
135
+ cfg.freeze(freezed=False)
136
+ cfg.overwrite_config(file_cfg)
461
137
 
462
138
  cfg.freeze(freezed=False)
463
139
  cfg.LANGUAGE = None
@@ -468,11 +144,11 @@ def get_dd_analyzer(
468
144
  if config_overwrite:
469
145
  cfg.update_args(config_overwrite)
470
146
 
471
- config_sanity_checks(cfg)
147
+ config_sanity_checks()
472
148
  logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict())) # type: ignore
473
149
 
474
150
  # will silent all TP logging while building the tower
475
151
  if tensorpack_available():
476
152
  disable_tp_layer_logging()
477
153
 
478
- return build_analyzer(cfg)
154
+ return ServiceFactory.build_analyzer(cfg)