deepdoctection 0.33__py3-none-any.whl → 0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection has been flagged as possibly problematic.

Files changed (40)
  1. deepdoctection/__init__.py +11 -12
  2. deepdoctection/analyzer/__init__.py +1 -0
  3. deepdoctection/analyzer/_config.py +150 -0
  4. deepdoctection/analyzer/dd.py +42 -358
  5. deepdoctection/analyzer/factory.py +522 -0
  6. deepdoctection/configs/conf_dd_one.yaml +1 -0
  7. deepdoctection/datapoint/annotation.py +41 -3
  8. deepdoctection/datapoint/convert.py +6 -4
  9. deepdoctection/datapoint/image.py +132 -46
  10. deepdoctection/datapoint/view.py +2 -1
  11. deepdoctection/datasets/base.py +1 -1
  12. deepdoctection/datasets/instances/fintabnet.py +1 -1
  13. deepdoctection/datasets/instances/xfund.py +29 -7
  14. deepdoctection/eval/eval.py +7 -1
  15. deepdoctection/extern/model.py +2 -1
  16. deepdoctection/extern/pdftext.py +96 -5
  17. deepdoctection/extern/tessocr.py +1 -0
  18. deepdoctection/mapper/cats.py +11 -13
  19. deepdoctection/mapper/cocostruct.py +6 -2
  20. deepdoctection/mapper/d2struct.py +2 -1
  21. deepdoctection/mapper/laylmstruct.py +1 -1
  22. deepdoctection/mapper/match.py +31 -0
  23. deepdoctection/mapper/misc.py +1 -1
  24. deepdoctection/mapper/prodigystruct.py +1 -1
  25. deepdoctection/pipe/anngen.py +27 -0
  26. deepdoctection/pipe/base.py +23 -0
  27. deepdoctection/pipe/common.py +123 -38
  28. deepdoctection/pipe/segment.py +1 -1
  29. deepdoctection/pipe/sub_layout.py +1 -1
  30. deepdoctection/utils/env_info.py +31 -2
  31. deepdoctection/utils/file_utils.py +19 -0
  32. deepdoctection/utils/fs.py +27 -4
  33. deepdoctection/utils/metacfg.py +12 -0
  34. deepdoctection/utils/pdf_utils.py +114 -6
  35. deepdoctection/utils/settings.py +3 -0
  36. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/METADATA +20 -11
  37. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/RECORD +40 -38
  38. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
  39. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
  40. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
@@ -26,85 +26,57 @@ Module for **deep**doctection analyzer.
 from __future__ import annotations

 import os
-from os import environ
-from shutil import copyfile
-from typing import Optional, Union
+from typing import Optional

-from lazy_imports import try_import
-
-from ..extern.base import ObjectDetector
-from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
-from ..extern.hfdetr import HFDetrDerivedDetector
-from ..extern.model import ModelCatalog, ModelDownloadManager
-from ..extern.pdftext import PdfPlumberTextDetector
 from ..extern.pt.ptutils import get_torch_device
-from ..extern.tessocr import TesseractOcrDetector
-from ..extern.texocr import TextractOcrDetector
 from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
-from ..extern.tpdetect import TPFrcnnDetector
-from ..pipe.base import PipelineComponent
-from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
-from ..pipe.layout import ImageLayoutService
-from ..pipe.order import TextOrderService
-from ..pipe.refine import TableSegmentationRefinementService
-from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
-from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
-from ..pipe.text import TextExtractionService
 from ..utils.env_info import ENV_VARS_TRUE
 from ..utils.error import DependencyError
-from ..utils.file_utils import detectron2_available, tensorpack_available
-from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
+from ..utils.file_utils import tensorpack_available
+from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
 from ..utils.logger import LoggingRecord, logger
-from ..utils.metacfg import AttrDict, set_config_by_yaml
-from ..utils.settings import CellType, LayoutType
-from ..utils.transform import PadTransform
+from ..utils.metacfg import set_config_by_yaml
 from ..utils.types import PathLikeOrStr
-
-with try_import() as image_guard:
-    from botocore.config import Config  # type: ignore
-
+from ._config import cfg
+from .factory import ServiceFactory

 __all__ = [
-    "maybe_copy_config_to_cache",
     "config_sanity_checks",
-    "build_detector",
-    "build_padder",
-    "build_service",
-    "build_sub_image_service",
-    "build_ocr",
-    "build_doctr_word",
     "get_dd_analyzer",
-    "build_analyzer",
 ]

 _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
 _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
-
-
-def maybe_copy_config_to_cache(
-    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
-) -> str:
-    """
-    Initial copying of various files
-    :param package_path: base path to directory of source file `file_name`
-    :param configs_dir_path: base path to target directory
-    :param file_name: file to copy
-    :param force_copy: If file is already in target directory, will re-copy the file
-
-    :return: path to the copied file_name
-    """
-
-    absolute_path_source = os.path.join(package_path, file_name)
-    absolute_path = os.path.join(configs_dir_path, os.path.join("dd", os.path.split(file_name)[1]))
-    mkdir_p(os.path.split(absolute_path)[0])
-    if not os.path.isfile(absolute_path) or force_copy:
-        copyfile(absolute_path_source, absolute_path)
-    return absolute_path
-
-
-def config_sanity_checks(cfg: AttrDict) -> None:
+_MODEL_CHOICES = {
+    "layout": [
+        "layout/d2_model_0829999_layout_inf_only.pt",
+        "xrf_layout/model_final_inf_only.pt",
+        "microsoft/table-transformer-detection/pytorch_model.bin",
+    ],
+    "segmentation": [
+        "item/model-1620000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
+    ],
+    "ocr": ["Tesseract", "DocTr", "Textract"],
+    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
+    "doctr_recognition": [
+        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
+        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
+    ],
+    "llm": ["gpt-3.5-turbo", "gpt-4"],
+    "segmentation_choices": {
+        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
+    },
+}
+
+
+def config_sanity_checks() -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
@@ -116,296 +88,6 @@ def config_sanity_checks(cfg: AttrDict) -> None:
         )


-def build_detector(
-    cfg: AttrDict, mode: str
-) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
-    """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
-    the config
-
-    :param cfg: Config
-    :param mode: either `LAYOUT`,`CELL` or `ITEM`
-    """
-    weights = (
-        getattr(cfg.TF, mode).WEIGHTS
-        if cfg.LIB == "TF"
-        else (getattr(cfg.PT, mode).WEIGHTS if detectron2_available() else getattr(cfg.PT, mode).WEIGHTS_TS)
-    )
-    filter_categories = (
-        getattr(getattr(cfg.TF, mode), "FILTER") if cfg.LIB == "TF" else getattr(getattr(cfg.PT, mode), "FILTER")
-    )
-    config_path = ModelCatalog.get_full_path_configs(weights)
-    weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
-    profile = ModelCatalog.get_profile(weights)
-    categories = profile.categories if profile.categories is not None else {}
-
-    if profile.model_wrapper in ("TPFrcnnDetector",):
-        return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
-    if profile.model_wrapper in ("D2FrcnnDetector",):
-        return D2FrcnnDetector(
-            config_path, weights_path, categories, device=cfg.DEVICE, filter_categories=filter_categories
-        )
-    if profile.model_wrapper in ("D2FrcnnTracingDetector",):
-        return D2FrcnnTracingDetector(config_path, weights_path, categories, filter_categories=filter_categories)
-    if profile.model_wrapper in ("HFDetrDerivedDetector",):
-        preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
-        return HFDetrDerivedDetector(
-            config_path,
-            weights_path,
-            preprocessor_config,
-            categories,
-            device=cfg.DEVICE,
-            filter_categories=filter_categories,
-        )
-    raise TypeError(
-        f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
-        f"compatability with your deep learning framework"
-    )
-
-
-def build_padder(cfg: AttrDict, mode: str) -> PadTransform:
-    """Building a padder according to the config
-
-    :param cfg: Config
-    :param mode: either `LAYOUT`,`CELL` or `ITEM`
-    :return `PadTransform` instance
-    """
-    top, right, bottom, left = (
-        getattr(cfg.PT, mode).PAD.TOP,
-        getattr(cfg.PT, mode).PAD.RIGHT,
-        getattr(cfg.PT, mode).PAD.BOTTOM,
-        getattr(cfg.PT, mode).PAD.LEFT,
-    )
-    return PadTransform(top=top, right=right, bottom=bottom, left=left)
-
-
-def build_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> ImageLayoutService:
-    """Building a layout service with a given detector
-
-    :param detector: will be passed to the `ImageLayoutService`
-    :param cfg: Configuration
-    :param mode: either `LAYOUT`,`CELL` or `ITEM`
-    :return `ImageLayoutService` instance
-    """
-    padder = None
-    if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-        padder = build_padder(cfg, mode)
-    return ImageLayoutService(detector, to_image=True, crop_image=True, padder=padder)
-
-
-def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str) -> SubImageLayoutService:
-    """
-    Building a sub image layout service with a given detector
-
-    :param detector: will be passed to the `SubImageLayoutService`
-    :param cfg: Configuration
-    :param mode: either `LAYOUT`,`CELL` or `ITEM`
-    :return: `SubImageLayoutService` instance
-    """
-    exclude_category_ids = []
-    padder = None
-    if mode == "ITEM":
-        if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-            exclude_category_ids.extend([1, 3, 4, 5, 6])
-            padder = build_padder(cfg, mode)
-    detect_result_generator = DetectResultGenerator(
-        categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
-    )
-    return SubImageLayoutService(
-        detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
-    )
-
-
-def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
-    """
-    Building OCR predictor
-    :param cfg: Config
-    """
-    if cfg.OCR.USE_TESSERACT:
-        ocr_config_path = get_configs_dir_path() / cfg.OCR.CONFIG.TESSERACT
-        return TesseractOcrDetector(
-            ocr_config_path, config_overwrite=[f"LANGUAGES={cfg.LANGUAGE}"] if cfg.LANGUAGE is not None else None
-        )
-    if cfg.OCR.USE_DOCTR:
-        weights = cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
-        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
-        profile = ModelCatalog.get_profile(weights)
-        # get_full_path_configs will complete the path even if the model is not registered
-        config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
-        if profile.architecture is None:
-            raise ValueError("model profile.architecture must be specified")
-        return DoctrTextRecognizer(
-            profile.architecture, weights_path, cfg.DEVICE, lib=cfg.LIB, path_config_json=config_path
-        )
-    if cfg.OCR.USE_TEXTRACT:
-        credentials_kwargs = {
-            "aws_access_key_id": environ.get("ACCESS_KEY", None),
-            "aws_secret_access_key": environ.get("SECRET_KEY", None),
-            "config": Config(region_name=environ.get("REGION", None)),
-        }
-        return TextractOcrDetector(**credentials_kwargs)
-    raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
-
-
-def build_doctr_word(cfg: AttrDict) -> DoctrTextlineDetector:
-    """Building `DoctrTextlineDetector` instance"""
-    weights = cfg.OCR.WEIGHTS.DOCTR_WORD.TF if cfg.LIB == "TF" else cfg.OCR.WEIGHTS.DOCTR_WORD.PT
-    weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
-    profile = ModelCatalog.get_profile(weights)
-    if profile.architecture is None:
-        raise ValueError("model profile.architecture must be specified")
-    if profile.categories is None:
-        raise ValueError("model profile.categories must be specified")
-    return DoctrTextlineDetector(profile.architecture, weights_path, profile.categories, cfg.DEVICE, lib=cfg.LIB)
-
-
-def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
-    """
-    Builds the analyzer with a given config
-
-    :param cfg: A configuration
-    :return: Analyzer pipeline
-    """
-    pipe_component_list: list[PipelineComponent] = []
-
-    if cfg.USE_LAYOUT:
-        d_layout = build_detector(cfg, "LAYOUT")
-        layout = build_service(d_layout, cfg, "LAYOUT")
-        pipe_component_list.append(layout)
-
-    # setup layout nms service
-    if cfg.LAYOUT_NMS_PAIRS.COMBINATIONS and cfg.USE_LAYOUT:
-        if not detectron2_available() and cfg.LIB == "PT":
-            raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
-        if not isinstance(cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
-            cfg.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
-        ):
-            raise ValueError("LAYOUT_NMS_PAIRS mus be a list of lists")
-        layout_nms_serivce = AnnotationNmsService(
-            cfg.LAYOUT_NMS_PAIRS.COMBINATIONS, cfg.LAYOUT_NMS_PAIRS.THRESHOLDS, cfg.LAYOUT_NMS_PAIRS.PRIORITY
-        )
-        pipe_component_list.append(layout_nms_serivce)
-
-    # setup tables service
-    if cfg.USE_TABLE_SEGMENTATION:
-        d_item = build_detector(cfg, "ITEM")
-        item = build_sub_image_service(d_item, cfg, "ITEM")
-        pipe_component_list.append(item)
-
-        if d_item.__class__.__name__ not in ("HFDetrDerivedDetector",):
-            d_cell = build_detector(cfg, "CELL")
-            cell = build_sub_image_service(d_cell, cfg, "CELL")
-            pipe_component_list.append(cell)
-
-        if d_item.__class__.__name__ in ("HFDetrDerivedDetector",):
-            pubtables = PubtablesSegmentationService(
-                cfg.SEGMENTATION.ASSIGNMENT_RULE,
-                cfg.SEGMENTATION.THRESHOLD_ROWS,
-                cfg.SEGMENTATION.THRESHOLD_COLS,
-                cfg.SEGMENTATION.FULL_TABLE_TILING,
-                cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
-                cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-                cfg.SEGMENTATION.CELL_CATEGORY_ID,
-                LayoutType.TABLE,
-                [
-                    CellType.SPANNING,
-                    CellType.ROW_HEADER,
-                    CellType.COLUMN_HEADER,
-                    CellType.PROJECTED_ROW_HEADER,
-                    LayoutType.CELL,
-                ],
-                [
-                    CellType.SPANNING,
-                    CellType.ROW_HEADER,
-                    CellType.COLUMN_HEADER,
-                    CellType.PROJECTED_ROW_HEADER,
-                ],
-                [LayoutType.ROW, LayoutType.COLUMN],
-                [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
-                stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
-            )
-            pipe_component_list.append(pubtables)
-        else:
-            table_segmentation = TableSegmentationService(
-                cfg.SEGMENTATION.ASSIGNMENT_RULE,
-                cfg.SEGMENTATION.THRESHOLD_ROWS,
-                cfg.SEGMENTATION.THRESHOLD_COLS,
-                cfg.SEGMENTATION.FULL_TABLE_TILING,
-                cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
-                cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-                LayoutType.TABLE,
-                [CellType.HEADER, CellType.BODY, LayoutType.CELL],
-                [LayoutType.ROW, LayoutType.COLUMN],
-                [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
-                cfg.SEGMENTATION.STRETCH_RULE,
-            )
-            pipe_component_list.append(table_segmentation)
-
-        if cfg.USE_TABLE_REFINEMENT:
-            table_segmentation_refinement = TableSegmentationRefinementService(
-                [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
-                [
-                    LayoutType.CELL,
-                    CellType.COLUMN_HEADER,
-                    CellType.PROJECTED_ROW_HEADER,
-                    CellType.SPANNING,
-                    CellType.ROW_HEADER,
-                ],
-            )
-            pipe_component_list.append(table_segmentation_refinement)
-
-    if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
-        d_text = TextExtractionService(pdf_text)
-        pipe_component_list.append(d_text)
-
-    # setup ocr
-    if cfg.USE_OCR:
-        # the extra mile for DocTr
-        if cfg.OCR.USE_DOCTR:
-            d_word = build_doctr_word(cfg)
-            word = ImageLayoutService(d_word, to_image=True, crop_image=True, skip_if_layout_extracted=True)
-            pipe_component_list.append(word)
-
-        ocr = build_ocr(cfg)
-        skip_if_text_extracted = cfg.USE_PDF_MINER
-        extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
-        text = TextExtractionService(
-            ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
-        )
-        pipe_component_list.append(text)
-
-    if cfg.USE_PDF_MINER or cfg.USE_OCR:
-        match = MatchingService(
-            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.WORD,
-            matching_rule=cfg.WORD_MATCHING.RULE,
-            threshold=cfg.WORD_MATCHING.THRESHOLD,
-            max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
-        )
-        pipe_component_list.append(match)

-        order = TextOrderService(
-            text_container=LayoutType.WORD,
-            text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
-            floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
-            include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
-            starting_point_tolerance=cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE,
-            broken_line_tolerance=cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE,
-            height_tolerance=cfg.TEXT_ORDERING.HEIGHT_TOLERANCE,
-            paragraph_break=cfg.TEXT_ORDERING.PARAGRAPH_BREAK,
-        )
-        pipe_component_list.append(order)
-
-    page_parsing_service = PageParsingService(
-        text_container=LayoutType.WORD,
-        floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
-        include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
-    )
-    pipe = DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
-
-    return pipe
-
-
 def get_dd_analyzer(
     reset_config_file: bool = True,
     config_overwrite: Optional[list[str]] = None,
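
Note: the removed `build_*` helpers above are superseded by the new `ServiceFactory` in `deepdoctection/analyzer/factory.py` (+522 lines), and `maybe_copy_config_to_cache` now lives in `deepdoctection/utils/fs.py`. A minimal sketch of the import locations implied by this diff; building the analyzer directly like this assumes `cfg` has already been populated the way `get_dd_analyzer` does it:

    # Sketch only: imports implied by the diff, not a drop-in replacement for get_dd_analyzer.
    from deepdoctection.analyzer._config import cfg
    from deepdoctection.analyzer.factory import ServiceFactory
    from deepdoctection.utils.fs import maybe_copy_config_to_cache

    analyzer = ServiceFactory.build_analyzer(cfg)  # replaces the removed build_analyzer(cfg)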
@@ -444,12 +126,14 @@ def get_dd_analyzer(
     else:
         raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
-        get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
+        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
     )
-    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
+    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)

     # Set up of the configuration and logging
-    cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
+    file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
+    cfg.freeze(freezed=False)
+    cfg.overwrite_config(file_cfg)

     cfg.freeze(freezed=False)
     cfg.LANGUAGE = None
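
Note: with this change the YAML file no longer becomes the config object itself; its values are merged into the package-level `cfg` from `._config`. A rough, hedged equivalent of the new bootstrap, assuming the `AttrDict` semantics of `deepdoctection.utils.metacfg`:

    from deepdoctection.analyzer._config import cfg
    from deepdoctection.utils.fs import get_configs_dir_path
    from deepdoctection.utils.metacfg import set_config_by_yaml

    # Load the cached YAML and merge it into the packaged defaults (sketch, not verbatim library code).
    file_cfg = set_config_by_yaml(get_configs_dir_path() / "dd" / "conf_dd_one.yaml")
    cfg.freeze(freezed=False)       # unfreeze the default config
    cfg.overwrite_config(file_cfg)  # overwrite defaults with the values from the YAML file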
@@ -460,11 +144,11 @@ def get_dd_analyzer(
     if config_overwrite:
         cfg.update_args(config_overwrite)

-    config_sanity_checks(cfg)
+    config_sanity_checks()
     logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

     # will silent all TP logging while building the tower
     if tensorpack_available():
         disable_tp_layer_logging()

-    return build_analyzer(cfg)
+    return ServiceFactory.build_analyzer(cfg)
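
The public entry point is unchanged by this refactor: `get_dd_analyzer` still returns a `DoctectionPipe`; only the construction of its components moved into `ServiceFactory`. A minimal usage sketch with the standard deepdoctection API (the document path is a placeholder):

    import deepdoctection as dd

    analyzer = dd.get_dd_analyzer()                     # internally calls ServiceFactory.build_analyzer(cfg)
    df = analyzer.analyze(path="path/to/document.pdf")  # placeholder path
    df.reset_state()
    for page in df:
        print(page.text)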