deepdoctection 0.34__py3-none-any.whl → 0.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -0,0 +1,718 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: factory.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Factory for building the deepdoctection analyzer pipeline"""
19
+
20
+
21
+ from os import environ
22
+ from typing import Union
23
+
24
+ from lazy_imports import try_import
25
+
26
+ from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
27
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
28
+ from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
29
+ from ..extern.hfdetr import HFDetrDerivedDetector
30
+ from ..extern.model import ModelCatalog, ModelDownloadManager
31
+ from ..extern.pdftext import PdfPlumberTextDetector
32
+ from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
33
+ from ..extern.texocr import TextractOcrDetector
34
+ from ..extern.tpdetect import TPFrcnnDetector
35
+ from ..pipe.base import PipelineComponent
36
+ from ..pipe.common import (
37
+ AnnotationNmsService,
38
+ IntersectionMatcher,
39
+ MatchingService,
40
+ NeighbourMatcher,
41
+ PageParsingService,
42
+ )
43
+ from ..pipe.doctectionpipe import DoctectionPipe
44
+ from ..pipe.layout import ImageLayoutService
45
+ from ..pipe.order import TextOrderService
46
+ from ..pipe.refine import TableSegmentationRefinementService
47
+ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
48
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
49
+ from ..pipe.text import TextExtractionService
50
+ from ..pipe.transform import SimpleTransformService
51
+ from ..utils.file_utils import detectron2_available
52
+ from ..utils.fs import get_configs_dir_path
53
+ from ..utils.metacfg import AttrDict
54
+ from ..utils.settings import LayoutType, Relationships
55
+ from ..utils.transform import PadTransform
56
+
57
+ with try_import() as image_guard:
58
+ from botocore.config import Config # type: ignore
59
+
60
+
61
+ __all__ = [
62
+ "ServiceFactory",
63
+ ]
64
+
65
+ # from ._config import cfg
66
+
67
+
68
class ServiceFactory:
    """
    Factory class for building the components of the deepdoctection analyzer pipeline.

    All builders are static methods. Those that depend on settings receive the configuration explicitly as an
    `AttrDict` argument (`config`), so no state is kept on the class and it never needs to be instantiated.

    The class provides methods to build the different services and predictors required for the pipeline, such as
    layout detectors, OCR detectors, table segmentation services, and more. Creating the predictors
    (e.g. `ObjectDetector`, `TextRecognizer`) is kept separate from creating the pipeline services wrapping them,
    which allows a flexible and modular construction of the pipeline.

    Every public `build_*` method delegates to a private `_build_*` counterpart where one exists.

    Extending the Class:
        This class can be extended by using inheritance and adding new methods or overriding existing ones.
        New settings required by such methods must be added to the configuration object passed to the builders.
    """

    @staticmethod
    def _build_layout_detector(
        config: AttrDict,
        mode: str,
    ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
        """Building a D2-Detector, a TP-Detector, a Detr-Detector or a D2-Torch Tracing Detector according to
        the config

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: detector instance matching `model_wrapper` of the model profile
        :raises TypeError: if the `model_wrapper` of the profile is none of the supported detector classes
        """
        # Only the section of the selected framework is read
        mode_config = getattr(config.TF if config.LIB == "TF" else config.PT, mode)
        if config.LIB == "TF":
            weights = mode_config.WEIGHTS
        else:
            # Without detectron2 the weights configured under WEIGHTS_TS are used
            weights = mode_config.WEIGHTS if detectron2_available() else mode_config.WEIGHTS_TS
        filter_categories = mode_config.FILTER

        config_path = ModelCatalog.get_full_path_configs(weights)
        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
        profile = ModelCatalog.get_profile(weights)
        categories = profile.categories if profile.categories is not None else {}

        if profile.model_wrapper in ("TPFrcnnDetector",):
            return TPFrcnnDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("D2FrcnnDetector",):
            return D2FrcnnDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                device=config.DEVICE,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("D2FrcnnTracingDetector",):
            return D2FrcnnTracingDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("HFDetrDerivedDetector",):
            preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
            return HFDetrDerivedDetector(
                path_config_json=config_path,
                path_weights=weights_path,
                path_feature_extractor_config_json=preprocessor_config,
                categories=categories,
                device=config.DEVICE,
                filter_categories=filter_categories,
            )
        raise TypeError(
            f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
            f"compatability with your deep learning framework"
        )

    @staticmethod
    def build_layout_detector(
        config: AttrDict, mode: str
    ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
        """Building a layout detector according to the config

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: detector instance matching `model_wrapper` of the model profile
        :raises TypeError: if the `model_wrapper` of the profile is none of the supported detector classes
        """
        return ServiceFactory._build_layout_detector(config, mode)

    @staticmethod
    def _build_rotation_detector() -> TesseractRotationTransformer:
        """Building a rotation detector

        :return: `TesseractRotationTransformer` instance
        """
        return TesseractRotationTransformer()

    @staticmethod
    def build_rotation_detector() -> TesseractRotationTransformer:
        """Building a rotation detector

        :return: `TesseractRotationTransformer` instance
        """
        return ServiceFactory._build_rotation_detector()

    @staticmethod
    def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
        """Building a transform service with a given predictor

        :param transform_predictor: will be passed to the `SimpleTransformService`
        :return: `SimpleTransformService` instance
        """
        return SimpleTransformService(transform_predictor)

    @staticmethod
    def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
        """Building a transform service with a given predictor

        :param transform_predictor: will be passed to the `SimpleTransformService`
        :return: `SimpleTransformService` instance
        """
        return ServiceFactory._build_transform_service(transform_predictor)

    @staticmethod
    def _build_padder(config: AttrDict, mode: str) -> PadTransform:
        """Building a padder according to the config. The padding values are always read from the `PT` section.

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `PadTransform` instance
        """
        pad_config = getattr(config.PT, mode).PAD
        return PadTransform(
            top=pad_config.TOP,
            right=pad_config.RIGHT,
            bottom=pad_config.BOTTOM,
            left=pad_config.LEFT,
        )

    @staticmethod
    def build_padder(config: AttrDict, mode: str) -> PadTransform:
        """Building a padder according to the config

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `PadTransform` instance
        """
        return ServiceFactory._build_padder(config, mode)

    @staticmethod
    def _build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
        """Building a layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `ImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `ImageLayoutService` instance
        """
        # A padder is only configured for the `HFDetrDerivedDetector`
        padder = None
        if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            padder = ServiceFactory.build_padder(config, mode=mode)
        return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)

    @staticmethod
    def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
        """Building a layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `ImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `ImageLayoutService` instance
        """
        return ServiceFactory._build_layout_service(config, detector, mode)

    @staticmethod
    def _build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
        """Building a NMS service for layout annotations

        :param config: configuration object
        :return: `AnnotationNmsService` instance
        :raises ModuleNotFoundError: if `config.LIB` is `PT` and detectron2 is not available
        :raises ValueError: if `config.LAYOUT_NMS_PAIRS.COMBINATIONS` is not a list of lists
        """
        if config.LIB == "PT" and not detectron2_available():
            raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
        nms_pairs = config.LAYOUT_NMS_PAIRS.COMBINATIONS
        # Both conditions must hold: the container is a list AND each entry is a list. Checking the container
        # first also avoids indexing into values that are not lists at all.
        if not isinstance(nms_pairs, list) or not all(isinstance(pair, list) for pair in nms_pairs):
            raise ValueError("LAYOUT_NMS_PAIRS must be a list of lists")
        return AnnotationNmsService(
            nms_pairs=nms_pairs,
            thresholds=config.LAYOUT_NMS_PAIRS.THRESHOLDS,
            priority=config.LAYOUT_NMS_PAIRS.PRIORITY,
        )

    @staticmethod
    def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
        """Building a NMS service for layout annotations

        :param config: configuration object
        :return: `AnnotationNmsService` instance
        :raises ModuleNotFoundError: if `config.LIB` is `PT` and detectron2 is not available
        :raises ValueError: if `config.LAYOUT_NMS_PAIRS.COMBINATIONS` is not a list of lists
        """
        return ServiceFactory._build_layout_nms_service(config)

    @staticmethod
    def _build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
        """
        Building a sub image layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `SubImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `SubImageLayoutService` instance
        """
        exclude_category_ids = []
        padder = None
        # Only the combination of mode `ITEM` and a `HFDetrDerivedDetector` gets excluded category ids and a padder
        if mode == "ITEM" and detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            exclude_category_ids.extend([1, 3, 4, 5, 6])
            padder = ServiceFactory.build_padder(config, mode)
        detect_result_generator = DetectResultGenerator(
            categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
        )
        return SubImageLayoutService(
            sub_image_detector=detector,
            sub_image_names=[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
            category_id_mapping=None,
            detect_result_generator=detect_result_generator,
            padder=padder,
        )

    @staticmethod
    def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
        """
        Building a sub image layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `SubImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `SubImageLayoutService` instance
        """
        return ServiceFactory._build_sub_image_service(config, detector, mode)

    @staticmethod
    def _build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
        """
        Building OCR predictor. The flags `config.OCR.USE_TESSERACT`, `config.OCR.USE_DOCTR` and
        `config.OCR.USE_TEXTRACT` are checked in this order and the first one being set wins.

        :param config: configuration object
        :return: OCR predictor instance
        :raises ValueError: if none of the three flags is set or if the DocTr model profile has no architecture
        """
        if config.OCR.USE_TESSERACT:
            ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
            return TesseractOcrDetector(
                ocr_config_path,
                config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
            )
        if config.OCR.USE_DOCTR:
            if config.LIB == "TF":
                weights = config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
            else:
                weights = config.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
            weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
            profile = ModelCatalog.get_profile(weights)
            # get_full_path_configs will complete the path even if the model is not registered
            config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
            if profile.architecture is None:
                raise ValueError("model profile.architecture must be specified")
            return DoctrTextRecognizer(
                architecture=profile.architecture,
                path_weights=weights_path,
                device=config.DEVICE,
                lib=config.LIB,
                path_config_json=config_path,
            )
        if config.OCR.USE_TEXTRACT:
            # Credentials and region are read from the environment variables ACCESS_KEY, SECRET_KEY and REGION
            credentials_kwargs = {
                "aws_access_key_id": environ.get("ACCESS_KEY", None),
                "aws_secret_access_key": environ.get("SECRET_KEY", None),
                "config": Config(region_name=environ.get("REGION", None)),
            }
            return TextractOcrDetector(**credentials_kwargs)
        raise ValueError(
            "You have set USE_OCR=True but none of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to True"
        )

    @staticmethod
    def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
        """
        Building OCR predictor

        :param config: configuration object
        :return: OCR predictor instance
        :raises ValueError: if no OCR engine is selected or if the DocTr model profile has no architecture
        """
        return ServiceFactory._build_ocr_detector(config)

    @staticmethod
    def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
        """Building `DoctrTextlineDetector` instance

        :param config: configuration object
        :return: DoctrTextlineDetector
        :raises ValueError: if the model profile lacks `architecture` or `categories`
        """
        weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
        profile = ModelCatalog.get_profile(weights)
        if profile.architecture is None:
            raise ValueError("model profile.architecture must be specified")
        if profile.categories is None:
            raise ValueError("model profile.categories must be specified")
        return DoctrTextlineDetector(
            profile.architecture, weights_path, profile.categories, config.DEVICE, lib=config.LIB
        )

    @staticmethod
    def _build_table_segmentation_service(
        config: AttrDict,
        detector: ObjectDetector,
    ) -> Union[PubtablesSegmentationService, TableSegmentationService]:
        """
        Build and return a table segmentation service based on the provided detector.

        Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
        `TableSegmentationService` instance. The selection is made as follows:

        - If the class name of the detector is `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created
          and returned. It is parametrized with the `PUBTABLES_*` names and the `CELL_CATEGORY_ID` of
          `config.SEGMENTATION`.
        - For other detector types, a `TableSegmentationService` is created and returned. It is parametrized with
          `CELL_NAMES`, `ITEM_NAMES` and `SUB_ITEM_NAMES` of `config.SEGMENTATION`.

        Rules, thresholds and the table name are taken from `config.SEGMENTATION` in both cases.

        :param config: configuration object
        :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
                         service to build.
        :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
                 detector type.
        """
        segmentation = config.SEGMENTATION
        if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            return PubtablesSegmentationService(
                segment_rule=segmentation.ASSIGNMENT_RULE,
                threshold_rows=segmentation.THRESHOLD_ROWS,
                threshold_cols=segmentation.THRESHOLD_COLS,
                tile_table_with_items=segmentation.FULL_TABLE_TILING,
                remove_iou_threshold_rows=segmentation.REMOVE_IOU_THRESHOLD_ROWS,
                remove_iou_threshold_cols=segmentation.REMOVE_IOU_THRESHOLD_COLS,
                cell_class_id=segmentation.CELL_CATEGORY_ID,
                table_name=segmentation.TABLE_NAME,
                cell_names=segmentation.PUBTABLES_CELL_NAMES,
                spanning_cell_names=segmentation.PUBTABLES_SPANNING_CELL_NAMES,
                item_names=segmentation.PUBTABLES_ITEM_NAMES,
                sub_item_names=segmentation.PUBTABLES_SUB_ITEM_NAMES,
                stretch_rule=segmentation.STRETCH_RULE,
            )
        return TableSegmentationService(
            segment_rule=segmentation.ASSIGNMENT_RULE,
            threshold_rows=segmentation.THRESHOLD_ROWS,
            threshold_cols=segmentation.THRESHOLD_COLS,
            tile_table_with_items=segmentation.FULL_TABLE_TILING,
            remove_iou_threshold_rows=segmentation.REMOVE_IOU_THRESHOLD_ROWS,
            remove_iou_threshold_cols=segmentation.REMOVE_IOU_THRESHOLD_COLS,
            table_name=segmentation.TABLE_NAME,
            cell_names=segmentation.CELL_NAMES,
            item_names=segmentation.ITEM_NAMES,
            sub_item_names=segmentation.SUB_ITEM_NAMES,
            stretch_rule=segmentation.STRETCH_RULE,
        )

    @staticmethod
    def build_table_segmentation_service(
        config: AttrDict,
        detector: ObjectDetector,
    ) -> Union[PubtablesSegmentationService, TableSegmentationService]:
        """
        Build and return a table segmentation service based on the provided detector.

        Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
        `TableSegmentationService` instance. The selection is made as follows:

        - If the class name of the detector is `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created
          and returned.
        - For other detector types, a `TableSegmentationService` is created and returned.

        Both services are parametrized with the settings of `config.SEGMENTATION`.

        :param config: configuration object
        :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
                         service to build.
        :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
                 detector type.
        """
        return ServiceFactory._build_table_segmentation_service(config, detector)

    @staticmethod
    def _build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
        """Building a table segmentation refinement service from `config.SEGMENTATION.TABLE_NAME` and
        `config.SEGMENTATION.PUBTABLES_CELL_NAMES`

        :param config: configuration object
        :return: TableSegmentationRefinementService
        """
        return TableSegmentationRefinementService(
            [config.SEGMENTATION.TABLE_NAME],
            config.SEGMENTATION.PUBTABLES_CELL_NAMES,
        )

    @staticmethod
    def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
        """Building a table segmentation refinement service

        :param config: configuration object
        :return: TableSegmentationRefinementService
        """
        return ServiceFactory._build_table_refinement_service(config)

    @staticmethod
    def _build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
        """Building a PDF text detector with the tolerances given in `config.PDF_MINER`

        :param config: configuration object
        :return: PdfPlumberTextDetector
        """
        return PdfPlumberTextDetector(
            x_tolerance=config.PDF_MINER.X_TOLERANCE, y_tolerance=config.PDF_MINER.Y_TOLERANCE
        )

    @staticmethod
    def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
        """Building a PDF text detector

        :param config: configuration object
        :return: PdfPlumberTextDetector
        """
        return ServiceFactory._build_pdf_text_detector(config)

    @staticmethod
    def _build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
        """Building a PDFMiner text extraction service

        :param detector: PdfMiner
        :return: TextExtractionService
        """
        return TextExtractionService(detector)

    @staticmethod
    def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
        """Building a PDFMiner text extraction service

        :param detector: PdfMiner
        :return: TextExtractionService
        """
        return ServiceFactory._build_pdf_miner_text_service(detector)

    @staticmethod
    def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
        """Building a Doctr word detector service

        :param detector: DoctrTextlineDetector
        :return: ImageLayoutService
        """
        return ImageLayoutService(
            layout_detector=detector, to_image=True, crop_image=True, skip_if_layout_extracted=True
        )

    @staticmethod
    def _build_text_extraction_service(
        config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
    ) -> TextExtractionService:
        """Building a text extraction service

        :param config: configuration object
        :param detector: OCR detector
        :return: TextExtractionService
        """
        # Only with DocTr the extraction is restricted to regions of category `config.TEXT_CONTAINER`
        extract_from_roi = config.TEXT_CONTAINER if config.OCR.USE_DOCTR else None
        return TextExtractionService(
            detector,
            skip_if_text_extracted=config.USE_PDF_MINER,
            extract_from_roi=extract_from_roi,
        )

    @staticmethod
    def build_text_extraction_service(
        config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
    ) -> TextExtractionService:
        """Building a text extraction service

        :param config: configuration object
        :param detector: OCR detector
        :return: TextExtractionService
        """
        return ServiceFactory._build_text_extraction_service(config, detector)

    @staticmethod
    def _build_word_matching_service(config: AttrDict) -> MatchingService:
        """Building a word matching service. It uses an `IntersectionMatcher` configured by `config.WORD_MATCHING`
        and stores the matches under `Relationships.CHILD`.

        :param config: configuration object
        :return: MatchingService
        """
        matcher = IntersectionMatcher(
            matching_rule=config.WORD_MATCHING.RULE,
            threshold=config.WORD_MATCHING.THRESHOLD,
            max_parent_only=config.WORD_MATCHING.MAX_PARENT_ONLY,
        )
        return MatchingService(
            parent_categories=config.WORD_MATCHING.PARENTAL_CATEGORIES,
            child_categories=config.TEXT_CONTAINER,
            matcher=matcher,
            relationship_key=Relationships.CHILD,
        )

    @staticmethod
    def build_word_matching_service(config: AttrDict) -> MatchingService:
        """Building a word matching service

        :param config: configuration object
        :return: MatchingService
        """
        return ServiceFactory._build_word_matching_service(config)

    @staticmethod
    def _build_layout_link_matching_service(config: AttrDict) -> MatchingService:
        """Building a layout link matching service. It uses a `NeighbourMatcher` with the categories given in
        `config.LAYOUT_LINK` and stores the matches under `Relationships.LAYOUT_LINK`.

        :param config: configuration object
        :return: MatchingService
        """
        neighbor_matcher = NeighbourMatcher()
        return MatchingService(
            parent_categories=config.LAYOUT_LINK.PARENTAL_CATEGORIES,
            child_categories=config.LAYOUT_LINK.CHILD_CATEGORIES,
            matcher=neighbor_matcher,
            relationship_key=Relationships.LAYOUT_LINK,
        )

    @staticmethod
    def build_layout_link_matching_service(config: AttrDict) -> MatchingService:
        """Building a layout link matching service

        :param config: configuration object
        :return: MatchingService
        """
        return ServiceFactory._build_layout_link_matching_service(config)

    @staticmethod
    def _build_text_order_service(config: AttrDict) -> TextOrderService:
        """Building a text order service from `config.TEXT_CONTAINER` and the settings in `config.TEXT_ORDERING`

        :param config: configuration object
        :return: TextOrderService instance
        """
        ordering = config.TEXT_ORDERING
        return TextOrderService(
            text_container=config.TEXT_CONTAINER,
            text_block_categories=ordering.TEXT_BLOCK_CATEGORIES,
            floating_text_block_categories=ordering.FLOATING_TEXT_BLOCK_CATEGORIES,
            include_residual_text_container=ordering.INCLUDE_RESIDUAL_TEXT_CONTAINER,
            starting_point_tolerance=ordering.STARTING_POINT_TOLERANCE,
            broken_line_tolerance=ordering.BROKEN_LINE_TOLERANCE,
            height_tolerance=ordering.HEIGHT_TOLERANCE,
            paragraph_break=ordering.PARAGRAPH_BREAK,
        )

    @staticmethod
    def build_text_order_service(config: AttrDict) -> TextOrderService:
        """Building a text order service

        :param config: configuration object
        :return: TextOrderService instance
        """
        return ServiceFactory._build_text_order_service(config)

    @staticmethod
    def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
        """Building a page parsing service from `config.TEXT_CONTAINER` and the settings in `config.TEXT_ORDERING`

        :param config: configuration object
        :return: PageParsingService instance
        """
        return PageParsingService(
            text_container=config.TEXT_CONTAINER,
            floating_text_block_categories=config.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
            include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
        )

    @staticmethod
    def build_page_parsing_service(config: AttrDict) -> PageParsingService:
        """Building a page parsing service

        :param config: configuration object
        :return: PageParsingService instance
        """
        return ServiceFactory._build_page_parsing_service(config)

    @staticmethod
    def build_analyzer(config: AttrDict) -> DoctectionPipe:
        """
        Builds the analyzer with a given config. The pipeline components are appended in a fixed order and each
        one is only added if the corresponding `USE_*` flag of the config is set: rotation, layout, layout NMS,
        table segmentation (with optional refinement), PDF text mining, OCR, word matching with text ordering and
        finally layout linking. A page parsing service is always built and passed to the pipeline.

        :param config: configuration object
        :return: Analyzer pipeline
        """
        pipe_component_list: list[PipelineComponent] = []

        if config.USE_ROTATOR:
            rotation_detector = ServiceFactory.build_rotation_detector()
            transform_service = ServiceFactory.build_transform_service(transform_predictor=rotation_detector)
            pipe_component_list.append(transform_service)

        if config.USE_LAYOUT:
            layout_detector = ServiceFactory.build_layout_detector(config, mode="LAYOUT")
            layout_service = ServiceFactory.build_layout_service(config, detector=layout_detector, mode="LAYOUT")
            pipe_component_list.append(layout_service)

        # setup layout nms service
        if config.USE_LAYOUT_NMS:
            layout_nms_service = ServiceFactory.build_layout_nms_service(config)
            pipe_component_list.append(layout_nms_service)

        # setup tables service
        if config.USE_TABLE_SEGMENTATION:
            item_detector = ServiceFactory.build_layout_detector(config, mode="ITEM")
            item_service = ServiceFactory.build_sub_image_service(config, detector=item_detector, mode="ITEM")
            pipe_component_list.append(item_service)

            # A separate cell detector is only added when the item detector is not a `HFDetrDerivedDetector`
            if item_detector.__class__.__name__ not in ("HFDetrDerivedDetector",):
                cell_detector = ServiceFactory.build_layout_detector(config, mode="CELL")
                cell_service = ServiceFactory.build_sub_image_service(config, detector=cell_detector, mode="CELL")
                pipe_component_list.append(cell_service)

            table_segmentation_service = ServiceFactory.build_table_segmentation_service(config, detector=item_detector)
            pipe_component_list.append(table_segmentation_service)

            if config.USE_TABLE_REFINEMENT:
                table_refinement_service = ServiceFactory.build_table_refinement_service(config)
                pipe_component_list.append(table_refinement_service)

        if config.USE_PDF_MINER:
            pdf_miner = ServiceFactory.build_pdf_text_detector(config)
            d_text = ServiceFactory.build_pdf_miner_text_service(pdf_miner)
            pipe_component_list.append(d_text)

        # setup ocr
        if config.USE_OCR:
            # the extra mile for DocTr: a word detector service is added before the text extraction
            if config.OCR.USE_DOCTR:
                word_detector = ServiceFactory.build_doctr_word_detector(config)
                word_service = ServiceFactory.build_doctr_word_detector_service(word_detector)
                pipe_component_list.append(word_service)

            ocr_detector = ServiceFactory.build_ocr_detector(config)
            text_extraction_service = ServiceFactory.build_text_extraction_service(config, ocr_detector)
            pipe_component_list.append(text_extraction_service)

        # Matching and ordering are only added if one of the text sources is enabled
        if config.USE_PDF_MINER or config.USE_OCR:
            matching_service = ServiceFactory.build_word_matching_service(config)
            pipe_component_list.append(matching_service)

            text_order_service = ServiceFactory.build_text_order_service(config)
            pipe_component_list.append(text_order_service)

        if config.USE_LAYOUT_LINK:
            layout_link_matching_service = ServiceFactory.build_layout_link_matching_service(config)
            pipe_component_list.append(layout_link_matching_service)

        page_parsing_service = ServiceFactory.build_page_parsing_service(config)

        return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)