deepdoctection 0.33__py3-none-any.whl → 0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (40) hide show
  1. deepdoctection/__init__.py +11 -12
  2. deepdoctection/analyzer/__init__.py +1 -0
  3. deepdoctection/analyzer/_config.py +150 -0
  4. deepdoctection/analyzer/dd.py +42 -358
  5. deepdoctection/analyzer/factory.py +522 -0
  6. deepdoctection/configs/conf_dd_one.yaml +1 -0
  7. deepdoctection/datapoint/annotation.py +41 -3
  8. deepdoctection/datapoint/convert.py +6 -4
  9. deepdoctection/datapoint/image.py +132 -46
  10. deepdoctection/datapoint/view.py +2 -1
  11. deepdoctection/datasets/base.py +1 -1
  12. deepdoctection/datasets/instances/fintabnet.py +1 -1
  13. deepdoctection/datasets/instances/xfund.py +29 -7
  14. deepdoctection/eval/eval.py +7 -1
  15. deepdoctection/extern/model.py +2 -1
  16. deepdoctection/extern/pdftext.py +96 -5
  17. deepdoctection/extern/tessocr.py +1 -0
  18. deepdoctection/mapper/cats.py +11 -13
  19. deepdoctection/mapper/cocostruct.py +6 -2
  20. deepdoctection/mapper/d2struct.py +2 -1
  21. deepdoctection/mapper/laylmstruct.py +1 -1
  22. deepdoctection/mapper/match.py +31 -0
  23. deepdoctection/mapper/misc.py +1 -1
  24. deepdoctection/mapper/prodigystruct.py +1 -1
  25. deepdoctection/pipe/anngen.py +27 -0
  26. deepdoctection/pipe/base.py +23 -0
  27. deepdoctection/pipe/common.py +123 -38
  28. deepdoctection/pipe/segment.py +1 -1
  29. deepdoctection/pipe/sub_layout.py +1 -1
  30. deepdoctection/utils/env_info.py +31 -2
  31. deepdoctection/utils/file_utils.py +19 -0
  32. deepdoctection/utils/fs.py +27 -4
  33. deepdoctection/utils/metacfg.py +12 -0
  34. deepdoctection/utils/pdf_utils.py +114 -6
  35. deepdoctection/utils/settings.py +3 -0
  36. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/METADATA +20 -11
  37. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/RECORD +40 -38
  38. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/WHEEL +1 -1
  39. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/LICENSE +0 -0
  40. {deepdoctection-0.33.dist-info → deepdoctection-0.35.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,522 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: factory.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Factory for building the deepdoctection analyzer pipeline"""
19
+
20
+
21
+ from os import environ
22
+ from typing import Union
23
+
24
+ from lazy_imports import try_import
25
+
26
+ from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
27
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
28
+ from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
29
+ from ..extern.hfdetr import HFDetrDerivedDetector
30
+ from ..extern.model import ModelCatalog, ModelDownloadManager
31
+ from ..extern.pdftext import PdfPlumberTextDetector
32
+ from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
33
+ from ..extern.texocr import TextractOcrDetector
34
+ from ..extern.tpdetect import TPFrcnnDetector
35
+ from ..pipe.base import PipelineComponent
36
+ from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
37
+ from ..pipe.doctectionpipe import DoctectionPipe
38
+ from ..pipe.layout import ImageLayoutService
39
+ from ..pipe.order import TextOrderService
40
+ from ..pipe.refine import TableSegmentationRefinementService
41
+ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
42
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
43
+ from ..pipe.text import TextExtractionService
44
+ from ..pipe.transform import SimpleTransformService
45
+ from ..utils.file_utils import detectron2_available
46
+ from ..utils.fs import get_configs_dir_path
47
+ from ..utils.metacfg import AttrDict
48
+ from ..utils.settings import LayoutType, Relationships
49
+ from ..utils.transform import PadTransform
50
+
51
+ with try_import() as image_guard:
52
+ from botocore.config import Config # type: ignore
53
+
54
+
55
+ __all__ = [
56
+ "ServiceFactory",
57
+ ]
58
+
59
+ # from ._config import cfg
60
+
61
+
62
class ServiceFactory:
    """
    Factory class for building various components of the deepdoctection analyzer pipeline.

    All builders are static methods and the class holds no state. Builders that depend on settings receive the
    configuration explicitly through their `config` argument (an `AttrDict`); nothing is read from a module level
    configuration object inside this class.

    The class provides static methods to build different services and detectors required for the pipeline, such as
    layout detectors, OCR detectors, table segmentation services, and more. The methods disentangle the creation
    of predictors (e.g., `ObjectDetector`, `TextRecognizer`) from the configuration, allowing for flexible and
    modular construction of the pipeline components.

    Extending the Class:
        This class can be extended by using inheritance and adding new methods or overriding existing ones.
        New methods that need additional settings should read them from the `config` object that is passed in,
        so the configuration has to be extended with the corresponding attributes.
    """

    @staticmethod
    def build_layout_detector(
        config: AttrDict,
        mode: str,
    ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
        """Building a D2-Detector, a TP-Detector, a Detr-Detector or a D2-Torch Tracing Detector according to
        the config

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: detector instance selected by the `model_wrapper` of the profile registered for the weights
        :raises TypeError: if the profile's `model_wrapper` is none of the supported detector wrappers
        """
        # For PT the traced (torchscript) weights are used when detectron2 is not available
        weights = (
            getattr(config.TF, mode).WEIGHTS
            if config.LIB == "TF"
            else (getattr(config.PT, mode).WEIGHTS if detectron2_available() else getattr(config.PT, mode).WEIGHTS_TS)
        )
        filter_categories = (
            getattr(getattr(config.TF, mode), "FILTER")
            if config.LIB == "TF"
            else getattr(getattr(config.PT, mode), "FILTER")
        )
        config_path = ModelCatalog.get_full_path_configs(weights)
        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
        profile = ModelCatalog.get_profile(weights)
        categories = profile.categories if profile.categories is not None else {}

        if profile.model_wrapper in ("TPFrcnnDetector",):
            return TPFrcnnDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("D2FrcnnDetector",):
            return D2FrcnnDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                device=config.DEVICE,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("D2FrcnnTracingDetector",):
            return D2FrcnnTracingDetector(
                path_yaml=config_path,
                path_weights=weights_path,
                categories=categories,
                filter_categories=filter_categories,
            )
        if profile.model_wrapper in ("HFDetrDerivedDetector",):
            preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
            return HFDetrDerivedDetector(
                path_config_json=config_path,
                path_weights=weights_path,
                path_feature_extractor_config_json=preprocessor_config,
                categories=categories,
                device=config.DEVICE,
                filter_categories=filter_categories,
            )
        raise TypeError(
            f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
            f"compatability with your deep learning framework"
        )

    @staticmethod
    def build_rotation_detector() -> TesseractRotationTransformer:
        """Building a rotation detector

        :return: `TesseractRotationTransformer` instance
        """
        return TesseractRotationTransformer()

    @staticmethod
    def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
        """Building a transform service with a given predictor

        :param transform_predictor: will be passed to the `SimpleTransformService`
        :return: `SimpleTransformService` instance
        """
        return SimpleTransformService(transform_predictor)

    @staticmethod
    def build_padder(config: AttrDict, mode: str) -> PadTransform:
        """Building a padder according to the config

        The padding values are always read from the `PT` section of the config.

        :param config: configuration object
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `PadTransform` instance
        """
        pad_config = getattr(config.PT, mode).PAD
        return PadTransform(
            top=pad_config.TOP, right=pad_config.RIGHT, bottom=pad_config.BOTTOM, left=pad_config.LEFT
        )

    @staticmethod
    def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
        """Building a layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `ImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `ImageLayoutService` instance
        """
        padder = None
        # A padder is only configured for detectors whose class is named `HFDetrDerivedDetector`
        if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            padder = ServiceFactory.build_padder(config, mode=mode)
        return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)

    @staticmethod
    def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
        """Building a NMS service for layout annotations

        :param config: configuration object
        :return: `AnnotationNmsService` instance
        :raises ModuleNotFoundError: if `config.LIB` is `PT` and detectron2 is not available
        :raises ValueError: if `config.LAYOUT_NMS_PAIRS.COMBINATIONS` is not a list of lists
        """
        if not detectron2_available() and config.LIB == "PT":
            raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
        combinations = config.LAYOUT_NMS_PAIRS.COMBINATIONS
        # Both conditions must hold: the outer value is a list AND every element is a list. Checking the outer
        # type first also avoids indexing into a value that is not a sequence (e.g. `None`).
        if not isinstance(combinations, list) or not all(isinstance(pair, list) for pair in combinations):
            raise ValueError("LAYOUT_NMS_PAIRS must be a list of lists")
        return AnnotationNmsService(
            nms_pairs=combinations,
            thresholds=config.LAYOUT_NMS_PAIRS.THRESHOLDS,
            priority=config.LAYOUT_NMS_PAIRS.PRIORITY,
        )

    @staticmethod
    def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
        """
        Building a sub image layout service with a given detector

        :param config: configuration object
        :param detector: will be passed to the `SubImageLayoutService`
        :param mode: either `LAYOUT`,`CELL` or `ITEM`
        :return: `SubImageLayoutService` instance
        """
        exclude_category_ids = []
        padder = None
        # Only the item detector of type `HFDetrDerivedDetector` gets excluded category ids and a padder
        if mode == "ITEM" and detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            exclude_category_ids.extend([1, 3, 4, 5, 6])
            padder = ServiceFactory.build_padder(config, mode)
        detect_result_generator = DetectResultGenerator(
            categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
        )
        return SubImageLayoutService(
            sub_image_detector=detector,
            sub_image_names=[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
            category_id_mapping=None,
            detect_result_generator=detect_result_generator,
            padder=padder,
        )

    @staticmethod
    def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
        """
        Building OCR predictor

        The first enabled option wins, in the order `USE_TESSERACT`, `USE_DOCTR`, `USE_TEXTRACT`.

        :param config: configuration object
        :return: OCR predictor instance
        :raises ValueError: if the selected DocTr model profile has no architecture or if none of the three
                            OCR options is enabled
        """
        if config.OCR.USE_TESSERACT:
            ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
            return TesseractOcrDetector(
                ocr_config_path,
                config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
            )
        if config.OCR.USE_DOCTR:
            weights = (
                config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
                if config.LIB == "TF"
                else config.OCR.WEIGHTS.DOCTR_RECOGNITION.PT
            )
            weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
            profile = ModelCatalog.get_profile(weights)
            # get_full_path_configs will complete the path even if the model is not registered
            config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
            if profile.architecture is None:
                raise ValueError("model profile.architecture must be specified")
            return DoctrTextRecognizer(
                architecture=profile.architecture,
                path_weights=weights_path,
                device=config.DEVICE,
                lib=config.LIB,
                path_config_json=config_path,
            )
        if config.OCR.USE_TEXTRACT:
            # Credentials and region are taken from the environment variables ACCESS_KEY, SECRET_KEY and REGION
            credentials_kwargs = {
                "aws_access_key_id": environ.get("ACCESS_KEY", None),
                "aws_secret_access_key": environ.get("SECRET_KEY", None),
                "config": Config(region_name=environ.get("REGION", None)),
            }
            return TextractOcrDetector(**credentials_kwargs)
        # Reached only when all three flags above are falsy
        raise ValueError(
            "You have set USE_OCR=True but none of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to True"
        )

    @staticmethod
    def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
        """Building `DoctrTextlineDetector` instance

        :param config: configuration object
        :return: DoctrTextlineDetector
        :raises ValueError: if the model profile has no architecture or no categories
        """
        weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
        weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
        profile = ModelCatalog.get_profile(weights)
        if profile.architecture is None:
            raise ValueError("model profile.architecture must be specified")
        if profile.categories is None:
            raise ValueError("model profile.categories must be specified")
        return DoctrTextlineDetector(
            profile.architecture, weights_path, profile.categories, config.DEVICE, lib=config.LIB
        )

    @staticmethod
    def build_table_segmentation_service(
        config: AttrDict,
        detector: ObjectDetector,
    ) -> Union[PubtablesSegmentationService, TableSegmentationService]:
        """
        Build and return a table segmentation service based on the provided detector.

        Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
        `TableSegmentationService` instance. The selection is made as follows:

        - If the class of the detector is named `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created
          and returned. It is parametrized with the `SEGMENTATION` section of `config`, including the
          `PUBTABLES_*` names and `CELL_CATEGORY_ID`.
        - For other detector types, a `TableSegmentationService` is created and returned. It is parametrized with
          the `SEGMENTATION` section of `config` as well, using `CELL_NAMES`, `ITEM_NAMES` and `SUB_ITEM_NAMES`.

        :param config: configuration object
        :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
                         service to build.
        :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
                 detector type.
        """
        segmentation = config.SEGMENTATION
        table_segmentation: Union[PubtablesSegmentationService, TableSegmentationService]
        if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
            table_segmentation = PubtablesSegmentationService(
                segment_rule=segmentation.ASSIGNMENT_RULE,
                threshold_rows=segmentation.THRESHOLD_ROWS,
                threshold_cols=segmentation.THRESHOLD_COLS,
                tile_table_with_items=segmentation.FULL_TABLE_TILING,
                remove_iou_threshold_rows=segmentation.REMOVE_IOU_THRESHOLD_ROWS,
                remove_iou_threshold_cols=segmentation.REMOVE_IOU_THRESHOLD_COLS,
                cell_class_id=segmentation.CELL_CATEGORY_ID,
                table_name=segmentation.TABLE_NAME,
                cell_names=segmentation.PUBTABLES_CELL_NAMES,
                spanning_cell_names=segmentation.PUBTABLES_SPANNING_CELL_NAMES,
                item_names=segmentation.PUBTABLES_ITEM_NAMES,
                sub_item_names=segmentation.PUBTABLES_SUB_ITEM_NAMES,
                stretch_rule=segmentation.STRETCH_RULE,
            )
        else:
            table_segmentation = TableSegmentationService(
                segment_rule=segmentation.ASSIGNMENT_RULE,
                threshold_rows=segmentation.THRESHOLD_ROWS,
                threshold_cols=segmentation.THRESHOLD_COLS,
                tile_table_with_items=segmentation.FULL_TABLE_TILING,
                remove_iou_threshold_rows=segmentation.REMOVE_IOU_THRESHOLD_ROWS,
                remove_iou_threshold_cols=segmentation.REMOVE_IOU_THRESHOLD_COLS,
                table_name=segmentation.TABLE_NAME,
                cell_names=segmentation.CELL_NAMES,
                item_names=segmentation.ITEM_NAMES,
                sub_item_names=segmentation.SUB_ITEM_NAMES,
                stretch_rule=segmentation.STRETCH_RULE,
            )
        return table_segmentation

    @staticmethod
    def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
        """Building a table segmentation refinement service

        :param config: configuration object
        :return: TableSegmentationRefinementService
        """
        return TableSegmentationRefinementService(
            [config.SEGMENTATION.TABLE_NAME],
            config.SEGMENTATION.PUBTABLES_CELL_NAMES,
        )

    @staticmethod
    def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
        """Building a PDF text detector

        :param config: configuration object
        :return: PdfPlumberTextDetector
        """
        return PdfPlumberTextDetector(
            x_tolerance=config.PDF_MINER.X_TOLERANCE, y_tolerance=config.PDF_MINER.Y_TOLERANCE
        )

    @staticmethod
    def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
        """Building a PDFMiner text extraction service

        :param detector: PdfMiner
        :return: TextExtractionService
        """
        return TextExtractionService(detector)

    @staticmethod
    def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
        """Building a Doctr word detector service

        :param detector: DoctrTextlineDetector
        :return: ImageLayoutService
        """
        return ImageLayoutService(
            layout_detector=detector, to_image=True, crop_image=True, skip_if_layout_extracted=True
        )

    @staticmethod
    def build_text_extraction_service(
        config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
    ) -> TextExtractionService:
        """Building a text extraction service

        :param config: configuration object
        :param detector: OCR detector
        :return: TextExtractionService
        """
        return TextExtractionService(
            detector,
            skip_if_text_extracted=config.USE_PDF_MINER,
            # a region of interest is only passed when DocTr is used
            extract_from_roi=config.TEXT_CONTAINER if config.OCR.USE_DOCTR else None,
        )

    @staticmethod
    def build_word_matching_service(config: AttrDict) -> MatchingService:
        """Building a word matching service

        :param config: configuration object
        :return: MatchingService
        """
        matcher = IntersectionMatcher(
            matching_rule=config.WORD_MATCHING.RULE,
            threshold=config.WORD_MATCHING.THRESHOLD,
            max_parent_only=config.WORD_MATCHING.MAX_PARENT_ONLY,
        )
        return MatchingService(
            parent_categories=config.WORD_MATCHING.PARENTAL_CATEGORIES,
            child_categories=config.TEXT_CONTAINER,
            matcher=matcher,
            relationship_key=Relationships.CHILD,
        )

    @staticmethod
    def build_text_order_service(config: AttrDict) -> TextOrderService:
        """Building a text order service

        :param config: configuration object
        :return: TextOrderService instance
        """
        return TextOrderService(
            text_container=config.TEXT_CONTAINER,
            text_block_categories=config.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
            floating_text_block_categories=config.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
            include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
            starting_point_tolerance=config.TEXT_ORDERING.STARTING_POINT_TOLERANCE,
            broken_line_tolerance=config.TEXT_ORDERING.BROKEN_LINE_TOLERANCE,
            height_tolerance=config.TEXT_ORDERING.HEIGHT_TOLERANCE,
            paragraph_break=config.TEXT_ORDERING.PARAGRAPH_BREAK,
        )

    @staticmethod
    def build_page_parsing_service(config: AttrDict) -> PageParsingService:
        """Building a page parsing service

        :param config: configuration object
        :return: PageParsingService instance
        """
        return PageParsingService(
            text_container=config.TEXT_CONTAINER,
            floating_text_block_categories=config.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
            include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
        )

    @staticmethod
    def build_analyzer(config: AttrDict) -> DoctectionPipe:
        """
        Builds the analyzer with a given config

        Components are appended in this order, each one only if the corresponding `USE_*` flag is set:
        rotation, layout, layout NMS, table segmentation (with optional refinement), PDF text mining, OCR,
        word matching and text ordering. The page parsing service is always built.

        :param config: configuration object
        :return: Analyzer pipeline
        """
        pipe_component_list: list[PipelineComponent] = []

        if config.USE_ROTATOR:
            rotation_detector = ServiceFactory.build_rotation_detector()
            transform_service = ServiceFactory.build_transform_service(transform_predictor=rotation_detector)
            pipe_component_list.append(transform_service)

        if config.USE_LAYOUT:
            layout_detector = ServiceFactory.build_layout_detector(config, mode="LAYOUT")
            layout_service = ServiceFactory.build_layout_service(config, detector=layout_detector, mode="LAYOUT")
            pipe_component_list.append(layout_service)

        # setup layout nms service
        if config.USE_LAYOUT_NMS:
            layout_nms_service = ServiceFactory.build_layout_nms_service(config)
            pipe_component_list.append(layout_nms_service)

        # setup tables service
        if config.USE_TABLE_SEGMENTATION:
            item_detector = ServiceFactory.build_layout_detector(config, mode="ITEM")
            item_service = ServiceFactory.build_sub_image_service(config, detector=item_detector, mode="ITEM")
            pipe_component_list.append(item_service)

            # a separate cell detector is only added when the item detector is not a `HFDetrDerivedDetector`
            if item_detector.__class__.__name__ not in ("HFDetrDerivedDetector",):
                cell_detector = ServiceFactory.build_layout_detector(config, mode="CELL")
                cell_service = ServiceFactory.build_sub_image_service(config, detector=cell_detector, mode="CELL")
                pipe_component_list.append(cell_service)

            table_segmentation_service = ServiceFactory.build_table_segmentation_service(
                config, detector=item_detector
            )
            pipe_component_list.append(table_segmentation_service)

            if config.USE_TABLE_REFINEMENT:
                table_refinement_service = ServiceFactory.build_table_refinement_service(config)
                pipe_component_list.append(table_refinement_service)

        if config.USE_PDF_MINER:
            pdf_miner = ServiceFactory.build_pdf_text_detector(config)
            d_text = ServiceFactory.build_pdf_miner_text_service(pdf_miner)
            pipe_component_list.append(d_text)

        # setup ocr
        if config.USE_OCR:
            # the extra mile for DocTr
            if config.OCR.USE_DOCTR:
                word_detector = ServiceFactory.build_doctr_word_detector(config)
                word_service = ServiceFactory.build_doctr_word_detector_service(word_detector)
                pipe_component_list.append(word_service)

            ocr_detector = ServiceFactory.build_ocr_detector(config)
            text_extraction_service = ServiceFactory.build_text_extraction_service(config, ocr_detector)
            pipe_component_list.append(text_extraction_service)

        if config.USE_PDF_MINER or config.USE_OCR:
            matching_service = ServiceFactory.build_word_matching_service(config)
            pipe_component_list.append(matching_service)

            text_order_service = ServiceFactory.build_text_order_service(config)
            pipe_component_list.append(text_order_service)

        page_parsing_service = ServiceFactory.build_page_parsing_service(config)

        return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
@@ -1,3 +1,4 @@
1
+ USE_ROTATOR: False
1
2
  USE_LAYOUT: True
2
3
  USE_TABLE_SEGMENTATION: True
3
4
  TF:
@@ -21,6 +21,7 @@ Dataclass for annotations and their derived classes.
21
21
  from __future__ import annotations
22
22
 
23
23
  from abc import ABC, abstractmethod
24
+ from collections import defaultdict
24
25
  from dataclasses import dataclass, field
25
26
  from typing import Optional, Union, no_type_check
26
27
 
@@ -66,6 +67,16 @@ def ann_from_dict(cls, **kwargs: AnnotationDict):
66
67
  return ann
67
68
 
68
69
 
70
@dataclass(frozen=True)
class AnnotationMap:
    """
    Immutable record describing how an annotation is attached to an image annotation.

    Besides the id of the image annotation it carries at most the key under which the annotation is stored:
    a sub category key, a relationship key or a summary key. Keys that do not apply stay `None`.
    """

    # id of the image annotation the entry belongs to
    image_annotation_id: str
    # key of the sub category, if the entry refers to a sub category
    sub_category_key: Optional[ObjectTypes] = field(default=None)
    # key of the relationship, if the entry refers to a relationship
    relationship_key: Optional[ObjectTypes] = field(default=None)
    # key of the summary, if the entry refers to an image summary
    summary_key: Optional[ObjectTypes] = field(default=None)
78
+
79
+
69
80
  @dataclass
70
81
  class Annotation(ABC):
71
82
  """
@@ -397,7 +408,8 @@ class CategoryAnnotation(Annotation):
397
408
  except ValueError:
398
409
  logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
399
410
  else:
400
- self.relationships[key].clear()
411
+ if key in self.relationships:
412
+ self.relationships[key].clear()
401
413
 
402
414
  def get_defining_attributes(self) -> list[str]:
403
415
  return ["category_name", "category_id"]
@@ -409,7 +421,7 @@ class CategoryAnnotation(Annotation):
409
421
 
410
422
  :return: list of attributes.
411
423
  """
412
- return []
424
+ return ["_category_name"]
413
425
 
414
426
  @classmethod
415
427
  def from_dict(cls, **kwargs: AnnotationDict) -> CategoryAnnotation:
@@ -470,6 +482,32 @@ class ImageAnnotation(CategoryAnnotation):
470
482
  return self.image.summary.get_sub_category(key)
471
483
  raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
472
484
 
485
def get_annotation_map(self) -> defaultdict[str, list[AnnotationMap]]:
    """
    Build a lookup from annotation ids to `AnnotationMap` entries describing how each id is attached to this
    image annotation.

    The result contains an entry for this annotation itself, one for every sub category, one for every summary
    sub category (only if `self.image` is set) and one for every annotation id stored in a relationship.

    :return: defaultdict with annotation ids as keys and a list of AnnotationMap instances as values.
    """
    own_id = self.annotation_id
    id_to_maps: defaultdict[str, list[AnnotationMap]] = defaultdict(list)

    # the annotation itself, without any key
    id_to_maps[own_id].append(AnnotationMap(image_annotation_id=own_id))

    # sub categories
    for key in self.sub_categories:
        sub_category_id = self.get_sub_category(key).annotation_id
        id_to_maps[sub_category_id].append(AnnotationMap(image_annotation_id=own_id, sub_category_key=key))

    # summaries exist only when an image is attached
    summary_keys = self.image.summary.sub_categories if self.image is not None else ()
    for key in summary_keys:
        summary_id = self.get_summary(key).annotation_id
        id_to_maps[summary_id].append(AnnotationMap(image_annotation_id=own_id, summary_key=key))

    # relationships already store annotation ids
    for key in self.relationships:
        for related_id in self.get_relationship(key):
            id_to_maps[related_id].append(AnnotationMap(image_annotation_id=own_id, relationship_key=key))

    return id_to_maps
510
+
473
511
 
474
512
  @dataclass
475
513
  class ContainerAnnotation(CategoryAnnotation):
@@ -489,5 +527,5 @@ class ContainerAnnotation(CategoryAnnotation):
489
527
  def from_dict(cls, **kwargs: AnnotationDict) -> ContainerAnnotation:
490
528
  container_ann = ann_from_dict(cls, **kwargs)
491
529
  value = kwargs.get("value", "")
492
- container_ann.value = value if isinstance(value, str) else list(value)
530
+ container_ann.value = value if isinstance(value, (int, float, str)) else list(value)
493
531
  return container_ann
@@ -143,11 +143,13 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
143
143
  return np_array.astype(uint8)
144
144
 
145
145
 
146
- def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
146
+ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200) -> PixelValues:
147
147
  """
148
- Converts a pdf passed as bytes into a numpy array. Note, that this method expects poppler to be installed. This
149
- function, however does not rely on the wrapper pdf2image but uses a function of this lib which calls poppler
150
- directly.
148
+ Converts a pdf passed as bytes into a numpy array. We use poppler or pdfium to convert the pdf to an image.
149
+ If both are available you can steer the selection of the render engine with environment variables:
150
+
151
+ USE_DD_POPPLER: Set to 1, "TRUE", "True" to use poppler
152
+ USE_DD_PDFIUM: Set to 1, "TRUE", "True" to use pdfium
151
153
 
152
154
  :param pdf_bytes: A pdf as bytes object. A byte representation can from a pdf file can be generated e.g. with
153
155
  `utils.fs.load_bytes_from_pdf_file`