deepdoctection 0.34__py3-none-any.whl → 0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

@@ -0,0 +1,522 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: factory.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Factory for building the deepdoctection analyzer pipeline"""
19
+
20
+
21
+ from os import environ
22
+ from typing import Union
23
+
24
+ from lazy_imports import try_import
25
+
26
+ from ..extern.base import ImageTransformer, ObjectDetector, PdfMiner
27
+ from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
28
+ from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
29
+ from ..extern.hfdetr import HFDetrDerivedDetector
30
+ from ..extern.model import ModelCatalog, ModelDownloadManager
31
+ from ..extern.pdftext import PdfPlumberTextDetector
32
+ from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
33
+ from ..extern.texocr import TextractOcrDetector
34
+ from ..extern.tpdetect import TPFrcnnDetector
35
+ from ..pipe.base import PipelineComponent
36
+ from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
37
+ from ..pipe.doctectionpipe import DoctectionPipe
38
+ from ..pipe.layout import ImageLayoutService
39
+ from ..pipe.order import TextOrderService
40
+ from ..pipe.refine import TableSegmentationRefinementService
41
+ from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
42
+ from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
43
+ from ..pipe.text import TextExtractionService
44
+ from ..pipe.transform import SimpleTransformService
45
+ from ..utils.file_utils import detectron2_available
46
+ from ..utils.fs import get_configs_dir_path
47
+ from ..utils.metacfg import AttrDict
48
+ from ..utils.settings import LayoutType, Relationships
49
+ from ..utils.transform import PadTransform
50
+
51
+ with try_import() as image_guard:
52
+ from botocore.config import Config # type: ignore
53
+
54
+
55
+ __all__ = [
56
+ "ServiceFactory",
57
+ ]
58
+
59
+ # from ._config import cfg
60
+
61
+
62
+ class ServiceFactory:
63
+ """
64
+ Factory class for building various components of the deepdoctection analyzer pipeline.
65
+
66
+ This class uses a configuration object such as `cfg` from `_config.py`, which is an instance of the `AttrDict` class.
67
+ The configuration is not stored in an `__init__` method but is passed explicitly as the `config` argument of the methods that need it.
68
+
69
+ The class provides static methods to build different services and detectors required for the pipeline, such as
70
+ layout detectors, OCR detectors, table segmentation services, and more. The methods disentangle the creation
71
+ of predictors (e.g., `ObjectDetector`, `TextRecognizer`) from the configuration, allowing for flexible and
72
+ modular construction of the pipeline components.
73
+
74
+ Extending the Class:
75
+ This class can be extended by using inheritance and adding new methods or overriding existing ones.
76
+ To extend the configuration attributes, you can modify the `cfg` object in `_config.py` to include new
77
+ settings or parameters required for the new methods.
78
+ """
79
+
80
+ @staticmethod
81
+ def build_layout_detector(
82
+ config: AttrDict,
83
+ mode: str,
84
+ ) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
85
+ """Building a D2-Detector, a TP-Detector, a Detr-Detector or a D2-Torch Tracing Detector according to
86
+ the config
87
+
88
+ :param config: configuration object
89
+ :param mode: either `LAYOUT`,`CELL` or `ITEM`
90
+ """
91
+ weights = (
92
+ getattr(config.TF, mode).WEIGHTS
93
+ if config.LIB == "TF"
94
+ else (getattr(config.PT, mode).WEIGHTS if detectron2_available() else getattr(config.PT, mode).WEIGHTS_TS)
95
+ )
96
+ filter_categories = (
97
+ getattr(getattr(config.TF, mode), "FILTER")
98
+ if config.LIB == "TF"
99
+ else getattr(getattr(config.PT, mode), "FILTER")
100
+ )
101
+ config_path = ModelCatalog.get_full_path_configs(weights)
102
+ weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
103
+ profile = ModelCatalog.get_profile(weights)
104
+ categories = profile.categories if profile.categories is not None else {}
105
+
106
+ if profile.model_wrapper in ("TPFrcnnDetector",):
107
+ return TPFrcnnDetector(
108
+ path_yaml=config_path,
109
+ path_weights=weights_path,
110
+ categories=categories,
111
+ filter_categories=filter_categories,
112
+ )
113
+ if profile.model_wrapper in ("D2FrcnnDetector",):
114
+ return D2FrcnnDetector(
115
+ path_yaml=config_path,
116
+ path_weights=weights_path,
117
+ categories=categories,
118
+ device=config.DEVICE,
119
+ filter_categories=filter_categories,
120
+ )
121
+ if profile.model_wrapper in ("D2FrcnnTracingDetector",):
122
+ return D2FrcnnTracingDetector(
123
+ path_yaml=config_path,
124
+ path_weights=weights_path,
125
+ categories=categories,
126
+ filter_categories=filter_categories,
127
+ )
128
+ if profile.model_wrapper in ("HFDetrDerivedDetector",):
129
+ preprocessor_config = ModelCatalog.get_full_path_preprocessor_configs(weights)
130
+ return HFDetrDerivedDetector(
131
+ path_config_json=config_path,
132
+ path_weights=weights_path,
133
+ path_feature_extractor_config_json=preprocessor_config,
134
+ categories=categories,
135
+ device=config.DEVICE,
136
+ filter_categories=filter_categories,
137
+ )
138
+ raise TypeError(
139
+ f"You have chosen profile.model_wrapper: {profile.model_wrapper} which is not allowed. Please check "
140
+ f"compatability with your deep learning framework"
141
+ )
142
+
143
+ @staticmethod
144
+ def build_rotation_detector() -> TesseractRotationTransformer:
145
+ """Building a rotation detector"""
146
+ return TesseractRotationTransformer()
147
+
148
+ @staticmethod
149
+ def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
150
+ """Building a transform service with a given predictor"""
151
+ return SimpleTransformService(transform_predictor)
152
+
153
+ @staticmethod
154
+ def build_padder(config: AttrDict, mode: str) -> PadTransform:
155
+ """Building a padder according to the config
156
+
157
+ :param config: configuration object
158
+ :param mode: either `LAYOUT`,`CELL` or `ITEM`
159
+ :return: `PadTransform` instance
160
+ """
161
+ top, right, bottom, left = (
162
+ getattr(config.PT, mode).PAD.TOP,
163
+ getattr(config.PT, mode).PAD.RIGHT,
164
+ getattr(config.PT, mode).PAD.BOTTOM,
165
+ getattr(config.PT, mode).PAD.LEFT,
166
+ )
167
+ return PadTransform(top=top, right=right, bottom=bottom, left=left)
168
+
169
+ @staticmethod
170
+ def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
171
+ """Building a layout service with a given detector
172
+
173
+ :param config: configuration object
174
+ :param detector: will be passed to the `ImageLayoutService`
175
+ :param mode: either `LAYOUT`,`CELL` or `ITEM`
176
+ :return: `ImageLayoutService` instance
177
+ """
178
+ padder = None
179
+ if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
180
+ padder = ServiceFactory.build_padder(config, mode=mode)
181
+ return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)
182
+
183
+ @staticmethod
184
+ def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
185
+ """Building a NMS service for layout annotations
186
+
187
+ :param config: configuration object
188
+ """
189
+ if not detectron2_available() and config.LIB == "PT":
190
+ raise ModuleNotFoundError("LAYOUT_NMS_PAIRS is only available for detectron2")
191
+ if not isinstance(config.LAYOUT_NMS_PAIRS.COMBINATIONS, list) and not isinstance(
192
+ config.LAYOUT_NMS_PAIRS.COMBINATIONS[0], list
193
+ ):
194
+ raise ValueError("LAYOUT_NMS_PAIRS must be a list of lists")
195
+ return AnnotationNmsService(
196
+ nms_pairs=config.LAYOUT_NMS_PAIRS.COMBINATIONS,
197
+ thresholds=config.LAYOUT_NMS_PAIRS.THRESHOLDS,
198
+ priority=config.LAYOUT_NMS_PAIRS.PRIORITY,
199
+ )
200
+
201
+ @staticmethod
202
+ def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
203
+ """
204
+ Building a sub image layout service with a given detector
205
+
206
+ :param config: configuration object
207
+ :param detector: will be passed to the `SubImageLayoutService`
208
+ :param mode: either `LAYOUT`,`CELL` or `ITEM`
209
+ :return: `SubImageLayoutService` instance
210
+ """
211
+ exclude_category_ids = []
212
+ padder = None
213
+ if mode == "ITEM":
214
+ if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
215
+ exclude_category_ids.extend([1, 3, 4, 5, 6])
216
+ padder = ServiceFactory.build_padder(config, mode)
217
+ detect_result_generator = DetectResultGenerator(
218
+ categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
219
+ )
220
+ return SubImageLayoutService(
221
+ sub_image_detector=detector,
222
+ sub_image_names=[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
223
+ category_id_mapping=None,
224
+ detect_result_generator=detect_result_generator,
225
+ padder=padder,
226
+ )
227
+
228
+ @staticmethod
229
+ def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
230
+ """
231
+ Building OCR predictor
232
+
233
+ :param config: configuration object
234
+ """
235
+ if config.OCR.USE_TESSERACT:
236
+ ocr_config_path = get_configs_dir_path() / config.OCR.CONFIG.TESSERACT
237
+ return TesseractOcrDetector(
238
+ ocr_config_path,
239
+ config_overwrite=[f"LANGUAGES={config.LANGUAGE}"] if config.LANGUAGE is not None else None,
240
+ )
241
+ if config.OCR.USE_DOCTR:
242
+ weights = (
243
+ config.OCR.WEIGHTS.DOCTR_RECOGNITION.TF
244
+ if config.LIB == "TF"
245
+ else (config.OCR.WEIGHTS.DOCTR_RECOGNITION.PT)
246
+ )
247
+ weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
248
+ profile = ModelCatalog.get_profile(weights)
249
+ # get_full_path_configs will complete the path even if the model is not registered
250
+ config_path = ModelCatalog.get_full_path_configs(weights) if profile.config is not None else None
251
+ if profile.architecture is None:
252
+ raise ValueError("model profile.architecture must be specified")
253
+ return DoctrTextRecognizer(
254
+ architecture=profile.architecture,
255
+ path_weights=weights_path,
256
+ device=config.DEVICE,
257
+ lib=config.LIB,
258
+ path_config_json=config_path,
259
+ )
260
+ if config.OCR.USE_TEXTRACT:
261
+ credentials_kwargs = {
262
+ "aws_access_key_id": environ.get("ACCESS_KEY", None),
263
+ "aws_secret_access_key": environ.get("SECRET_KEY", None),
264
+ "config": Config(region_name=environ.get("REGION", None)),
265
+ }
266
+ return TextractOcrDetector(**credentials_kwargs)
267
+ raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
268
+
269
+ @staticmethod
270
+ def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
271
+ """Building `DoctrTextlineDetector` instance
272
+
273
+ :param config: configuration object
274
+ :return: DoctrTextlineDetector
275
+ """
276
+ weights = config.OCR.WEIGHTS.DOCTR_WORD.TF if config.LIB == "TF" else config.OCR.WEIGHTS.DOCTR_WORD.PT
277
+ weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
278
+ profile = ModelCatalog.get_profile(weights)
279
+ if profile.architecture is None:
280
+ raise ValueError("model profile.architecture must be specified")
281
+ if profile.categories is None:
282
+ raise ValueError("model profile.categories must be specified")
283
+ return DoctrTextlineDetector(
284
+ profile.architecture, weights_path, profile.categories, config.DEVICE, lib=config.LIB
285
+ )
286
+
287
+ @staticmethod
288
+ def build_table_segmentation_service(
289
+ config: AttrDict,
290
+ detector: ObjectDetector,
291
+ ) -> Union[PubtablesSegmentationService, TableSegmentationService]:
292
+ """
293
+ Build and return a table segmentation service based on the provided detector.
294
+
295
+ Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
296
+ `TableSegmentationService` instance. The selection is made as follows:
297
+
298
+ - If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
299
+ returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
300
+ thresholds, and cell names defined in the `cfg` object.
301
+ - For other detector types, a `TableSegmentationService` is created and returned. This service also uses
302
+ configuration parameters from the `cfg` object but is tailored for different segmentation needs.
303
+
304
+ :param config: configuration object
305
+ :param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
306
+ service to build.
307
+ :return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
308
+ detector type.
309
+ """
310
+ table_segmentation: Union[PubtablesSegmentationService, TableSegmentationService]
311
+ if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
312
+ table_segmentation = PubtablesSegmentationService(
313
+ segment_rule=config.SEGMENTATION.ASSIGNMENT_RULE,
314
+ threshold_rows=config.SEGMENTATION.THRESHOLD_ROWS,
315
+ threshold_cols=config.SEGMENTATION.THRESHOLD_COLS,
316
+ tile_table_with_items=config.SEGMENTATION.FULL_TABLE_TILING,
317
+ remove_iou_threshold_rows=config.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
318
+ remove_iou_threshold_cols=config.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
319
+ cell_class_id=config.SEGMENTATION.CELL_CATEGORY_ID,
320
+ table_name=config.SEGMENTATION.TABLE_NAME,
321
+ cell_names=config.SEGMENTATION.PUBTABLES_CELL_NAMES,
322
+ spanning_cell_names=config.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES,
323
+ item_names=config.SEGMENTATION.PUBTABLES_ITEM_NAMES,
324
+ sub_item_names=config.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES,
325
+ stretch_rule=config.SEGMENTATION.STRETCH_RULE,
326
+ )
327
+
328
+ else:
329
+ table_segmentation = TableSegmentationService(
330
+ segment_rule=config.SEGMENTATION.ASSIGNMENT_RULE,
331
+ threshold_rows=config.SEGMENTATION.THRESHOLD_ROWS,
332
+ threshold_cols=config.SEGMENTATION.THRESHOLD_COLS,
333
+ tile_table_with_items=config.SEGMENTATION.FULL_TABLE_TILING,
334
+ remove_iou_threshold_rows=config.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
335
+ remove_iou_threshold_cols=config.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
336
+ table_name=config.SEGMENTATION.TABLE_NAME,
337
+ cell_names=config.SEGMENTATION.CELL_NAMES,
338
+ item_names=config.SEGMENTATION.ITEM_NAMES,
339
+ sub_item_names=config.SEGMENTATION.SUB_ITEM_NAMES,
340
+ stretch_rule=config.SEGMENTATION.STRETCH_RULE,
341
+ )
342
+ return table_segmentation
343
+
344
+ @staticmethod
345
+ def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
346
+ """Building a table segmentation refinement service
347
+
348
+ :param config: configuration object
349
+ :return: TableSegmentationRefinementService
350
+ """
351
+ return TableSegmentationRefinementService(
352
+ [config.SEGMENTATION.TABLE_NAME],
353
+ config.SEGMENTATION.PUBTABLES_CELL_NAMES,
354
+ )
355
+
356
+ @staticmethod
357
+ def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
358
+ """Building a PDF text detector
359
+
360
+ :param config: configuration object
361
+ :return: PdfPlumberTextDetector
362
+ """
363
+ return PdfPlumberTextDetector(
364
+ x_tolerance=config.PDF_MINER.X_TOLERANCE, y_tolerance=config.PDF_MINER.Y_TOLERANCE
365
+ )
366
+
367
+ @staticmethod
368
+ def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
369
+ """Building a PDFMiner text extraction service
370
+
371
+ :param detector: PdfMiner
372
+ :return: TextExtractionService
373
+ """
374
+ return TextExtractionService(detector)
375
+
376
+ @staticmethod
377
+ def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
378
+ """Building a Doctr word detector service
379
+
380
+ :param detector: DoctrTextlineDetector
381
+ :return: ImageLayoutService
382
+ """
383
+ return ImageLayoutService(
384
+ layout_detector=detector, to_image=True, crop_image=True, skip_if_layout_extracted=True
385
+ )
386
+
387
+ @staticmethod
388
+ def build_text_extraction_service(
389
+ config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
390
+ ) -> TextExtractionService:
391
+ """Building a text extraction service
392
+
393
+ :param config: configuration object
394
+ :param detector: OCR detector
395
+ :return: TextExtractionService
396
+ """
397
+ return TextExtractionService(
398
+ detector,
399
+ skip_if_text_extracted=config.USE_PDF_MINER,
400
+ extract_from_roi=config.TEXT_CONTAINER if config.OCR.USE_DOCTR else None,
401
+ )
402
+
403
+ @staticmethod
404
+ def build_word_matching_service(config: AttrDict) -> MatchingService:
405
+ """Building a word matching service
406
+
407
+ :param config: configuration object
408
+ :return: MatchingService
409
+ """
410
+ matcher = IntersectionMatcher(
411
+ matching_rule=config.WORD_MATCHING.RULE,
412
+ threshold=config.WORD_MATCHING.THRESHOLD,
413
+ max_parent_only=config.WORD_MATCHING.MAX_PARENT_ONLY,
414
+ )
415
+ return MatchingService(
416
+ parent_categories=config.WORD_MATCHING.PARENTAL_CATEGORIES,
417
+ child_categories=config.TEXT_CONTAINER,
418
+ matcher=matcher,
419
+ relationship_key=Relationships.CHILD,
420
+ )
421
+
422
+ @staticmethod
423
+ def build_text_order_service(config: AttrDict) -> TextOrderService:
424
+ """Building a text order service
425
+
426
+ :param config: configuration object
427
+ :return: TextOrderService instance
428
+ """
429
+ return TextOrderService(
430
+ text_container=config.TEXT_CONTAINER,
431
+ text_block_categories=config.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
432
+ floating_text_block_categories=config.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
433
+ include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
434
+ starting_point_tolerance=config.TEXT_ORDERING.STARTING_POINT_TOLERANCE,
435
+ broken_line_tolerance=config.TEXT_ORDERING.BROKEN_LINE_TOLERANCE,
436
+ height_tolerance=config.TEXT_ORDERING.HEIGHT_TOLERANCE,
437
+ paragraph_break=config.TEXT_ORDERING.PARAGRAPH_BREAK,
438
+ )
439
+
440
+ @staticmethod
441
+ def build_page_parsing_service(config: AttrDict) -> PageParsingService:
442
+ """Building a page parsing service
443
+
444
+ :param config: configuration object
445
+ :return: PageParsingService instance
446
+ """
447
+ return PageParsingService(
448
+ text_container=config.TEXT_CONTAINER,
449
+ floating_text_block_categories=config.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
450
+ include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
451
+ )
452
+
453
+ @staticmethod
454
+ def build_analyzer(config: AttrDict) -> DoctectionPipe:
455
+ """
456
+ Builds the analyzer with a given config
457
+
458
+ :param config: configuration object
459
+ :return: Analyzer pipeline
460
+ """
461
+ pipe_component_list: list[PipelineComponent] = []
462
+
463
+ if config.USE_ROTATOR:
464
+ rotation_detector = ServiceFactory.build_rotation_detector()
465
+ transform_service = ServiceFactory.build_transform_service(transform_predictor=rotation_detector)
466
+ pipe_component_list.append(transform_service)
467
+
468
+ if config.USE_LAYOUT:
469
+ layout_detector = ServiceFactory.build_layout_detector(config, mode="LAYOUT")
470
+ layout_service = ServiceFactory.build_layout_service(config, detector=layout_detector, mode="LAYOUT")
471
+ pipe_component_list.append(layout_service)
472
+
473
+ # setup layout nms service
474
+ if config.USE_LAYOUT_NMS:
475
+ layout_nms_service = ServiceFactory.build_layout_nms_service(config)
476
+ pipe_component_list.append(layout_nms_service)
477
+
478
+ # setup tables service
479
+ if config.USE_TABLE_SEGMENTATION:
480
+ item_detector = ServiceFactory.build_layout_detector(config, mode="ITEM")
481
+ item_service = ServiceFactory.build_sub_image_service(config, detector=item_detector, mode="ITEM")
482
+ pipe_component_list.append(item_service)
483
+
484
+ if item_detector.__class__.__name__ not in ("HFDetrDerivedDetector",):
485
+ cell_detector = ServiceFactory.build_layout_detector(config, mode="CELL")
486
+ cell_service = ServiceFactory.build_sub_image_service(config, detector=cell_detector, mode="CELL")
487
+ pipe_component_list.append(cell_service)
488
+
489
+ table_segmentation_service = ServiceFactory.build_table_segmentation_service(config, detector=item_detector)
490
+ pipe_component_list.append(table_segmentation_service)
491
+
492
+ if config.USE_TABLE_REFINEMENT:
493
+ table_refinement_service = ServiceFactory.build_table_refinement_service(config)
494
+ pipe_component_list.append(table_refinement_service)
495
+
496
+ if config.USE_PDF_MINER:
497
+ pdf_miner = ServiceFactory.build_pdf_text_detector(config)
498
+ d_text = ServiceFactory.build_pdf_miner_text_service(pdf_miner)
499
+ pipe_component_list.append(d_text)
500
+
501
+ # setup ocr
502
+ if config.USE_OCR:
503
+ # the extra mile for DocTr
504
+ if config.OCR.USE_DOCTR:
505
+ word_detector = ServiceFactory.build_doctr_word_detector(config)
506
+ word_service = ServiceFactory.build_doctr_word_detector_service(word_detector)
507
+ pipe_component_list.append(word_service)
508
+
509
+ ocr_detector = ServiceFactory.build_ocr_detector(config)
510
+ text_extraction_service = ServiceFactory.build_text_extraction_service(config, ocr_detector)
511
+ pipe_component_list.append(text_extraction_service)
512
+
513
+ if config.USE_PDF_MINER or config.USE_OCR:
514
+ matching_service = ServiceFactory.build_word_matching_service(config)
515
+ pipe_component_list.append(matching_service)
516
+
517
+ text_order_service = ServiceFactory.build_text_order_service(config)
518
+ pipe_component_list.append(text_order_service)
519
+
520
+ page_parsing_service = ServiceFactory.build_page_parsing_service(config)
521
+
522
+ return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
@@ -1,3 +1,4 @@
1
+ USE_ROTATOR: False
1
2
  USE_LAYOUT: True
2
3
  USE_TABLE_SEGMENTATION: True
3
4
  TF:
@@ -527,5 +527,5 @@ class ContainerAnnotation(CategoryAnnotation):
527
527
  def from_dict(cls, **kwargs: AnnotationDict) -> ContainerAnnotation:
528
528
  container_ann = ann_from_dict(cls, **kwargs)
529
529
  value = kwargs.get("value", "")
530
- container_ann.value = value if isinstance(value, str) else list(value)
530
+ container_ann.value = value if isinstance(value, (int, float, str)) else list(value)
531
531
  return container_ann
@@ -143,11 +143,13 @@ def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -
143
143
  return np_array.astype(uint8)
144
144
 
145
145
 
146
- def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
146
+ def convert_pdf_bytes_to_np_array_v2(pdf_bytes: bytes, dpi: Optional[int] = 200) -> PixelValues:
147
147
  """
148
- Converts a pdf passed as bytes into a numpy array. Note, that this method expects poppler to be installed. This
149
- function, however does not rely on the wrapper pdf2image but uses a function of this lib which calls poppler
150
- directly.
148
+ Converts a pdf passed as bytes into a numpy array. We use poppler or pdfium to convert the pdf to an image.
149
+ If both are available you can steer the selection of the render engine with environment variables:
150
+
151
+ USE_DD_POPPLER: Set to 1, "TRUE", "True" to use poppler
152
+ USE_DD_PDFIUM: Set to 1, "TRUE", "True" to use pdfium
151
153
 
152
154
  :param pdf_bytes: A pdf as bytes object. A byte representation of a pdf file can be generated e.g. with
153
155
  `utils.fs.load_bytes_from_pdf_file`
@@ -23,7 +23,7 @@ from __future__ import annotations
23
23
  import json
24
24
  from collections import defaultdict
25
25
  from dataclasses import dataclass, field
26
- from os import environ
26
+ from os import environ, fspath
27
27
  from pathlib import Path
28
28
  from typing import Any, Optional, Sequence, Union, no_type_check
29
29
 
@@ -412,13 +412,22 @@ class Image:
412
412
  img_dict["_image"] = None
413
413
  return img_dict
414
414
 
415
+ def as_json(self) -> str:
416
+ """
417
+ Returns the full image dataclass as json string.
418
+
419
+ :return: A json string.
420
+ """
421
+
422
+ return json.dumps(self.as_dict(), indent=4)
423
+
415
424
  @staticmethod
416
425
  def remove_keys() -> list[str]:
417
426
  """
418
427
  A list of attributes to suspend from as_dict creation.
419
428
  """
420
429
 
421
- return ["_image", "_annotation_ids"]
430
+ return ["_image", "_annotation_ids", "_category_name"]
422
431
 
423
432
  def define_annotation_id(self, annotation: Annotation) -> str:
424
433
  """
@@ -443,7 +452,8 @@ class Image:
443
452
 
444
453
  Calls `List.remove`. Make sure the element is in the list, otherwise a ValueError will be raised.
445
454
 
446
- :param annotation: The annotation to remove
455
+ :param annotation_ids: The annotation to remove
456
+ :param service_ids: The service id to remove
447
457
  """
448
458
  ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()
449
459
 
@@ -703,13 +713,13 @@ class Image:
703
713
  path = path / self.image_id
704
714
  suffix = path.suffix
705
715
  if suffix:
706
- path_json = path.as_posix().replace(suffix, ".json")
716
+ path_json = fspath(path).replace(suffix, ".json")
707
717
  else:
708
- path_json = path.as_posix() + ".json"
718
+ path_json = fspath(path) + ".json"
709
719
  if highest_hierarchy_only:
710
720
  self.remove_image_from_lower_hierachy()
711
721
  export_dict = self.as_dict()
712
- export_dict["location"] = str(export_dict["location"])
722
+ export_dict["location"] = fspath(export_dict["location"])
713
723
  if not image_to_json:
714
724
  export_dict["_image"] = None
715
725
  if dry:
@@ -509,6 +509,7 @@ class Page(Image):
509
509
  "location",
510
510
  "document_id",
511
511
  "page_number",
512
+ "angle",
512
513
  }
513
514
  include_residual_text_container: bool = True
514
515