deepdoctection 0.35__py3-none-any.whl → 0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +5 -6
- deepdoctection/analyzer/_config.py +10 -18
- deepdoctection/analyzer/factory.py +214 -18
- deepdoctection/configs/conf_dd_one.yaml +4 -0
- deepdoctection/dataflow/custom_serialize.py +1 -1
- deepdoctection/datapoint/convert.py +11 -0
- deepdoctection/datapoint/image.py +2 -2
- deepdoctection/datapoint/view.py +90 -15
- deepdoctection/datasets/save.py +1 -1
- deepdoctection/eval/cocometric.py +59 -13
- deepdoctection/extern/base.py +2 -3
- deepdoctection/mapper/match.py +4 -2
- deepdoctection/mapper/misc.py +5 -1
- deepdoctection/pipe/doctectionpipe.py +77 -10
- deepdoctection/utils/fs.py +8 -7
- deepdoctection/utils/pdf_utils.py +45 -17
- deepdoctection/utils/utils.py +39 -0
- deepdoctection/utils/viz.py +49 -13
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/METADATA +116 -112
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/RECORD +23 -23
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/WHEEL +1 -1
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/LICENSE +0 -0
- {deepdoctection-0.35.dist-info → deepdoctection-0.37.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
CHANGED
|
@@ -24,14 +24,10 @@ from .utils.logger import LoggingRecord, logger
|
|
|
24
24
|
|
|
25
25
|
# pylint: enable=wrong-import-position
|
|
26
26
|
|
|
27
|
-
__version__ = 0.
|
|
27
|
+
__version__ = 0.37
|
|
28
28
|
|
|
29
29
|
_IMPORT_STRUCTURE = {
|
|
30
|
-
"analyzer": [
|
|
31
|
-
"config_sanity_checks",
|
|
32
|
-
"get_dd_analyzer",
|
|
33
|
-
"ServiceFactory"
|
|
34
|
-
],
|
|
30
|
+
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
35
31
|
"configs": [],
|
|
36
32
|
"dataflow": [
|
|
37
33
|
"DataFlowTerminated",
|
|
@@ -91,6 +87,7 @@ _IMPORT_STRUCTURE = {
|
|
|
91
87
|
"convert_b64_to_np_array",
|
|
92
88
|
"convert_np_array_to_b64",
|
|
93
89
|
"convert_np_array_to_b64_b",
|
|
90
|
+
"convert_bytes_to_np_array",
|
|
94
91
|
"convert_pdf_bytes_to_np_array_v2",
|
|
95
92
|
"box_to_point4",
|
|
96
93
|
"point4_to_box",
|
|
@@ -375,10 +372,12 @@ _IMPORT_STRUCTURE = {
|
|
|
375
372
|
"save_config_to_yaml",
|
|
376
373
|
"config_to_cli_str",
|
|
377
374
|
"decrypt_pdf_document",
|
|
375
|
+
"decrypt_pdf_document_from_bytes",
|
|
378
376
|
"get_pdf_file_reader",
|
|
379
377
|
"get_pdf_file_writer",
|
|
380
378
|
"PDFStreamer",
|
|
381
379
|
"pdf_to_np_array",
|
|
380
|
+
"split_pdf",
|
|
382
381
|
"ObjectTypes",
|
|
383
382
|
"TypeOrStr",
|
|
384
383
|
"object_types_registry",
|
|
@@ -17,11 +17,13 @@
|
|
|
17
17
|
|
|
18
18
|
"""Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
|
|
19
19
|
|
|
20
|
+
from ..datapoint.view import IMAGE_DEFAULTS
|
|
20
21
|
from ..utils.metacfg import AttrDict
|
|
21
22
|
from ..utils.settings import CellType, LayoutType
|
|
22
23
|
|
|
23
24
|
cfg = AttrDict()
|
|
24
25
|
|
|
26
|
+
|
|
25
27
|
cfg.LANGUAGE = None
|
|
26
28
|
cfg.LIB = None
|
|
27
29
|
cfg.DEVICE = None
|
|
@@ -32,11 +34,9 @@ cfg.USE_TABLE_SEGMENTATION = True
|
|
|
32
34
|
cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
|
|
33
35
|
cfg.TF.LAYOUT.FILTER = None
|
|
34
36
|
|
|
35
|
-
|
|
36
37
|
cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
|
|
37
38
|
cfg.TF.CELL.FILTER = None
|
|
38
39
|
|
|
39
|
-
|
|
40
40
|
cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
|
|
41
41
|
cfg.TF.ITEM.FILTER = None
|
|
42
42
|
|
|
@@ -112,7 +112,7 @@ cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
|
|
|
112
112
|
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
|
|
113
113
|
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
|
|
114
114
|
|
|
115
|
-
cfg.TEXT_CONTAINER =
|
|
115
|
+
cfg.TEXT_CONTAINER = IMAGE_DEFAULTS["text_container"]
|
|
116
116
|
cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
|
|
117
117
|
LayoutType.TEXT,
|
|
118
118
|
LayoutType.TITLE,
|
|
@@ -127,24 +127,16 @@ cfg.WORD_MATCHING.RULE = "ioa"
|
|
|
127
127
|
cfg.WORD_MATCHING.THRESHOLD = 0.6
|
|
128
128
|
cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
|
|
129
129
|
|
|
130
|
-
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
|
|
131
|
-
|
|
132
|
-
LayoutType.TITLE,
|
|
133
|
-
LayoutType.LIST,
|
|
134
|
-
LayoutType.CELL,
|
|
135
|
-
CellType.COLUMN_HEADER,
|
|
136
|
-
CellType.PROJECTED_ROW_HEADER,
|
|
137
|
-
CellType.SPANNING,
|
|
138
|
-
CellType.ROW_HEADER,
|
|
139
|
-
]
|
|
140
|
-
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
|
|
141
|
-
LayoutType.TEXT,
|
|
142
|
-
LayoutType.TITLE,
|
|
143
|
-
LayoutType.LIST,
|
|
144
|
-
]
|
|
130
|
+
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["text_block_categories"]
|
|
131
|
+
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["floating_text_block_categories"]
|
|
145
132
|
cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
|
|
146
133
|
cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
|
|
147
134
|
cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
|
|
148
135
|
cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
|
|
149
136
|
cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
|
|
137
|
+
|
|
138
|
+
cfg.USE_LAYOUT_LINK = False
|
|
139
|
+
cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = []
|
|
140
|
+
cfg.LAYOUT_LINK.CHILD_CATEGORIES = []
|
|
141
|
+
|
|
150
142
|
cfg.freeze()
|
|
@@ -33,7 +33,13 @@ from ..extern.tessocr import TesseractOcrDetector, TesseractRotationTransformer
|
|
|
33
33
|
from ..extern.texocr import TextractOcrDetector
|
|
34
34
|
from ..extern.tpdetect import TPFrcnnDetector
|
|
35
35
|
from ..pipe.base import PipelineComponent
|
|
36
|
-
from ..pipe.common import
|
|
36
|
+
from ..pipe.common import (
|
|
37
|
+
AnnotationNmsService,
|
|
38
|
+
IntersectionMatcher,
|
|
39
|
+
MatchingService,
|
|
40
|
+
NeighbourMatcher,
|
|
41
|
+
PageParsingService,
|
|
42
|
+
)
|
|
37
43
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
38
44
|
from ..pipe.layout import ImageLayoutService
|
|
39
45
|
from ..pipe.order import TextOrderService
|
|
@@ -78,7 +84,7 @@ class ServiceFactory:
|
|
|
78
84
|
"""
|
|
79
85
|
|
|
80
86
|
@staticmethod
|
|
81
|
-
def
|
|
87
|
+
def _build_layout_detector(
|
|
82
88
|
config: AttrDict,
|
|
83
89
|
mode: str,
|
|
84
90
|
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
@@ -141,17 +147,38 @@ class ServiceFactory:
|
|
|
141
147
|
)
|
|
142
148
|
|
|
143
149
|
@staticmethod
|
|
144
|
-
def
|
|
150
|
+
def build_layout_detector(
|
|
151
|
+
config: AttrDict, mode: str
|
|
152
|
+
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
153
|
+
"""Building a layout detector according to the config
|
|
154
|
+
|
|
155
|
+
:param config: configuration object
|
|
156
|
+
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
157
|
+
"""
|
|
158
|
+
return ServiceFactory._build_layout_detector(config, mode)
|
|
159
|
+
|
|
160
|
+
@staticmethod
|
|
161
|
+
def _build_rotation_detector() -> TesseractRotationTransformer:
|
|
145
162
|
"""Building a rotation detector"""
|
|
146
163
|
return TesseractRotationTransformer()
|
|
147
164
|
|
|
148
165
|
@staticmethod
|
|
149
|
-
def
|
|
166
|
+
def build_rotation_detector() -> TesseractRotationTransformer:
|
|
167
|
+
"""Building a rotation detector"""
|
|
168
|
+
return ServiceFactory._build_rotation_detector()
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def _build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
|
|
150
172
|
"""Building a transform service with a given predictor"""
|
|
151
173
|
return SimpleTransformService(transform_predictor)
|
|
152
174
|
|
|
153
175
|
@staticmethod
|
|
154
|
-
def
|
|
176
|
+
def build_transform_service(transform_predictor: ImageTransformer) -> SimpleTransformService:
|
|
177
|
+
"""Building a transform service with a given predictor"""
|
|
178
|
+
return ServiceFactory._build_transform_service(transform_predictor)
|
|
179
|
+
|
|
180
|
+
@staticmethod
|
|
181
|
+
def _build_padder(config: AttrDict, mode: str) -> PadTransform:
|
|
155
182
|
"""Building a padder according to the config
|
|
156
183
|
|
|
157
184
|
:param config: configuration object
|
|
@@ -164,10 +191,20 @@ class ServiceFactory:
|
|
|
164
191
|
getattr(config.PT, mode).PAD.BOTTOM,
|
|
165
192
|
getattr(config.PT, mode).PAD.LEFT,
|
|
166
193
|
)
|
|
167
|
-
return PadTransform(top=top, right=right, bottom=bottom, left=left)
|
|
194
|
+
return PadTransform(top=top, right=right, bottom=bottom, left=left) #
|
|
168
195
|
|
|
169
196
|
@staticmethod
|
|
170
|
-
def
|
|
197
|
+
def build_padder(config: AttrDict, mode: str) -> PadTransform:
|
|
198
|
+
"""Building a padder according to the config
|
|
199
|
+
|
|
200
|
+
:param config: configuration object
|
|
201
|
+
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
202
|
+
:return `PadTransform` instance
|
|
203
|
+
"""
|
|
204
|
+
return ServiceFactory._build_padder(config, mode)
|
|
205
|
+
|
|
206
|
+
@staticmethod
|
|
207
|
+
def _build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
|
|
171
208
|
"""Building a layout service with a given detector
|
|
172
209
|
|
|
173
210
|
:param config: configuration object
|
|
@@ -181,7 +218,18 @@ class ServiceFactory:
|
|
|
181
218
|
return ImageLayoutService(layout_detector=detector, to_image=True, crop_image=True, padder=padder)
|
|
182
219
|
|
|
183
220
|
@staticmethod
|
|
184
|
-
def
|
|
221
|
+
def build_layout_service(config: AttrDict, detector: ObjectDetector, mode: str) -> ImageLayoutService:
|
|
222
|
+
"""Building a layout service with a given detector
|
|
223
|
+
|
|
224
|
+
:param config: configuration object
|
|
225
|
+
:param detector: will be passed to the `ImageLayoutService`
|
|
226
|
+
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
227
|
+
:return `ImageLayoutService` instance
|
|
228
|
+
"""
|
|
229
|
+
return ServiceFactory._build_layout_service(config, detector, mode)
|
|
230
|
+
|
|
231
|
+
@staticmethod
|
|
232
|
+
def _build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
|
|
185
233
|
"""Building a NMS service for layout annotations
|
|
186
234
|
|
|
187
235
|
:param config: configuration object
|
|
@@ -199,7 +247,15 @@ class ServiceFactory:
|
|
|
199
247
|
)
|
|
200
248
|
|
|
201
249
|
@staticmethod
|
|
202
|
-
def
|
|
250
|
+
def build_layout_nms_service(config: AttrDict) -> AnnotationNmsService:
|
|
251
|
+
"""Building a NMS service for layout annotations
|
|
252
|
+
|
|
253
|
+
:param config: configuration object
|
|
254
|
+
"""
|
|
255
|
+
return ServiceFactory._build_layout_nms_service(config)
|
|
256
|
+
|
|
257
|
+
@staticmethod
|
|
258
|
+
def _build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
|
|
203
259
|
"""
|
|
204
260
|
Building a sub image layout service with a given detector
|
|
205
261
|
|
|
@@ -226,7 +282,19 @@ class ServiceFactory:
|
|
|
226
282
|
)
|
|
227
283
|
|
|
228
284
|
@staticmethod
|
|
229
|
-
def
|
|
285
|
+
def build_sub_image_service(config: AttrDict, detector: ObjectDetector, mode: str) -> SubImageLayoutService:
|
|
286
|
+
"""
|
|
287
|
+
Building a sub image layout service with a given detector
|
|
288
|
+
|
|
289
|
+
:param config: configuration object
|
|
290
|
+
:param detector: will be passed to the `SubImageLayoutService`
|
|
291
|
+
:param mode: either `LAYOUT`,`CELL` or `ITEM`
|
|
292
|
+
:return: `SubImageLayoutService` instance
|
|
293
|
+
"""
|
|
294
|
+
return ServiceFactory._build_sub_image_service(config, detector, mode)
|
|
295
|
+
|
|
296
|
+
@staticmethod
|
|
297
|
+
def _build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
230
298
|
"""
|
|
231
299
|
Building OCR predictor
|
|
232
300
|
|
|
@@ -266,6 +334,15 @@ class ServiceFactory:
|
|
|
266
334
|
return TextractOcrDetector(**credentials_kwargs)
|
|
267
335
|
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
268
336
|
|
|
337
|
+
@staticmethod
|
|
338
|
+
def build_ocr_detector(config: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]:
|
|
339
|
+
"""
|
|
340
|
+
Building OCR predictor
|
|
341
|
+
|
|
342
|
+
:param config: configuration object
|
|
343
|
+
"""
|
|
344
|
+
return ServiceFactory._build_ocr_detector(config)
|
|
345
|
+
|
|
269
346
|
@staticmethod
|
|
270
347
|
def build_doctr_word_detector(config: AttrDict) -> DoctrTextlineDetector:
|
|
271
348
|
"""Building `DoctrTextlineDetector` instance
|
|
@@ -285,7 +362,7 @@ class ServiceFactory:
|
|
|
285
362
|
)
|
|
286
363
|
|
|
287
364
|
@staticmethod
|
|
288
|
-
def
|
|
365
|
+
def _build_table_segmentation_service(
|
|
289
366
|
config: AttrDict,
|
|
290
367
|
detector: ObjectDetector,
|
|
291
368
|
) -> Union[PubtablesSegmentationService, TableSegmentationService]:
|
|
@@ -342,7 +419,32 @@ class ServiceFactory:
|
|
|
342
419
|
return table_segmentation
|
|
343
420
|
|
|
344
421
|
@staticmethod
|
|
345
|
-
def
|
|
422
|
+
def build_table_segmentation_service(
|
|
423
|
+
config: AttrDict,
|
|
424
|
+
detector: ObjectDetector,
|
|
425
|
+
) -> Union[PubtablesSegmentationService, TableSegmentationService]:
|
|
426
|
+
"""
|
|
427
|
+
Build and return a table segmentation service based on the provided detector.
|
|
428
|
+
|
|
429
|
+
Depending on the type of the detector, this method will return either a `PubtablesSegmentationService` or a
|
|
430
|
+
`TableSegmentationService` instance. The selection is made as follows:
|
|
431
|
+
|
|
432
|
+
- If the detector is an instance of `HFDetrDerivedDetector`, a `PubtablesSegmentationService` is created and
|
|
433
|
+
returned. This service uses specific configuration parameters for segmentation, such as assignment rules,
|
|
434
|
+
thresholds, and cell names defined in the `cfg` object.
|
|
435
|
+
- For other detector types, a `TableSegmentationService` is created and returned. This service also uses
|
|
436
|
+
configuration parameters from the `cfg` object but is tailored for different segmentation needs.
|
|
437
|
+
|
|
438
|
+
:param config: configuration object
|
|
439
|
+
:param detector: An instance of `ObjectDetector` used to determine the type of table segmentation
|
|
440
|
+
service to build.
|
|
441
|
+
:return: An instance of either `PubtablesSegmentationService` or `TableSegmentationService` based on the
|
|
442
|
+
detector type.
|
|
443
|
+
"""
|
|
444
|
+
return ServiceFactory._build_table_segmentation_service(config, detector)
|
|
445
|
+
|
|
446
|
+
@staticmethod
|
|
447
|
+
def _build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
|
|
346
448
|
"""Building a table segmentation refinement service
|
|
347
449
|
|
|
348
450
|
:param config: configuration object
|
|
@@ -354,7 +456,16 @@ class ServiceFactory:
|
|
|
354
456
|
)
|
|
355
457
|
|
|
356
458
|
@staticmethod
|
|
357
|
-
def
|
|
459
|
+
def build_table_refinement_service(config: AttrDict) -> TableSegmentationRefinementService:
|
|
460
|
+
"""Building a table segmentation refinement service
|
|
461
|
+
|
|
462
|
+
:param config: configuration object
|
|
463
|
+
:return: TableSegmentationRefinementService
|
|
464
|
+
"""
|
|
465
|
+
return ServiceFactory._build_table_refinement_service(config)
|
|
466
|
+
|
|
467
|
+
@staticmethod
|
|
468
|
+
def _build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
|
|
358
469
|
"""Building a PDF text detector
|
|
359
470
|
|
|
360
471
|
:param config: configuration object
|
|
@@ -365,7 +476,16 @@ class ServiceFactory:
|
|
|
365
476
|
)
|
|
366
477
|
|
|
367
478
|
@staticmethod
|
|
368
|
-
def
|
|
479
|
+
def build_pdf_text_detector(config: AttrDict) -> PdfPlumberTextDetector:
|
|
480
|
+
"""Building a PDF text detector
|
|
481
|
+
|
|
482
|
+
:param config: configuration object
|
|
483
|
+
:return: PdfPlumberTextDetector
|
|
484
|
+
"""
|
|
485
|
+
return ServiceFactory._build_pdf_text_detector(config)
|
|
486
|
+
|
|
487
|
+
@staticmethod
|
|
488
|
+
def _build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
|
|
369
489
|
"""Building a PDFMiner text extraction service
|
|
370
490
|
|
|
371
491
|
:param detector: PdfMiner
|
|
@@ -373,6 +493,15 @@ class ServiceFactory:
|
|
|
373
493
|
"""
|
|
374
494
|
return TextExtractionService(detector)
|
|
375
495
|
|
|
496
|
+
@staticmethod
|
|
497
|
+
def build_pdf_miner_text_service(detector: PdfMiner) -> TextExtractionService:
|
|
498
|
+
"""Building a PDFMiner text extraction service
|
|
499
|
+
|
|
500
|
+
:param detector: PdfMiner
|
|
501
|
+
:return: TextExtractionService
|
|
502
|
+
"""
|
|
503
|
+
return ServiceFactory._build_pdf_miner_text_service(detector)
|
|
504
|
+
|
|
376
505
|
@staticmethod
|
|
377
506
|
def build_doctr_word_detector_service(detector: DoctrTextlineDetector) -> ImageLayoutService:
|
|
378
507
|
"""Building a Doctr word detector service
|
|
@@ -385,7 +514,7 @@ class ServiceFactory:
|
|
|
385
514
|
)
|
|
386
515
|
|
|
387
516
|
@staticmethod
|
|
388
|
-
def
|
|
517
|
+
def _build_text_extraction_service(
|
|
389
518
|
config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
|
|
390
519
|
) -> TextExtractionService:
|
|
391
520
|
"""Building a text extraction service
|
|
@@ -401,7 +530,19 @@ class ServiceFactory:
|
|
|
401
530
|
)
|
|
402
531
|
|
|
403
532
|
@staticmethod
|
|
404
|
-
def
|
|
533
|
+
def build_text_extraction_service(
|
|
534
|
+
config: AttrDict, detector: Union[TesseractOcrDetector, DoctrTextRecognizer, TextractOcrDetector]
|
|
535
|
+
) -> TextExtractionService:
|
|
536
|
+
"""Building a text extraction service
|
|
537
|
+
|
|
538
|
+
:param config: configuration object
|
|
539
|
+
:param detector: OCR detector
|
|
540
|
+
:return: TextExtractionService
|
|
541
|
+
"""
|
|
542
|
+
return ServiceFactory._build_text_extraction_service(config, detector)
|
|
543
|
+
|
|
544
|
+
@staticmethod
|
|
545
|
+
def _build_word_matching_service(config: AttrDict) -> MatchingService:
|
|
405
546
|
"""Building a word matching service
|
|
406
547
|
|
|
407
548
|
:param config: configuration object
|
|
@@ -420,7 +561,40 @@ class ServiceFactory:
|
|
|
420
561
|
)
|
|
421
562
|
|
|
422
563
|
@staticmethod
|
|
423
|
-
def
|
|
564
|
+
def build_word_matching_service(config: AttrDict) -> MatchingService:
|
|
565
|
+
"""Building a word matching service
|
|
566
|
+
|
|
567
|
+
:param config: configuration object
|
|
568
|
+
:return: MatchingService
|
|
569
|
+
"""
|
|
570
|
+
return ServiceFactory._build_word_matching_service(config)
|
|
571
|
+
|
|
572
|
+
@staticmethod
|
|
573
|
+
def _build_layout_link_matching_service(config: AttrDict) -> MatchingService:
|
|
574
|
+
"""Building a word matching service
|
|
575
|
+
|
|
576
|
+
:param config: configuration object
|
|
577
|
+
:return: MatchingService
|
|
578
|
+
"""
|
|
579
|
+
neighbor_matcher = NeighbourMatcher()
|
|
580
|
+
return MatchingService(
|
|
581
|
+
parent_categories=config.LAYOUT_LINK.PARENTAL_CATEGORIES,
|
|
582
|
+
child_categories=config.LAYOUT_LINK.CHILD_CATEGORIES,
|
|
583
|
+
matcher=neighbor_matcher,
|
|
584
|
+
relationship_key=Relationships.LAYOUT_LINK,
|
|
585
|
+
)
|
|
586
|
+
|
|
587
|
+
@staticmethod
|
|
588
|
+
def build_layout_link_matching_service(config: AttrDict) -> MatchingService:
|
|
589
|
+
"""Building a word matching service
|
|
590
|
+
|
|
591
|
+
:param config: configuration object
|
|
592
|
+
:return: MatchingService
|
|
593
|
+
"""
|
|
594
|
+
return ServiceFactory._build_layout_link_matching_service(config)
|
|
595
|
+
|
|
596
|
+
@staticmethod
|
|
597
|
+
def _build_text_order_service(config: AttrDict) -> TextOrderService:
|
|
424
598
|
"""Building a text order service
|
|
425
599
|
|
|
426
600
|
:param config: configuration object
|
|
@@ -438,7 +612,16 @@ class ServiceFactory:
|
|
|
438
612
|
)
|
|
439
613
|
|
|
440
614
|
@staticmethod
|
|
441
|
-
def
|
|
615
|
+
def build_text_order_service(config: AttrDict) -> TextOrderService:
|
|
616
|
+
"""Building a text order service
|
|
617
|
+
|
|
618
|
+
:param config: configuration object
|
|
619
|
+
:return: TextOrderService instance
|
|
620
|
+
"""
|
|
621
|
+
return ServiceFactory._build_text_order_service(config)
|
|
622
|
+
|
|
623
|
+
@staticmethod
|
|
624
|
+
def _build_page_parsing_service(config: AttrDict) -> PageParsingService:
|
|
442
625
|
"""Building a page parsing service
|
|
443
626
|
|
|
444
627
|
:param config: configuration object
|
|
@@ -450,6 +633,15 @@ class ServiceFactory:
|
|
|
450
633
|
include_residual_text_container=config.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
451
634
|
)
|
|
452
635
|
|
|
636
|
+
@staticmethod
|
|
637
|
+
def build_page_parsing_service(config: AttrDict) -> PageParsingService:
|
|
638
|
+
"""Building a page parsing service
|
|
639
|
+
|
|
640
|
+
:param config: configuration object
|
|
641
|
+
:return: PageParsingService instance
|
|
642
|
+
"""
|
|
643
|
+
return ServiceFactory._build_page_parsing_service(config)
|
|
644
|
+
|
|
453
645
|
@staticmethod
|
|
454
646
|
def build_analyzer(config: AttrDict) -> DoctectionPipe:
|
|
455
647
|
"""
|
|
@@ -517,6 +709,10 @@ class ServiceFactory:
|
|
|
517
709
|
text_order_service = ServiceFactory.build_text_order_service(config)
|
|
518
710
|
pipe_component_list.append(text_order_service)
|
|
519
711
|
|
|
712
|
+
if config.USE_LAYOUT_LINK:
|
|
713
|
+
layout_link_matching_service = ServiceFactory.build_layout_link_matching_service(config)
|
|
714
|
+
pipe_component_list.append(layout_link_matching_service)
|
|
715
|
+
|
|
520
716
|
page_parsing_service = ServiceFactory.build_page_parsing_service(config)
|
|
521
717
|
|
|
522
718
|
return DoctectionPipe(pipeline_component_list=pipe_component_list, page_parsing_service=page_parsing_service)
|
|
@@ -593,7 +593,7 @@ class SerializerPdfDoc:
|
|
|
593
593
|
file_name = os.path.split(path)[1]
|
|
594
594
|
prefix, suffix = os.path.splitext(file_name)
|
|
595
595
|
df: DataFlow
|
|
596
|
-
df = CustomDataFromIterable(PDFStreamer(
|
|
596
|
+
df = CustomDataFromIterable(PDFStreamer(path_or_bytes=path), max_datapoints=max_datapoints)
|
|
597
597
|
df = MapData(
|
|
598
598
|
df,
|
|
599
599
|
lambda dp: {
|
|
@@ -40,6 +40,7 @@ __all__ = [
|
|
|
40
40
|
"convert_b64_to_np_array",
|
|
41
41
|
"convert_np_array_to_b64",
|
|
42
42
|
"convert_np_array_to_b64_b",
|
|
43
|
+
"convert_bytes_to_np_array",
|
|
43
44
|
"convert_pdf_bytes_to_np_array_v2",
|
|
44
45
|
"box_to_point4",
|
|
45
46
|
"point4_to_box",
|
|
@@ -107,6 +108,16 @@ def convert_np_array_to_b64_b(np_image: PixelValues) -> bytes:
|
|
|
107
108
|
return viz_handler.encode(np_image)
|
|
108
109
|
|
|
109
110
|
|
|
111
|
+
def convert_bytes_to_np_array(image_bytes: bytes) -> PixelValues:
|
|
112
|
+
"""
|
|
113
|
+
Converts an image in bytes to a numpy array
|
|
114
|
+
|
|
115
|
+
:param image_bytes: An image as bytes.
|
|
116
|
+
:return: numpy array.
|
|
117
|
+
"""
|
|
118
|
+
return viz_handler.convert_bytes_to_np(image_bytes)
|
|
119
|
+
|
|
120
|
+
|
|
110
121
|
@deprecated("Use convert_pdf_bytes_to_np_array_v2", "2022-02-23")
|
|
111
122
|
def convert_pdf_bytes_to_np_array(pdf_bytes: bytes, dpi: Optional[int] = None) -> PixelValues:
|
|
112
123
|
"""
|
|
@@ -587,7 +587,7 @@ class Image:
|
|
|
587
587
|
)
|
|
588
588
|
ann.image.dump(sub_image)
|
|
589
589
|
|
|
590
|
-
def
|
|
590
|
+
def remove_image_from_lower_hierarchy(self, pixel_values_only: bool = False) -> None:
|
|
591
591
|
"""Will remove all images from image annotations."""
|
|
592
592
|
for ann in self.annotations:
|
|
593
593
|
if pixel_values_only:
|
|
@@ -717,7 +717,7 @@ class Image:
|
|
|
717
717
|
else:
|
|
718
718
|
path_json = fspath(path) + ".json"
|
|
719
719
|
if highest_hierarchy_only:
|
|
720
|
-
self.
|
|
720
|
+
self.remove_image_from_lower_hierarchy()
|
|
721
721
|
export_dict = self.as_dict()
|
|
722
722
|
export_dict["location"] = fspath(export_dict["location"])
|
|
723
723
|
if not image_to_json:
|