deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +16 -29
- deepdoctection/analyzer/dd.py +70 -59
- deepdoctection/configs/conf_dd_one.yaml +34 -31
- deepdoctection/dataflow/common.py +9 -5
- deepdoctection/dataflow/custom.py +5 -5
- deepdoctection/dataflow/custom_serialize.py +75 -18
- deepdoctection/dataflow/parallel_map.py +3 -3
- deepdoctection/dataflow/serialize.py +4 -4
- deepdoctection/dataflow/stats.py +3 -3
- deepdoctection/datapoint/annotation.py +41 -56
- deepdoctection/datapoint/box.py +9 -8
- deepdoctection/datapoint/convert.py +6 -6
- deepdoctection/datapoint/image.py +56 -44
- deepdoctection/datapoint/view.py +245 -150
- deepdoctection/datasets/__init__.py +1 -4
- deepdoctection/datasets/adapter.py +35 -26
- deepdoctection/datasets/base.py +14 -12
- deepdoctection/datasets/dataflow_builder.py +3 -3
- deepdoctection/datasets/info.py +24 -26
- deepdoctection/datasets/instances/doclaynet.py +51 -51
- deepdoctection/datasets/instances/fintabnet.py +46 -46
- deepdoctection/datasets/instances/funsd.py +25 -24
- deepdoctection/datasets/instances/iiitar13k.py +13 -10
- deepdoctection/datasets/instances/layouttest.py +4 -3
- deepdoctection/datasets/instances/publaynet.py +5 -5
- deepdoctection/datasets/instances/pubtables1m.py +24 -21
- deepdoctection/datasets/instances/pubtabnet.py +32 -30
- deepdoctection/datasets/instances/rvlcdip.py +30 -30
- deepdoctection/datasets/instances/xfund.py +26 -26
- deepdoctection/datasets/save.py +6 -6
- deepdoctection/eval/__init__.py +1 -4
- deepdoctection/eval/accmetric.py +32 -33
- deepdoctection/eval/base.py +8 -9
- deepdoctection/eval/cocometric.py +15 -13
- deepdoctection/eval/eval.py +41 -37
- deepdoctection/eval/tedsmetric.py +30 -23
- deepdoctection/eval/tp_eval_callback.py +16 -19
- deepdoctection/extern/__init__.py +2 -7
- deepdoctection/extern/base.py +339 -134
- deepdoctection/extern/d2detect.py +85 -113
- deepdoctection/extern/deskew.py +14 -11
- deepdoctection/extern/doctrocr.py +141 -130
- deepdoctection/extern/fastlang.py +27 -18
- deepdoctection/extern/hfdetr.py +71 -62
- deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection/extern/hflm.py +230 -0
- deepdoctection/extern/model.py +488 -302
- deepdoctection/extern/pdftext.py +23 -19
- deepdoctection/extern/pt/__init__.py +1 -3
- deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection/extern/pt/ptutils.py +29 -19
- deepdoctection/extern/tessocr.py +39 -38
- deepdoctection/extern/texocr.py +18 -18
- deepdoctection/extern/tp/tfutils.py +57 -9
- deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
- deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- deepdoctection/extern/tpdetect.py +45 -53
- deepdoctection/mapper/__init__.py +3 -8
- deepdoctection/mapper/cats.py +27 -29
- deepdoctection/mapper/cocostruct.py +10 -10
- deepdoctection/mapper/d2struct.py +27 -26
- deepdoctection/mapper/hfstruct.py +13 -8
- deepdoctection/mapper/laylmstruct.py +178 -37
- deepdoctection/mapper/maputils.py +12 -11
- deepdoctection/mapper/match.py +2 -2
- deepdoctection/mapper/misc.py +11 -9
- deepdoctection/mapper/pascalstruct.py +4 -4
- deepdoctection/mapper/prodigystruct.py +5 -5
- deepdoctection/mapper/pubstruct.py +84 -92
- deepdoctection/mapper/tpstruct.py +5 -5
- deepdoctection/mapper/xfundstruct.py +33 -33
- deepdoctection/pipe/__init__.py +1 -1
- deepdoctection/pipe/anngen.py +12 -14
- deepdoctection/pipe/base.py +52 -106
- deepdoctection/pipe/common.py +72 -59
- deepdoctection/pipe/concurrency.py +16 -11
- deepdoctection/pipe/doctectionpipe.py +24 -21
- deepdoctection/pipe/language.py +20 -25
- deepdoctection/pipe/layout.py +20 -16
- deepdoctection/pipe/lm.py +75 -105
- deepdoctection/pipe/order.py +194 -89
- deepdoctection/pipe/refine.py +111 -124
- deepdoctection/pipe/segment.py +156 -161
- deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
- deepdoctection/pipe/text.py +37 -36
- deepdoctection/pipe/transform.py +19 -16
- deepdoctection/train/__init__.py +6 -12
- deepdoctection/train/d2_frcnn_train.py +48 -41
- deepdoctection/train/hf_detr_train.py +41 -30
- deepdoctection/train/hf_layoutlm_train.py +153 -135
- deepdoctection/train/tp_frcnn_train.py +32 -31
- deepdoctection/utils/concurrency.py +1 -1
- deepdoctection/utils/context.py +13 -6
- deepdoctection/utils/develop.py +4 -4
- deepdoctection/utils/env_info.py +87 -125
- deepdoctection/utils/file_utils.py +6 -11
- deepdoctection/utils/fs.py +22 -18
- deepdoctection/utils/identifier.py +2 -2
- deepdoctection/utils/logger.py +16 -15
- deepdoctection/utils/metacfg.py +7 -7
- deepdoctection/utils/mocks.py +93 -0
- deepdoctection/utils/pdf_utils.py +11 -11
- deepdoctection/utils/settings.py +185 -181
- deepdoctection/utils/tqdm.py +1 -1
- deepdoctection/utils/transform.py +14 -9
- deepdoctection/utils/types.py +104 -0
- deepdoctection/utils/utils.py +7 -7
- deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
- deepdoctection-0.33.dist-info/RECORD +146 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
- deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31.dist-info/RECORD +0 -144
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
- {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
deepdoctection/pipe/order.py
CHANGED
|
@@ -18,11 +18,14 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Module for ordering text and layout segments pipeline components
|
|
20
20
|
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
21
23
|
import os
|
|
24
|
+
from abc import ABC
|
|
22
25
|
from copy import copy
|
|
23
26
|
from itertools import chain
|
|
24
27
|
from logging import DEBUG
|
|
25
|
-
from typing import Any,
|
|
28
|
+
from typing import Any, Optional, Sequence, Union
|
|
26
29
|
|
|
27
30
|
import numpy as np
|
|
28
31
|
|
|
@@ -32,9 +35,8 @@ from ..datapoint.image import Image
|
|
|
32
35
|
from ..datapoint.view import IMAGE_DEFAULTS
|
|
33
36
|
from ..extern.base import DetectionResult
|
|
34
37
|
from ..extern.tp.tpfrcnn.utils.np_box_ops import ioa as np_ioa
|
|
35
|
-
from ..pipe.base import PipelineComponent
|
|
38
|
+
from ..pipe.base import MetaAnnotation, PipelineComponent
|
|
36
39
|
from ..pipe.registry import pipeline_component_registry
|
|
37
|
-
from ..utils.detection_types import JsonDict
|
|
38
40
|
from ..utils.logger import LoggingRecord, logger
|
|
39
41
|
from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
|
|
40
42
|
|
|
@@ -67,7 +69,7 @@ class OrderGenerator:
|
|
|
67
69
|
@staticmethod
|
|
68
70
|
def group_words_into_lines(
|
|
69
71
|
word_anns: Sequence[ImageAnnotation], image_id: Optional[str] = None
|
|
70
|
-
) ->
|
|
72
|
+
) -> list[tuple[int, int, str]]:
|
|
71
73
|
"""Arranging words into horizontal text lines and sorting text lines vertically in order to give
|
|
72
74
|
an enumeration of words that is used for establishing the reading order. Using this reading order arrangement
|
|
73
75
|
makes only sense for words within a rectangle and needs to be revised in more complex appearances.
|
|
@@ -75,7 +77,7 @@ class OrderGenerator:
|
|
|
75
77
|
id)`.
|
|
76
78
|
"""
|
|
77
79
|
reading_lines = []
|
|
78
|
-
rows:
|
|
80
|
+
rows: list[dict[str, float]] = []
|
|
79
81
|
for word in word_anns:
|
|
80
82
|
bounding_box = word.get_bounding_box(image_id)
|
|
81
83
|
row_found = False
|
|
@@ -114,13 +116,13 @@ class OrderGenerator:
|
|
|
114
116
|
@staticmethod
|
|
115
117
|
def group_lines_into_lines(
|
|
116
118
|
line_anns: Sequence[ImageAnnotation], image_id: Optional[str] = None
|
|
117
|
-
) ->
|
|
119
|
+
) -> list[tuple[int, int, str]]:
|
|
118
120
|
"""
|
|
119
121
|
Sorting reading lines. Returns, for a list of `ImageAnnotation`, a list of tuples (each tuple containing the
|
|
120
122
|
reading order and the `annotation_id`) for each list element.
|
|
121
123
|
:param line_anns: text line `ImageAnnotation`
|
|
122
124
|
:param image_id: image_id of underlying image (used to get the bounding boxes)
|
|
123
|
-
:return: `
|
|
125
|
+
:return: `list[(reading_order, reading_order,annotation_id)]`
|
|
124
126
|
"""
|
|
125
127
|
reading_lines = []
|
|
126
128
|
for ann in line_anns:
|
|
@@ -131,9 +133,9 @@ class OrderGenerator:
|
|
|
131
133
|
return [(idx + 1, idx + 1, line[1]) for idx, line in enumerate(reading_lines)]
|
|
132
134
|
|
|
133
135
|
@staticmethod
|
|
134
|
-
def _connected_components(columns:
|
|
136
|
+
def _connected_components(columns: list[BoundingBox]) -> list[dict[str, Any]]:
|
|
135
137
|
# building connected components of columns
|
|
136
|
-
connected_components:
|
|
138
|
+
connected_components: list[dict[str, Any]] = []
|
|
137
139
|
for idx, col in enumerate(columns):
|
|
138
140
|
col_dict = {"id": idx, "box": col}
|
|
139
141
|
component_found = False
|
|
@@ -168,8 +170,8 @@ class OrderGenerator:
|
|
|
168
170
|
return connected_components
|
|
169
171
|
|
|
170
172
|
def order_blocks(
|
|
171
|
-
self, anns:
|
|
172
|
-
) -> Sequence[
|
|
173
|
+
self, anns: list[ImageAnnotation], image_width: float, image_height: float, image_id: Optional[str] = None
|
|
174
|
+
) -> Sequence[tuple[int, str]]:
|
|
173
175
|
"""
|
|
174
176
|
Determining a text ordering of text blocks. These text blocks should be larger sections than barely words.
|
|
175
177
|
It will first try to detect columns, then try to consolidate columns and finally try to detect connected
|
|
@@ -181,12 +183,12 @@ class OrderGenerator:
|
|
|
181
183
|
:param image_width: image width (to re-calculate bounding boxes into relative coords)
|
|
182
184
|
:param image_height: image height (to re-calculate bounding boxes into relative coords)
|
|
183
185
|
:param image_id: image id
|
|
184
|
-
:return:
|
|
186
|
+
:return: list of tuples with reading order position and `annotation_id`
|
|
185
187
|
"""
|
|
186
188
|
if not anns:
|
|
187
189
|
return []
|
|
188
190
|
reading_blocks = []
|
|
189
|
-
columns:
|
|
191
|
+
columns: list[BoundingBox] = []
|
|
190
192
|
anns.sort(
|
|
191
193
|
key=lambda x: (
|
|
192
194
|
x.bounding_box.transform(image_width, image_height).cy, # type: ignore
|
|
@@ -267,7 +269,7 @@ class OrderGenerator:
|
|
|
267
269
|
blocks.sort(key=lambda x: x[0]) # type: ignore
|
|
268
270
|
sorted_blocks = []
|
|
269
271
|
max_block_number = max(list(columns_dict.values()))
|
|
270
|
-
filtered_blocks: Sequence[
|
|
272
|
+
filtered_blocks: Sequence[tuple[int, str]]
|
|
271
273
|
for idx in range(max_block_number + 1):
|
|
272
274
|
filtered_blocks = list(filter(lambda x: x[0] == idx, blocks)) # type: ignore # pylint: disable=W0640
|
|
273
275
|
sorted_blocks.extend(self._sort_anns_grouped_by_blocks(filtered_blocks, anns, image_width, image_height))
|
|
@@ -286,7 +288,7 @@ class OrderGenerator:
|
|
|
286
288
|
)
|
|
287
289
|
return reading_blocks
|
|
288
290
|
|
|
289
|
-
def _consolidate_columns(self, columns:
|
|
291
|
+
def _consolidate_columns(self, columns: list[BoundingBox]) -> dict[int, int]:
|
|
290
292
|
if not columns:
|
|
291
293
|
return {}
|
|
292
294
|
np_boxes = np.array([col.to_list(mode="xyxy") for col in columns])
|
|
@@ -307,8 +309,8 @@ class OrderGenerator:
|
|
|
307
309
|
|
|
308
310
|
@staticmethod
|
|
309
311
|
def _sort_anns_grouped_by_blocks(
|
|
310
|
-
block: Sequence[
|
|
311
|
-
) ->
|
|
312
|
+
block: Sequence[tuple[int, str]], anns: Sequence[ImageAnnotation], image_width: float, image_height: float
|
|
313
|
+
) -> list[tuple[int, str]]:
|
|
312
314
|
if not block:
|
|
313
315
|
return []
|
|
314
316
|
anns_and_blocks_numbers = list(zip(*block))
|
|
@@ -326,14 +328,14 @@ class OrderGenerator:
|
|
|
326
328
|
@staticmethod
|
|
327
329
|
def _make_column_detect_results(columns: Sequence[BoundingBox]) -> Sequence[DetectionResult]:
|
|
328
330
|
column_detect_result_list = []
|
|
329
|
-
if os.environ.get("LOG_LEVEL") == "DEBUG":
|
|
331
|
+
if os.environ.get("LOG_LEVEL", "INFO") == "DEBUG":
|
|
330
332
|
for box in columns:
|
|
331
333
|
column_detect_result_list.append(
|
|
332
334
|
DetectionResult(
|
|
333
335
|
box=box.to_list(mode="xyxy"),
|
|
334
336
|
absolute_coords=box.absolute_coords,
|
|
335
337
|
class_id=99,
|
|
336
|
-
class_name=LayoutType.
|
|
338
|
+
class_name=LayoutType.COLUMN,
|
|
337
339
|
)
|
|
338
340
|
)
|
|
339
341
|
return column_detect_result_list
|
|
@@ -349,10 +351,11 @@ class TextLineGenerator:
|
|
|
349
351
|
self, make_sub_lines: bool, line_category_id: Union[int, str], paragraph_break: Optional[float] = None
|
|
350
352
|
):
|
|
351
353
|
"""
|
|
352
|
-
:param make_sub_lines: Whether to build sub lines from lines
|
|
354
|
+
:param make_sub_lines: Whether to build sub lines from lines.
|
|
353
355
|
:param line_category_id: category_id to give a text line
|
|
354
|
-
:param paragraph_break: threshold of two consecutive words. If distance is larger than threshold, two
|
|
355
|
-
will be built
|
|
356
|
+
:param paragraph_break: threshold of two consecutive words. If distance is larger than threshold, two sub-lines
|
|
357
|
+
will be built. We use relative coordinates to calculate the distance between two
|
|
358
|
+
consecutive words. A reasonable value is 0.035
|
|
356
359
|
"""
|
|
357
360
|
if make_sub_lines and paragraph_break is None:
|
|
358
361
|
raise ValueError("You must specify paragraph_break when setting make_sub_lines to True")
|
|
@@ -360,10 +363,10 @@ class TextLineGenerator:
|
|
|
360
363
|
self.make_sub_lines = make_sub_lines
|
|
361
364
|
self.paragraph_break = paragraph_break
|
|
362
365
|
|
|
363
|
-
def _make_detect_result(self, box: BoundingBox, relationships:
|
|
366
|
+
def _make_detect_result(self, box: BoundingBox, relationships: dict[str, list[str]]) -> DetectionResult:
|
|
364
367
|
return DetectionResult(
|
|
365
368
|
box=box.to_list(mode="xyxy"),
|
|
366
|
-
class_name=LayoutType.
|
|
369
|
+
class_name=LayoutType.LINE,
|
|
367
370
|
class_id=self.line_category_id,
|
|
368
371
|
absolute_coords=box.absolute_coords,
|
|
369
372
|
relationships=relationships,
|
|
@@ -375,6 +378,7 @@ class TextLineGenerator:
|
|
|
375
378
|
image_width: float,
|
|
376
379
|
image_height: float,
|
|
377
380
|
image_id: Optional[str] = None,
|
|
381
|
+
highest_level: bool = True,
|
|
378
382
|
) -> Sequence[DetectionResult]:
|
|
379
383
|
"""
|
|
380
384
|
Creating detection results of lines (or sub lines) from given word type `ImageAnnotation`.
|
|
@@ -392,6 +396,8 @@ class TextLineGenerator:
|
|
|
392
396
|
# list of (word index, text line, word annotation_id)
|
|
393
397
|
word_order_list = OrderGenerator.group_words_into_lines(word_anns, image_id)
|
|
394
398
|
number_rows = max(word[1] for word in word_order_list)
|
|
399
|
+
if number_rows == 1 and not highest_level:
|
|
400
|
+
return []
|
|
395
401
|
detection_result_list = []
|
|
396
402
|
for number_row in range(1, number_rows + 1):
|
|
397
403
|
# list of (word index, text line, word annotation_id) for text line equal to number_row
|
|
@@ -423,29 +429,139 @@ class TextLineGenerator:
|
|
|
423
429
|
if current_box.absolute_coords:
|
|
424
430
|
current_box = current_box.transform(image_width, image_height)
|
|
425
431
|
|
|
426
|
-
# If distance between boxes is lower than paragraph break, same sub
|
|
432
|
+
# If distance between boxes is lower than paragraph break, same sub-line
|
|
427
433
|
if current_box.ulx - prev_box.lrx < self.paragraph_break: # type: ignore
|
|
428
434
|
sub_line.append(ann)
|
|
429
435
|
sub_line_ann_ids.append(ann.annotation_id)
|
|
430
436
|
else:
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
+
# We need to iterate maybe more than one time, because sub-lines may have more than one line
|
|
438
|
+
# if having been split. Take, for example, a multi-column layout where a sub-line has
|
|
439
|
+
# two lines because of a column break and fonts twice as large as the other column.
|
|
440
|
+
detection_results = self.create_detection_result(
|
|
441
|
+
sub_line, image_width, image_height, image_id, False
|
|
442
|
+
)
|
|
443
|
+
if detection_results:
|
|
444
|
+
detection_result_list.extend(detection_results)
|
|
445
|
+
else:
|
|
446
|
+
boxes = [ann.get_bounding_box(image_id) for ann in sub_line]
|
|
447
|
+
merge_box = merge_boxes(*boxes)
|
|
448
|
+
detection_result = self._make_detect_result(merge_box, {"child": sub_line_ann_ids})
|
|
449
|
+
detection_result_list.append(detection_result)
|
|
450
|
+
sub_line = [ann]
|
|
451
|
+
sub_line_ann_ids = [ann.annotation_id]
|
|
437
452
|
|
|
438
453
|
if idx == len(anns_per_row) - 1:
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
454
|
+
detection_results = self.create_detection_result(
|
|
455
|
+
sub_line, image_width, image_height, image_id, False
|
|
456
|
+
)
|
|
457
|
+
if detection_results:
|
|
458
|
+
detection_result_list.extend(detection_results)
|
|
459
|
+
else:
|
|
460
|
+
boxes = [ann.get_bounding_box(image_id) for ann in sub_line]
|
|
461
|
+
merge_box = merge_boxes(*boxes)
|
|
462
|
+
detection_result = self._make_detect_result(merge_box, {"child": sub_line_ann_ids})
|
|
463
|
+
detection_result_list.append(detection_result)
|
|
443
464
|
|
|
444
465
|
return detection_result_list
|
|
445
466
|
|
|
446
467
|
|
|
468
|
+
class TextLineServiceMixin(PipelineComponent, ABC):
|
|
469
|
+
"""
|
|
470
|
+
This class is used to create text lines similar to TextOrderService.
|
|
471
|
+
It uses the logic of the TextOrderService but modifies it to suit its needs.
|
|
472
|
+
It specifically uses the _create_lines_for_words method and modifies the serve method.
|
|
473
|
+
"""
|
|
474
|
+
|
|
475
|
+
def __init__(
|
|
476
|
+
self,
|
|
477
|
+
name: str,
|
|
478
|
+
line_category_id: int = 1,
|
|
479
|
+
include_residual_text_container: bool = True,
|
|
480
|
+
paragraph_break: Optional[float] = None,
|
|
481
|
+
):
|
|
482
|
+
"""
|
|
483
|
+
Initialize the TextLineService with a line_category_id and a TextLineGenerator instance.
|
|
484
|
+
"""
|
|
485
|
+
self.line_category_id = line_category_id
|
|
486
|
+
self.include_residual_text_container = include_residual_text_container
|
|
487
|
+
self.text_line_generator = TextLineGenerator(
|
|
488
|
+
self.include_residual_text_container, self.line_category_id, paragraph_break
|
|
489
|
+
)
|
|
490
|
+
super().__init__(name)
|
|
491
|
+
|
|
492
|
+
def _create_lines_for_words(self, word_anns: Sequence[ImageAnnotation]) -> Sequence[ImageAnnotation]:
|
|
493
|
+
"""
|
|
494
|
+
This method creates lines for words using the TextLineGenerator instance.
|
|
495
|
+
"""
|
|
496
|
+
detection_result_list = self.text_line_generator.create_detection_result(
|
|
497
|
+
word_anns,
|
|
498
|
+
self.dp_manager.datapoint.width,
|
|
499
|
+
self.dp_manager.datapoint.height,
|
|
500
|
+
self.dp_manager.datapoint.image_id,
|
|
501
|
+
)
|
|
502
|
+
line_anns = []
|
|
503
|
+
for detect_result in detection_result_list:
|
|
504
|
+
ann_id = self.dp_manager.set_image_annotation(detect_result)
|
|
505
|
+
if ann_id:
|
|
506
|
+
line_ann = self.dp_manager.get_annotation(ann_id)
|
|
507
|
+
child_ann_id_list = detect_result.relationships["child"] # type: ignore
|
|
508
|
+
for child_ann_id in child_ann_id_list:
|
|
509
|
+
line_ann.dump_relationship(Relationships.CHILD, child_ann_id)
|
|
510
|
+
line_anns.append(line_ann)
|
|
511
|
+
return line_anns
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
class TextLineService(TextLineServiceMixin):
|
|
515
|
+
"""
|
|
516
|
+
Some OCR systems do not identify lines of text but only provide text boxes for words. This is not sufficient
|
|
517
|
+
for certain applications. This service determines rule-based text lines based on word boxes. One difficulty is
|
|
518
|
+
that text lines are not continuous but are interrupted, for example in multi-column layouts.
|
|
519
|
+
These interruptions are taken into account insofar as the gap between two words on almost the same page height
|
|
520
|
+
must not be too large.
|
|
521
|
+
|
|
522
|
+
The service constructs new ImageAnnotation of the category `LayoutType.line` and forms relations between the
|
|
523
|
+
text lines and the words contained in the text lines. The reading order is not arranged.
|
|
524
|
+
"""
|
|
525
|
+
|
|
526
|
+
def __init__(self, line_category_id: int = 1, paragraph_break: Optional[float] = None):
|
|
527
|
+
"""
|
|
528
|
+
Initialize `TextLineService`
|
|
529
|
+
|
|
530
|
+
:param line_category_id: category_id to give a text line
|
|
531
|
+
:param paragraph_break: threshold of two consecutive words. If distance is larger than threshold, two sublines
|
|
532
|
+
will be built
|
|
533
|
+
"""
|
|
534
|
+
super().__init__(
|
|
535
|
+
name="text_line",
|
|
536
|
+
line_category_id=line_category_id,
|
|
537
|
+
include_residual_text_container=True,
|
|
538
|
+
paragraph_break=paragraph_break,
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
def clone(self) -> TextLineService:
|
|
542
|
+
"""
|
|
543
|
+
This method returns a new instance of the class with the same configuration.
|
|
544
|
+
"""
|
|
545
|
+
return self.__class__(self.line_category_id, self.text_line_generator.paragraph_break)
|
|
546
|
+
|
|
547
|
+
def serve(self, dp: Image) -> None:
|
|
548
|
+
text_container_anns = dp.get_annotation(category_names=LayoutType.WORD)
|
|
549
|
+
self._create_lines_for_words(text_container_anns)
|
|
550
|
+
|
|
551
|
+
def get_meta_annotation(self) -> MetaAnnotation:
|
|
552
|
+
"""
|
|
553
|
+
This method returns metadata about the annotations created by this pipeline component.
|
|
554
|
+
"""
|
|
555
|
+
return MetaAnnotation(
|
|
556
|
+
image_annotations=(LayoutType.LINE,),
|
|
557
|
+
sub_categories={LayoutType.LINE: {Relationships.CHILD}},
|
|
558
|
+
relationships={},
|
|
559
|
+
summaries=(),
|
|
560
|
+
)
|
|
561
|
+
|
|
562
|
+
|
|
447
563
|
@pipeline_component_registry.register("TextOrderService")
|
|
448
|
-
class TextOrderService(
|
|
564
|
+
class TextOrderService(TextLineServiceMixin):
|
|
449
565
|
"""
|
|
450
566
|
Reading order of words within floating text blocks as well as reading order of blocks within simple text blocks.
|
|
451
567
|
To understand the difference between floating text blocks and simple text blocks consider a page containing an
|
|
@@ -470,7 +586,8 @@ class TextOrderService(PipelineComponent):
|
|
|
470
586
|
A category annotation per word is generated, which fixes the order per word in the block, as well as a category
|
|
471
587
|
annotation per block, which saves the reading order of the block per page.
|
|
472
588
|
|
|
473
|
-
The blocks are defined in `
|
|
589
|
+
The blocks are defined in `text_block_categories` and text blocks that should be considered when generating
|
|
590
|
+
narrative text must be added in `floating_text_block_categories`.
|
|
474
591
|
|
|
475
592
|
order = TextOrderService(text_container="word",
|
|
476
593
|
text_block_categories=["title", "text", "list", "cell",
|
|
@@ -517,23 +634,28 @@ class TextOrderService(PipelineComponent):
|
|
|
517
634
|
"""
|
|
518
635
|
self.text_container = get_type(text_container)
|
|
519
636
|
if isinstance(text_block_categories, (str, ObjectTypes)):
|
|
520
|
-
text_block_categories =
|
|
637
|
+
text_block_categories = (get_type(text_block_categories),)
|
|
521
638
|
if text_block_categories is None:
|
|
522
639
|
text_block_categories = IMAGE_DEFAULTS["text_block_categories"]
|
|
523
|
-
self.text_block_categories =
|
|
640
|
+
self.text_block_categories = tuple((get_type(category) for category in text_block_categories))
|
|
524
641
|
if isinstance(floating_text_block_categories, (str, ObjectTypes)):
|
|
525
|
-
floating_text_block_categories =
|
|
642
|
+
floating_text_block_categories = (get_type(floating_text_block_categories),)
|
|
526
643
|
if floating_text_block_categories is None:
|
|
527
644
|
floating_text_block_categories = IMAGE_DEFAULTS["floating_text_block_categories"]
|
|
528
|
-
self.floating_text_block_categories =
|
|
645
|
+
self.floating_text_block_categories = tuple((get_type(category) for category in floating_text_block_categories))
|
|
529
646
|
if include_residual_text_container:
|
|
530
|
-
self.floating_text_block_categories.
|
|
647
|
+
self.floating_text_block_categories = self.floating_text_block_categories + (LayoutType.LINE,)
|
|
531
648
|
self.include_residual_text_container = include_residual_text_container
|
|
532
649
|
self.order_generator = OrderGenerator(starting_point_tolerance, broken_line_tolerance, height_tolerance)
|
|
533
650
|
self.text_line_generator = TextLineGenerator(
|
|
534
651
|
self.include_residual_text_container, line_category_id, paragraph_break
|
|
535
652
|
)
|
|
536
|
-
super().__init__(
|
|
653
|
+
super().__init__(
|
|
654
|
+
name="text_order",
|
|
655
|
+
line_category_id=line_category_id,
|
|
656
|
+
include_residual_text_container=include_residual_text_container,
|
|
657
|
+
paragraph_break=paragraph_break,
|
|
658
|
+
)
|
|
537
659
|
self._init_sanity_checks()
|
|
538
660
|
|
|
539
661
|
def serve(self, dp: Image) -> None:
|
|
@@ -541,12 +663,12 @@ class TextOrderService(PipelineComponent):
|
|
|
541
663
|
text_block_anns = dp.get_annotation(category_names=self.text_block_categories)
|
|
542
664
|
if self.include_residual_text_container:
|
|
543
665
|
mapped_text_container_ids = list(
|
|
544
|
-
chain(*[text_block.get_relationship(Relationships.
|
|
666
|
+
chain(*[text_block.get_relationship(Relationships.CHILD) for text_block in text_block_anns])
|
|
545
667
|
)
|
|
546
668
|
residual_text_container_anns = [
|
|
547
669
|
ann for ann in text_container_anns if ann.annotation_id not in mapped_text_container_ids
|
|
548
670
|
]
|
|
549
|
-
if self.text_container == LayoutType.
|
|
671
|
+
if self.text_container == LayoutType.WORD:
|
|
550
672
|
text_block_anns.extend(self._create_lines_for_words(residual_text_container_anns))
|
|
551
673
|
else:
|
|
552
674
|
text_block_anns.extend(residual_text_container_anns)
|
|
@@ -564,27 +686,9 @@ class TextOrderService(PipelineComponent):
|
|
|
564
686
|
annotation_id = self.dp_manager.set_image_annotation(detect_result)
|
|
565
687
|
if annotation_id:
|
|
566
688
|
self.dp_manager.set_category_annotation(
|
|
567
|
-
Relationships.
|
|
689
|
+
Relationships.READING_ORDER, idx, Relationships.READING_ORDER, annotation_id
|
|
568
690
|
)
|
|
569
691
|
|
|
570
|
-
def _create_lines_for_words(self, word_anns: Sequence[ImageAnnotation]) -> Sequence[ImageAnnotation]:
|
|
571
|
-
detection_result_list = self.text_line_generator.create_detection_result(
|
|
572
|
-
word_anns,
|
|
573
|
-
self.dp_manager.datapoint.width,
|
|
574
|
-
self.dp_manager.datapoint.height,
|
|
575
|
-
self.dp_manager.datapoint.image_id,
|
|
576
|
-
)
|
|
577
|
-
line_anns = []
|
|
578
|
-
for detect_result in detection_result_list:
|
|
579
|
-
ann_id = self.dp_manager.set_image_annotation(detect_result)
|
|
580
|
-
if ann_id:
|
|
581
|
-
line_ann = self.dp_manager.get_annotation(ann_id)
|
|
582
|
-
child_ann_id_list = detect_result.relationships["child"] # type: ignore
|
|
583
|
-
for child_ann_id in child_ann_id_list:
|
|
584
|
-
line_ann.dump_relationship(Relationships.child, child_ann_id)
|
|
585
|
-
line_anns.append(line_ann)
|
|
586
|
-
return line_anns
|
|
587
|
-
|
|
588
692
|
def order_text_in_text_block(self, text_block_ann: ImageAnnotation) -> None:
|
|
589
693
|
"""
|
|
590
694
|
Order text within a text block. It will take all child-like text containers (determined by a
|
|
@@ -592,11 +696,11 @@ class TextOrderService(PipelineComponent):
|
|
|
592
696
|
|
|
593
697
|
:param text_block_ann: text block annotation (category one of `text_block_categories`).
|
|
594
698
|
"""
|
|
595
|
-
text_container_ids = text_block_ann.get_relationship(Relationships.
|
|
699
|
+
text_container_ids = text_block_ann.get_relationship(Relationships.CHILD)
|
|
596
700
|
text_container_ann = self.dp_manager.datapoint.get_annotation(
|
|
597
701
|
annotation_ids=text_container_ids, category_names=self.text_container
|
|
598
702
|
)
|
|
599
|
-
if self.text_container == LayoutType.
|
|
703
|
+
if self.text_container == LayoutType.WORD:
|
|
600
704
|
word_order_list = self.order_generator.group_words_into_lines(
|
|
601
705
|
text_container_ann, self.dp_manager.datapoint.image_id
|
|
602
706
|
)
|
|
@@ -606,10 +710,10 @@ class TextOrderService(PipelineComponent):
|
|
|
606
710
|
)
|
|
607
711
|
for word_order in word_order_list:
|
|
608
712
|
self.dp_manager.set_category_annotation(
|
|
609
|
-
Relationships.
|
|
713
|
+
Relationships.READING_ORDER, word_order[0], Relationships.READING_ORDER, word_order[2]
|
|
610
714
|
)
|
|
611
715
|
|
|
612
|
-
def order_blocks(self, text_block_anns:
|
|
716
|
+
def order_blocks(self, text_block_anns: list[ImageAnnotation]) -> None:
|
|
613
717
|
"""
|
|
614
718
|
Ordering of text blocks. Will use the internal order generator.
|
|
615
719
|
|
|
@@ -620,42 +724,40 @@ class TextOrderService(PipelineComponent):
|
|
|
620
724
|
)
|
|
621
725
|
for word_order in block_order_list:
|
|
622
726
|
self.dp_manager.set_category_annotation(
|
|
623
|
-
Relationships.
|
|
727
|
+
Relationships.READING_ORDER, word_order[0], Relationships.READING_ORDER, word_order[1]
|
|
624
728
|
)
|
|
625
729
|
|
|
626
730
|
def _init_sanity_checks(self) -> None:
|
|
627
|
-
assert self.text_container in (LayoutType.
|
|
628
|
-
f"text_container must be either {LayoutType.
|
|
731
|
+
assert self.text_container in (LayoutType.WORD, LayoutType.LINE), (
|
|
732
|
+
f"text_container must be either {LayoutType.WORD} or " f"{LayoutType.LINE}"
|
|
629
733
|
)
|
|
630
734
|
add_category = []
|
|
631
735
|
if self.include_residual_text_container:
|
|
632
|
-
add_category.append(LayoutType.
|
|
736
|
+
add_category.append(LayoutType.LINE)
|
|
633
737
|
|
|
634
738
|
assert set(self.floating_text_block_categories) <= set(
|
|
635
|
-
self.text_block_categories + add_category
|
|
739
|
+
self.text_block_categories + tuple(add_category)
|
|
636
740
|
), "floating_text_block_categories must be a subset of text_block_categories"
|
|
637
741
|
|
|
638
|
-
def get_meta_annotation(self) ->
|
|
742
|
+
def get_meta_annotation(self) -> MetaAnnotation:
|
|
639
743
|
add_category = [self.text_container]
|
|
640
|
-
image_annotations = []
|
|
641
|
-
if self.include_residual_text_container and self.text_container == LayoutType.
|
|
642
|
-
add_category.append(LayoutType.
|
|
643
|
-
image_annotations.append(LayoutType.
|
|
744
|
+
image_annotations: list[ObjectTypes] = []
|
|
745
|
+
if self.include_residual_text_container and self.text_container == LayoutType.WORD:
|
|
746
|
+
add_category.append(LayoutType.LINE)
|
|
747
|
+
image_annotations.append(LayoutType.LINE)
|
|
644
748
|
anns_with_reading_order = list(copy(self.floating_text_block_categories)) + add_category
|
|
645
|
-
return
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
("summaries", []),
|
|
651
|
-
]
|
|
749
|
+
return MetaAnnotation(
|
|
750
|
+
image_annotations=tuple(image_annotations),
|
|
751
|
+
sub_categories={category: {Relationships.READING_ORDER} for category in anns_with_reading_order},
|
|
752
|
+
relationships={},
|
|
753
|
+
summaries=(),
|
|
652
754
|
)
|
|
653
755
|
|
|
654
|
-
def clone(self) ->
|
|
756
|
+
def clone(self) -> TextOrderService:
|
|
655
757
|
return self.__class__(
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
758
|
+
self.text_container,
|
|
759
|
+
self.text_block_categories,
|
|
760
|
+
self.floating_text_block_categories,
|
|
659
761
|
self.include_residual_text_container,
|
|
660
762
|
self.order_generator.starting_point_tolerance,
|
|
661
763
|
self.order_generator.broken_line_tolerance,
|
|
@@ -663,3 +765,6 @@ class TextOrderService(PipelineComponent):
|
|
|
663
765
|
self.text_line_generator.paragraph_break,
|
|
664
766
|
self.text_line_generator.line_category_id,
|
|
665
767
|
)
|
|
768
|
+
|
|
769
|
+
def clear_predictor(self) -> None:
|
|
770
|
+
pass
|