paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
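Before reviewing the file-level changes below, it can help to confirm which of the two versions is actually installed. The following is a minimal sketch, not part of the diff or the package: it assumes only the Python standard library (3.8+) and that the installed distribution is named "paddlex".

# Minimal sketch: check the installed paddlex distribution version.
# Assumptions: Python 3.8+ (importlib.metadata in the stdlib), distribution name "paddlex".
from importlib.metadata import version, PackageNotFoundError

try:
    installed = version("paddlex")
except PackageNotFoundError:
    installed = None

print("installed paddlex:", installed)
print("matches this diff's target version:", installed == "3.0.2")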
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +29 -73
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/ts/funcs.py +19 -8
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +8 -2
- paddlex/inference/models/formula_recognition/processors.py +90 -77
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/models/text_recognition/result.py +1 -1
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +21 -18
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_app.py +46 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +30 -16
- paddlex/inference/utils/hpi_model_info_collection.json +666 -162
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/misc.py +20 -0
- paddlex/inference/utils/mkldnn_blocklist.py +59 -0
- paddlex/inference/utils/official_models.py +140 -5
- paddlex/inference/utils/pp_option.py +74 -9
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +8 -5
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +3 -3
- paddlex/utils/device.py +5 -13
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +11 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- paddlex/utils/subclass_register.py +2 -2
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
- {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/pipelines/layout_parsing/utils.py:

@@ -14,21 +14,20 @@
 
 __all__ = [
     "get_sub_regions_ocr_res",
-    "get_layout_ordering",
-    "get_single_block_parsing_res",
     "get_show_color",
     "sorted_layout_boxes",
 ]
 
 import re
 from copy import deepcopy
-from typing import …
+from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 from PIL import Image
 
-from …
+from ..components import convert_points_to_boxes
 from ..ocr.result import OCRResult
+from .setting import BLOCK_LABEL_MAP, REGION_SETTINGS
 
 
 def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
@@ -172,808 +171,167 @@ def sorted_layout_boxes(res, w):
|
|
172
171
|
return new_res
|
173
172
|
|
174
173
|
|
175
|
-
def
|
176
|
-
bbox1:
|
177
|
-
bbox2:
|
174
|
+
def calculate_projection_overlap_ratio(
|
175
|
+
bbox1: List[float],
|
176
|
+
bbox2: List[float],
|
177
|
+
direction: str = "horizontal",
|
178
|
+
mode="union",
|
178
179
|
) -> float:
|
179
180
|
"""
|
180
|
-
Calculate the
|
181
|
-
to the area of the smaller bounding box.
|
182
|
-
|
183
|
-
Args:
|
184
|
-
bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
|
185
|
-
bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
|
186
|
-
|
187
|
-
Returns:
|
188
|
-
float: The ratio of the overlap area to the area of the smaller bounding box.
|
189
|
-
"""
|
190
|
-
bbox1 = list(map(int, bbox1))
|
191
|
-
bbox2 = list(map(int, bbox2))
|
192
|
-
|
193
|
-
x_left = max(bbox1[0], bbox2[0])
|
194
|
-
y_top = max(bbox1[1], bbox2[1])
|
195
|
-
x_right = min(bbox1[2], bbox2[2])
|
196
|
-
y_bottom = min(bbox1[3], bbox2[3])
|
197
|
-
|
198
|
-
if x_right <= x_left or y_bottom <= y_top:
|
199
|
-
return 0.0
|
200
|
-
|
201
|
-
intersection_area = (x_right - x_left) * (y_bottom - y_top)
|
202
|
-
area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
|
203
|
-
area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
|
204
|
-
min_box_area = min(area_bbox1, area_bbox2)
|
205
|
-
|
206
|
-
if min_box_area <= 0:
|
207
|
-
return 0.0
|
208
|
-
|
209
|
-
return intersection_area / min_box_area
|
210
|
-
|
211
|
-
|
212
|
-
def _whether_y_overlap_exceeds_threshold(
|
213
|
-
bbox1: Union[list, tuple],
|
214
|
-
bbox2: Union[list, tuple],
|
215
|
-
overlap_ratio_threshold: float = 0.6,
|
216
|
-
) -> bool:
|
217
|
-
"""
|
218
|
-
Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
|
219
|
-
|
220
|
-
Args:
|
221
|
-
bbox1 (list or tuple): The first bounding box defined as (left, top, right, bottom).
|
222
|
-
bbox2 (list or tuple): The second bounding box defined as (left, top, right, bottom).
|
223
|
-
overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
|
224
|
-
Defaults to 0.6.
|
225
|
-
|
226
|
-
Returns:
|
227
|
-
bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
|
228
|
-
exceeds the overlap_ratio_threshold, otherwise False.
|
229
|
-
"""
|
230
|
-
_, y1_0, _, y1_1 = bbox1
|
231
|
-
_, y2_0, _, y2_1 = bbox2
|
232
|
-
|
233
|
-
overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
|
234
|
-
min_height = min(y1_1 - y1_0, y2_1 - y2_0)
|
235
|
-
|
236
|
-
return (overlap / min_height) > overlap_ratio_threshold
|
237
|
-
|
238
|
-
|
239
|
-
def _adjust_span_text(span: List[str], prepend: bool = False, append: bool = False):
|
240
|
-
"""
|
241
|
-
Adjust the text of a span by prepending or appending a newline.
|
242
|
-
|
243
|
-
Args:
|
244
|
-
span (list): A list where the second element is the text of the span.
|
245
|
-
prepend (bool): If True, prepend a newline to the text.
|
246
|
-
append (bool): If True, append a newline to the text.
|
247
|
-
|
248
|
-
Returns:
|
249
|
-
None: The function modifies the span in place.
|
250
|
-
"""
|
251
|
-
if prepend:
|
252
|
-
span[1] = "\n" + span[1]
|
253
|
-
if append:
|
254
|
-
span[1] = span[1] + "\n"
|
255
|
-
return span
|
256
|
-
|
257
|
-
|
258
|
-
def _format_line(
|
259
|
-
line: List[List[Union[List[int], str]]],
|
260
|
-
layout_min: int,
|
261
|
-
layout_max: int,
|
262
|
-
is_reference: bool = False,
|
263
|
-
) -> None:
|
264
|
-
"""
|
265
|
-
Format a line of text spans based on layout constraints.
|
181
|
+
Calculate the IoU of lines between two bounding boxes.
|
266
182
|
|
267
183
|
Args:
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.
|
184
|
+
bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
|
185
|
+
bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
|
186
|
+
direction (str): direction of the projection, "horizontal" or "vertical".
|
272
187
|
|
273
188
|
Returns:
|
274
|
-
|
275
|
-
"""
|
276
|
-
first_span = line[0]
|
277
|
-
end_span = line[-1]
|
278
|
-
|
279
|
-
if not is_reference:
|
280
|
-
if first_span[0][0] - layout_min > 10:
|
281
|
-
first_span = _adjust_span_text(first_span, prepend=True)
|
282
|
-
if layout_max - end_span[0][2] > 10:
|
283
|
-
end_span = _adjust_span_text(end_span, append=True)
|
284
|
-
else:
|
285
|
-
if first_span[0][0] - layout_min < 5:
|
286
|
-
first_span = _adjust_span_text(first_span, prepend=True)
|
287
|
-
if layout_max - end_span[0][2] > 20:
|
288
|
-
end_span = _adjust_span_text(end_span, append=True)
|
289
|
-
|
290
|
-
line[0] = first_span
|
291
|
-
line[-1] = end_span
|
292
|
-
|
293
|
-
return line
|
294
|
-
|
295
|
-
|
296
|
-
def split_boxes_if_x_contained(boxes, offset=1e-5):
|
189
|
+
float: Line overlap ratio. Returns 0 if there is no overlap.
|
297
190
|
"""
|
298
|
-
|
299
|
-
|
191
|
+
start_index, end_index = 1, 3
|
192
|
+
if direction == "horizontal":
|
193
|
+
start_index, end_index = 0, 2
|
300
194
|
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
"""
|
195
|
+
intersection_start = max(bbox1[start_index], bbox2[start_index])
|
196
|
+
intersection_end = min(bbox1[end_index], bbox2[end_index])
|
197
|
+
overlap = intersection_end - intersection_start
|
198
|
+
if overlap <= 0:
|
199
|
+
return 0
|
307
200
|
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
box_b = boxes[j]
|
321
|
-
if is_x_contained(box_a, box_b):
|
322
|
-
is_split = True
|
323
|
-
# Split box_a based on the x-coordinates of box_b
|
324
|
-
if box_a[0][0] < box_b[0][0]:
|
325
|
-
w = box_b[0][0] - offset - box_a[0][0]
|
326
|
-
if w > 1:
|
327
|
-
new_boxes.append(
|
328
|
-
[
|
329
|
-
np.array(
|
330
|
-
[
|
331
|
-
box_a[0][0],
|
332
|
-
box_a[0][1],
|
333
|
-
box_b[0][0] - offset,
|
334
|
-
box_a[0][3],
|
335
|
-
]
|
336
|
-
),
|
337
|
-
box_a[1],
|
338
|
-
box_a[2],
|
339
|
-
]
|
340
|
-
)
|
341
|
-
if box_a[0][2] > box_b[0][2]:
|
342
|
-
w = box_a[0][2] - box_b[0][2] + offset
|
343
|
-
if w > 1:
|
344
|
-
box_a = [
|
345
|
-
np.array(
|
346
|
-
[
|
347
|
-
box_b[0][2] + offset,
|
348
|
-
box_a[0][1],
|
349
|
-
box_a[0][2],
|
350
|
-
box_a[0][3],
|
351
|
-
]
|
352
|
-
),
|
353
|
-
box_a[1],
|
354
|
-
box_a[2],
|
355
|
-
]
|
356
|
-
if j == len(boxes) - 1 and is_split:
|
357
|
-
new_boxes.append(box_a)
|
358
|
-
if not is_split:
|
359
|
-
new_boxes.append(box_a)
|
360
|
-
|
361
|
-
return new_boxes
|
362
|
-
|
363
|
-
|
364
|
-
def _sort_line_by_x_projection(
|
365
|
-
input_img: np.ndarray,
|
366
|
-
general_ocr_pipeline: Any,
|
367
|
-
line: List[List[Union[List[int], str]]],
|
368
|
-
) -> None:
|
369
|
-
"""
|
370
|
-
Sort a line of text spans based on their vertical position within the layout bounding box.
|
371
|
-
|
372
|
-
Args:
|
373
|
-
input_img (ndarray): The input image used for OCR.
|
374
|
-
general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
|
375
|
-
line (list): A list of spans, where each span is a list containing a bounding box and text.
|
376
|
-
|
377
|
-
Returns:
|
378
|
-
list: The sorted line of text spans.
|
379
|
-
"""
|
380
|
-
splited_boxes = split_boxes_if_x_contained(line)
|
381
|
-
splited_lines = []
|
382
|
-
if len(line) != len(splited_boxes):
|
383
|
-
splited_boxes.sort(key=lambda span: span[0][0])
|
384
|
-
text_rec_model = general_ocr_pipeline.text_rec_model
|
385
|
-
for span in splited_boxes:
|
386
|
-
if span[2] == "text":
|
387
|
-
crop_img = input_img[
|
388
|
-
int(span[0][1]) : int(span[0][3]),
|
389
|
-
int(span[0][0]) : int(span[0][2]),
|
390
|
-
]
|
391
|
-
span[1] = next(text_rec_model([crop_img]))["rec_text"]
|
392
|
-
splited_lines.append(span)
|
201
|
+
if mode == "union":
|
202
|
+
ref_width = max(bbox1[end_index], bbox2[end_index]) - min(
|
203
|
+
bbox1[start_index], bbox2[start_index]
|
204
|
+
)
|
205
|
+
elif mode == "small":
|
206
|
+
ref_width = min(
|
207
|
+
bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
|
208
|
+
)
|
209
|
+
elif mode == "large":
|
210
|
+
ref_width = max(
|
211
|
+
bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
|
212
|
+
)
|
393
213
|
else:
|
394
|
-
|
214
|
+
raise ValueError(
|
215
|
+
f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
|
216
|
+
)
|
395
217
|
|
396
|
-
return
|
218
|
+
return overlap / ref_width if ref_width > 0 else 0.0
|
397
219
|
|
398
220
|
|
399
|
-
def
|
400
|
-
|
401
|
-
|
402
|
-
label: Any,
|
403
|
-
block_bbox: Tuple[int, int, int, int],
|
404
|
-
ocr_res: Dict[str, List[Any]],
|
405
|
-
line_height_iou_threshold: float = 0.7,
|
406
|
-
) -> Dict[str, List[Any]]:
|
221
|
+
def calculate_overlap_ratio(
|
222
|
+
bbox1: Union[list, tuple], bbox2: Union[list, tuple], mode="union"
|
223
|
+
) -> float:
|
407
224
|
"""
|
408
|
-
|
225
|
+
Calculate the overlap ratio between two bounding boxes.
|
409
226
|
|
410
227
|
Args:
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
relevant for other parts of the calling context.
|
415
|
-
block_bbox (Tuple[int, int, int, int]): A tuple representing the layout bounding box, defined as
|
416
|
-
(left, top, right, bottom).
|
417
|
-
ocr_res (Dict[str, List[Any]]): A dictionary containing OCR results with the following keys:
|
418
|
-
- "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
|
419
|
-
- "rec_texts": A corresponding list of recognized text strings for each box.
|
420
|
-
line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
|
421
|
-
the same line based on their vertical overlap. Defaults to 0.7.
|
228
|
+
bbox1 (list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
|
229
|
+
bbox2 (list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
|
230
|
+
mode (str): The mode of calculation, either 'union', 'small', or 'large'.
|
422
231
|
|
423
232
|
Returns:
|
424
|
-
|
425
|
-
and grouped into lines and blocks.
|
233
|
+
float: The overlap ratio value between the two bounding boxes
|
426
234
|
"""
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
boxes = ocr_res["boxes"]
|
432
|
-
rec_texts = ocr_res["rec_texts"]
|
433
|
-
rec_labels = ocr_res["rec_labels"]
|
235
|
+
x_min_inter = max(bbox1[0], bbox2[0])
|
236
|
+
y_min_inter = max(bbox1[1], bbox2[1])
|
237
|
+
x_max_inter = min(bbox1[2], bbox2[2])
|
238
|
+
y_max_inter = min(bbox1[3], bbox2[3])
|
434
239
|
|
435
|
-
|
436
|
-
|
437
|
-
inline_x_max = max([box[2] for box in boxes])
|
240
|
+
inter_width = max(0, x_max_inter - x_min_inter)
|
241
|
+
inter_height = max(0, y_max_inter - y_min_inter)
|
438
242
|
|
439
|
-
|
243
|
+
inter_area = float(inter_width) * float(inter_height)
|
440
244
|
|
441
|
-
|
442
|
-
|
245
|
+
bbox1_area = caculate_bbox_area(bbox1)
|
246
|
+
bbox2_area = caculate_bbox_area(bbox2)
|
443
247
|
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
if _whether_y_overlap_exceeds_threshold(
|
451
|
-
(0, current_y0, 0, current_y1),
|
452
|
-
(0, y0, 0, y1),
|
453
|
-
line_height_iou_threshold,
|
454
|
-
):
|
455
|
-
current_line.append(span)
|
456
|
-
current_y0 = min(current_y0, y0)
|
457
|
-
current_y1 = max(current_y1, y1)
|
458
|
-
else:
|
459
|
-
lines.append(current_line)
|
460
|
-
current_line = [span]
|
461
|
-
current_y0, current_y1 = y0, y1
|
462
|
-
|
463
|
-
if current_line:
|
464
|
-
lines.append(current_line)
|
465
|
-
|
466
|
-
new_lines = []
|
467
|
-
for line in lines:
|
468
|
-
line.sort(key=lambda span: span[0][0])
|
469
|
-
|
470
|
-
ocr_labels = [span[2] for span in line]
|
471
|
-
if "formula" in ocr_labels:
|
472
|
-
line = _sort_line_by_x_projection(input_img, general_ocr_pipeline, line)
|
473
|
-
if label == "reference":
|
474
|
-
line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
|
475
|
-
elif label != "content":
|
476
|
-
line = _format_line(line, x_min, x_max)
|
477
|
-
new_lines.append(line)
|
478
|
-
|
479
|
-
ocr_res["boxes"] = [span[0] for line in new_lines for span in line]
|
480
|
-
if label == "content":
|
481
|
-
ocr_res["rec_texts"] = [
|
482
|
-
"".join(f"{span[1]} " for span in line).rstrip() for line in new_lines
|
483
|
-
]
|
248
|
+
if mode == "union":
|
249
|
+
ref_area = bbox1_area + bbox2_area - inter_area
|
250
|
+
elif mode == "small":
|
251
|
+
ref_area = min(bbox1_area, bbox2_area)
|
252
|
+
elif mode == "large":
|
253
|
+
ref_area = max(bbox1_area, bbox2_area)
|
484
254
|
else:
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
def _process_text(input_text: str) -> str:
|
490
|
-
"""
|
491
|
-
Process the input text to handle spaces.
|
492
|
-
|
493
|
-
The function removes multiple consecutive spaces between Chinese characters and ensures that
|
494
|
-
only a single space is retained between Chinese and non-Chinese characters.
|
495
|
-
|
496
|
-
Args:
|
497
|
-
input_text (str): The text to be processed.
|
498
|
-
|
499
|
-
Returns:
|
500
|
-
str: The processed text with properly formatted spaces.
|
501
|
-
"""
|
502
|
-
|
503
|
-
def handle_spaces_(text: str) -> str:
|
504
|
-
"""
|
505
|
-
Handle spaces in the text by removing multiple consecutive spaces and inserting a single space
|
506
|
-
between Chinese and non-Chinese characters.
|
507
|
-
|
508
|
-
Args:
|
509
|
-
text (str): The text to handle spaces for.
|
510
|
-
|
511
|
-
Returns:
|
512
|
-
str: The text with properly formatted spaces.
|
513
|
-
"""
|
514
|
-
spaces = re.finditer(r"\s+", text)
|
515
|
-
processed_text = list(text)
|
516
|
-
|
517
|
-
for space in reversed(list(spaces)):
|
518
|
-
start, end = space.span()
|
519
|
-
prev_char = processed_text[start - 1] if start > 0 else ""
|
520
|
-
next_char = processed_text[end] if end < len(processed_text) else ""
|
521
|
-
|
522
|
-
is_prev_chinese = (
|
523
|
-
re.match(r"[\u4e00-\u9fff]", prev_char) if prev_char else False
|
524
|
-
)
|
525
|
-
is_next_chinese = (
|
526
|
-
re.match(r"[\u4e00-\u9fff]", next_char) if next_char else False
|
527
|
-
)
|
528
|
-
|
529
|
-
if is_prev_chinese and is_next_chinese:
|
530
|
-
processed_text[start:end] = []
|
531
|
-
else:
|
532
|
-
processed_text[start:end] = [" "]
|
533
|
-
|
534
|
-
return "".join(processed_text)
|
535
|
-
|
536
|
-
text_without_spaces = handle_spaces_(input_text)
|
537
|
-
|
538
|
-
final_text = re.sub(r"\s+", " ", text_without_spaces).strip()
|
539
|
-
return final_text
|
540
|
-
|
541
|
-
|
542
|
-
def get_single_block_parsing_res(
|
543
|
-
general_ocr_pipeline: Any,
|
544
|
-
overall_ocr_res: OCRResult,
|
545
|
-
layout_det_res: DetResult,
|
546
|
-
table_res_list: list,
|
547
|
-
seal_res_list: list,
|
548
|
-
) -> OCRResult:
|
549
|
-
"""
|
550
|
-
Extract structured information from OCR and layout detection results.
|
551
|
-
|
552
|
-
Args:
|
553
|
-
overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
|
554
|
-
- "input_img": The image on which OCR was performed.
|
555
|
-
- "dt_boxes": A list of detected text box coordinates.
|
556
|
-
- "rec_texts": A list of recognized text corresponding to the detected boxes.
|
557
|
-
|
558
|
-
layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
|
559
|
-
- "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
|
560
|
-
|
561
|
-
table_res_list (list): A list of table detection results, where each item is a dictionary containing:
|
562
|
-
- "block_bbox": The bounding box of the table layout.
|
563
|
-
- "pred_html": The predicted HTML representation of the table.
|
564
|
-
|
565
|
-
seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
|
566
|
-
|
567
|
-
Returns:
|
568
|
-
list: A list of structured boxes where each item is a dictionary containing:
|
569
|
-
- "block_label": The label of the content (e.g., 'table', 'chart', 'image').
|
570
|
-
- The label as a key with either table HTML or image data and text.
|
571
|
-
- "block_bbox": The coordinates of the layout box.
|
572
|
-
"""
|
573
|
-
|
574
|
-
single_block_layout_parsing_res = []
|
575
|
-
input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]
|
576
|
-
seal_index = 0
|
577
|
-
with_doc_title = False
|
578
|
-
max_block_area = 0.0
|
579
|
-
paragraph_title_indexs = []
|
580
|
-
|
581
|
-
layout_det_res_list, _ = _remove_overlap_blocks(
|
582
|
-
deepcopy(layout_det_res["boxes"]),
|
583
|
-
threshold=0.5,
|
584
|
-
smaller=True,
|
585
|
-
)
|
586
|
-
|
587
|
-
for box_idx, box_info in enumerate(layout_det_res_list):
|
588
|
-
block_bbox = box_info["coordinate"]
|
589
|
-
label = box_info["label"]
|
590
|
-
rec_res = {"boxes": [], "rec_texts": [], "rec_labels": [], "flag": False}
|
591
|
-
seg_start_coordinate = float("inf")
|
592
|
-
seg_end_coordinate = float("-inf")
|
593
|
-
num_of_lines = 1
|
594
|
-
|
595
|
-
if label == "doc_title":
|
596
|
-
with_doc_title = True
|
597
|
-
elif label == "paragraph_title":
|
598
|
-
paragraph_title_indexs.append(box_idx)
|
599
|
-
|
600
|
-
block_area = (block_bbox[2] - block_bbox[0]) * (block_bbox[3] - block_bbox[1])
|
601
|
-
max_block_area = max(max_block_area, block_area)
|
602
|
-
|
603
|
-
if label == "table":
|
604
|
-
for table_res in table_res_list:
|
605
|
-
if len(table_res["cell_box_list"]) == 0:
|
606
|
-
continue
|
607
|
-
if (
|
608
|
-
_calculate_overlap_area_div_minbox_area_ratio(
|
609
|
-
block_bbox, table_res["cell_box_list"][0]
|
610
|
-
)
|
611
|
-
> 0.5
|
612
|
-
):
|
613
|
-
single_block_layout_parsing_res.append(
|
614
|
-
{
|
615
|
-
"block_label": label,
|
616
|
-
"block_content": table_res["pred_html"],
|
617
|
-
"block_bbox": block_bbox,
|
618
|
-
},
|
619
|
-
)
|
620
|
-
break
|
621
|
-
elif label == "seal":
|
622
|
-
if len(seal_res_list) > 0:
|
623
|
-
single_block_layout_parsing_res.append(
|
624
|
-
{
|
625
|
-
"block_label": label,
|
626
|
-
"block_content": _process_text(
|
627
|
-
", ".join(seal_res_list[seal_index]["rec_texts"])
|
628
|
-
),
|
629
|
-
"block_bbox": block_bbox,
|
630
|
-
},
|
631
|
-
)
|
632
|
-
seal_index += 1
|
633
|
-
else:
|
634
|
-
overall_text_boxes = overall_ocr_res["rec_boxes"]
|
635
|
-
for box_no in range(len(overall_text_boxes)):
|
636
|
-
if (
|
637
|
-
_calculate_overlap_area_div_minbox_area_ratio(
|
638
|
-
block_bbox, overall_text_boxes[box_no]
|
639
|
-
)
|
640
|
-
> 0.5
|
641
|
-
):
|
642
|
-
rec_res["boxes"].append(overall_text_boxes[box_no])
|
643
|
-
rec_res["rec_texts"].append(
|
644
|
-
overall_ocr_res["rec_texts"][box_no],
|
645
|
-
)
|
646
|
-
rec_res["rec_labels"].append(
|
647
|
-
overall_ocr_res["rec_labels"][box_no],
|
648
|
-
)
|
649
|
-
rec_res["flag"] = True
|
650
|
-
|
651
|
-
if rec_res["flag"]:
|
652
|
-
rec_res, num_of_lines = _sort_ocr_res_by_y_projection(
|
653
|
-
input_img, general_ocr_pipeline, label, block_bbox, rec_res, 0.7
|
654
|
-
)
|
655
|
-
seg_start_coordinate = rec_res["boxes"][0][0]
|
656
|
-
seg_end_coordinate = rec_res["boxes"][-1][2]
|
657
|
-
if label == "formula":
|
658
|
-
rec_res["rec_texts"] = [
|
659
|
-
rec_res_text.replace("$", "")
|
660
|
-
for rec_res_text in rec_res["rec_texts"]
|
661
|
-
]
|
662
|
-
|
663
|
-
if label in ["chart", "image"]:
|
664
|
-
x_min, y_min, x_max, y_max = list(map(int, block_bbox))
|
665
|
-
img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
|
666
|
-
img = Image.fromarray(input_img[y_min:y_max, x_min:x_max, ::-1])
|
667
|
-
single_block_layout_parsing_res.append(
|
668
|
-
{
|
669
|
-
"block_label": label,
|
670
|
-
"block_content": _process_text("".join(rec_res["rec_texts"])),
|
671
|
-
"block_image": {img_path: img},
|
672
|
-
"block_bbox": block_bbox,
|
673
|
-
},
|
674
|
-
)
|
675
|
-
else:
|
676
|
-
if label in ["doc_title"]:
|
677
|
-
content = " ".join(rec_res["rec_texts"])
|
678
|
-
elif label in ["content"]:
|
679
|
-
content = "\n".join(rec_res["rec_texts"])
|
680
|
-
else:
|
681
|
-
content = "".join(rec_res["rec_texts"])
|
682
|
-
if label != "reference":
|
683
|
-
content = _process_text(content)
|
684
|
-
single_block_layout_parsing_res.append(
|
685
|
-
{
|
686
|
-
"block_label": label,
|
687
|
-
"block_content": content,
|
688
|
-
"block_bbox": block_bbox,
|
689
|
-
"seg_start_coordinate": seg_start_coordinate,
|
690
|
-
"seg_end_coordinate": seg_end_coordinate,
|
691
|
-
"num_of_lines": num_of_lines,
|
692
|
-
"block_area": block_area,
|
693
|
-
},
|
694
|
-
)
|
695
|
-
|
696
|
-
if (
|
697
|
-
not with_doc_title
|
698
|
-
and len(paragraph_title_indexs) == 1
|
699
|
-
and single_block_layout_parsing_res[paragraph_title_indexs[0]].get(
|
700
|
-
"block_area", 0
|
255
|
+
raise ValueError(
|
256
|
+
f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
|
701
257
|
)
|
702
|
-
> max_block_area * 0.3
|
703
|
-
):
|
704
|
-
single_block_layout_parsing_res[paragraph_title_indexs[0]][
|
705
|
-
"block_label"
|
706
|
-
] = "doc_title"
|
707
|
-
|
708
|
-
if len(layout_det_res_list) == 0:
|
709
|
-
for ocr_rec_box, ocr_rec_text in zip(
|
710
|
-
overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"]
|
711
|
-
):
|
712
|
-
single_block_layout_parsing_res.append(
|
713
|
-
{
|
714
|
-
"block_label": "text",
|
715
|
-
"block_content": ocr_rec_text,
|
716
|
-
"block_bbox": ocr_rec_box,
|
717
|
-
"seg_start_coordinate": ocr_rec_box[0],
|
718
|
-
"seg_end_coordinate": ocr_rec_box[2],
|
719
|
-
},
|
720
|
-
)
|
721
258
|
|
722
|
-
|
723
|
-
|
724
|
-
no_mask_labels=[
|
725
|
-
"text",
|
726
|
-
"formula",
|
727
|
-
"algorithm",
|
728
|
-
"reference",
|
729
|
-
"content",
|
730
|
-
"abstract",
|
731
|
-
],
|
732
|
-
)
|
259
|
+
if ref_area == 0:
|
260
|
+
return 0.0
|
733
261
|
|
734
|
-
return
|
262
|
+
return inter_area / ref_area
|
735
263
|
|
736
264
|
|
737
|
-
def
|
265
|
+
def calculate_minimum_enclosing_bbox(bboxes):
|
738
266
|
"""
|
739
|
-
|
267
|
+
Calculate the minimum enclosing bounding box for a list of bounding boxes.
|
740
268
|
|
741
269
|
Args:
|
742
|
-
|
743
|
-
axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
|
270
|
+
bboxes (list): A list of bounding boxes represented as lists of four integers [x1, y1, x2, y2].
|
744
271
|
|
745
272
|
Returns:
|
746
|
-
|
273
|
+
list: The minimum enclosing bounding box represented as a list of four integers [x1, y1, x2, y2].
|
747
274
|
"""
|
748
|
-
|
749
|
-
|
750
|
-
projection = np.zeros(max_length, dtype=int)
|
275
|
+
if not bboxes:
|
276
|
+
raise ValueError("The list of bounding boxes is empty.")
|
751
277
|
|
752
|
-
#
|
753
|
-
|
754
|
-
projection[start:end] += 1
|
278
|
+
# Convert the list of bounding boxes to a NumPy array
|
279
|
+
bboxes_array = np.array(bboxes)
|
755
280
|
|
756
|
-
|
281
|
+
# Compute the minimum and maximum values along the respective axes
|
282
|
+
min_x = np.min(bboxes_array[:, 0])
|
283
|
+
min_y = np.min(bboxes_array[:, 1])
|
284
|
+
max_x = np.max(bboxes_array[:, 2])
|
285
|
+
max_y = np.max(bboxes_array[:, 3])
|
757
286
|
|
287
|
+
# Return the minimum enclosing bounding box
|
288
|
+
return np.array([min_x, min_y, max_x, max_y])
|
758
289
|
|
759
|
-
def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
|
760
|
-
"""
|
761
|
-
Split the projection profile into segments based on specified thresholds.
|
762
290
|
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
min_gap: Minimum gap width to consider a separation between segments.
|
291
|
+
def is_english_letter(char):
|
292
|
+
"""check if the char is english letter"""
|
293
|
+
return bool(re.match(r"^[A-Za-z]$", char))
|
767
294
|
|
768
|
-
Returns:
|
769
|
-
A tuple of start and end indices for each segment that meets the criteria.
|
770
|
-
"""
|
771
|
-
# Identify indices where the projection exceeds the minimum value
|
772
|
-
significant_indices = np.where(arr_values > min_value)[0]
|
773
|
-
if not len(significant_indices):
|
774
|
-
return
|
775
|
-
|
776
|
-
# Calculate gaps between significant indices
|
777
|
-
index_diffs = significant_indices[1:] - significant_indices[:-1]
|
778
|
-
gap_indices = np.where(index_diffs > min_gap)[0]
|
779
|
-
|
780
|
-
# Determine start and end indices of segments
|
781
|
-
segment_starts = np.insert(
|
782
|
-
significant_indices[gap_indices + 1],
|
783
|
-
0,
|
784
|
-
significant_indices[0],
|
785
|
-
)
|
786
|
-
segment_ends = np.append(
|
787
|
-
significant_indices[gap_indices],
|
788
|
-
significant_indices[-1] + 1,
|
789
|
-
)
|
790
295
|
|
791
|
-
|
296
|
+
def is_numeric(char):
|
297
|
+
"""check if the char is numeric"""
|
298
|
+
return bool(re.match(r"^[\d]+$", char))
|
792
299
|
|
793
300
|
|
794
|
-
def
|
795
|
-
boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
|
796
|
-
):
|
301
|
+
def is_non_breaking_punctuation(char):
|
797
302
|
"""
|
798
|
-
|
303
|
+
check if the char is non-breaking punctuation
|
799
304
|
|
800
305
|
Args:
|
801
|
-
|
802
|
-
indices: List of indices indicating the original position of boxes.
|
803
|
-
res: List to store indices of the final segmented bounding boxes.
|
804
|
-
min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
|
306
|
+
char (str): character to check
|
805
307
|
|
806
308
|
Returns:
|
807
|
-
|
808
|
-
"""
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
if not y_intervals:
|
823
|
-
return
|
824
|
-
|
825
|
-
# Process each segment defined by Y-axis projection
|
826
|
-
for y_start, y_end in zip(*y_intervals):
|
827
|
-
# Select boxes within the current y interval
|
828
|
-
y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
|
829
|
-
y_sorted_boxes[:, 1] < y_end
|
830
|
-
)
|
831
|
-
y_boxes_chunk = y_sorted_boxes[y_interval_indices]
|
832
|
-
y_indices_chunk = y_sorted_indices[y_interval_indices]
|
833
|
-
|
834
|
-
# Sort by x_min for X-axis projection
|
835
|
-
x_sorted_indices = y_boxes_chunk[:, 0].argsort()
|
836
|
-
x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
|
837
|
-
x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
|
838
|
-
|
839
|
-
# Perform X-axis projection
|
840
|
-
x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
|
841
|
-
x_intervals = _split_projection_profile(x_projection, 0, min_gap)
|
842
|
-
|
843
|
-
if not x_intervals:
|
844
|
-
continue
|
845
|
-
|
846
|
-
# If X-axis cannot be further segmented, add current indices to results
|
847
|
-
if len(x_intervals[0]) == 1:
|
848
|
-
res.extend(x_sorted_indices_chunk)
|
849
|
-
continue
|
850
|
-
|
851
|
-
# Recursively process each segment defined by X-axis projection
|
852
|
-
for x_start, x_end in zip(*x_intervals):
|
853
|
-
x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
|
854
|
-
x_sorted_boxes_chunk[:, 0] < x_end
|
855
|
-
)
|
856
|
-
_recursive_yx_cut(
|
857
|
-
x_sorted_boxes_chunk[x_interval_indices],
|
858
|
-
x_sorted_indices_chunk[x_interval_indices],
|
859
|
-
res,
|
860
|
-
)
|
861
|
-
|
862
|
-
|
863
|
-
def _recursive_xy_cut(
|
864
|
-
boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
|
865
|
-
):
|
866
|
-
"""
|
867
|
-
Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
|
868
|
-
|
869
|
-
Args:
|
870
|
-
boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
|
871
|
-
indices: A list of indices representing the position of boxes in the original data.
|
872
|
-
res: A list to store indices of bounding boxes that meet the criteria.
|
873
|
-
min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
|
874
|
-
|
875
|
-
Returns:
|
876
|
-
None: This function modifies the `res` list in place.
|
877
|
-
"""
|
878
|
-
# Ensure boxes and indices have the same length
|
879
|
-
assert len(boxes) == len(
|
880
|
-
indices
|
881
|
-
), "The length of boxes and indices must be the same."
|
882
|
-
|
883
|
-
# Sort by x_min to prepare for X-axis projection
|
884
|
-
x_sorted_indices = boxes[:, 0].argsort()
|
885
|
-
x_sorted_boxes = boxes[x_sorted_indices]
|
886
|
-
x_sorted_indices = np.array(indices)[x_sorted_indices]
|
887
|
-
|
888
|
-
# Perform X-axis projection
|
889
|
-
x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
|
890
|
-
x_intervals = _split_projection_profile(x_projection, 0, 1)
|
891
|
-
|
892
|
-
if not x_intervals:
|
893
|
-
return
|
894
|
-
|
895
|
-
# Process each segment defined by X-axis projection
|
896
|
-
for x_start, x_end in zip(*x_intervals):
|
897
|
-
# Select boxes within the current x interval
|
898
|
-
x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
|
899
|
-
x_sorted_boxes[:, 0] < x_end
|
900
|
-
)
|
901
|
-
x_boxes_chunk = x_sorted_boxes[x_interval_indices]
|
902
|
-
x_indices_chunk = x_sorted_indices[x_interval_indices]
|
903
|
-
|
904
|
-
# Sort selected boxes by y_min to prepare for Y-axis projection
|
905
|
-
y_sorted_indices = x_boxes_chunk[:, 1].argsort()
|
906
|
-
y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
|
907
|
-
y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
|
908
|
-
|
909
|
-
# Perform Y-axis projection
|
910
|
-
y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
|
911
|
-
y_intervals = _split_projection_profile(y_projection, 0, min_gap)
|
912
|
-
|
913
|
-
if not y_intervals:
|
914
|
-
continue
|
915
|
-
|
916
|
-
# If Y-axis cannot be further segmented, add current indices to results
|
917
|
-
if len(y_intervals[0]) == 1:
|
918
|
-
res.extend(y_sorted_indices_chunk)
|
919
|
-
continue
|
920
|
-
|
921
|
-
# Recursively process each segment defined by Y-axis projection
|
922
|
-
for y_start, y_end in zip(*y_intervals):
|
923
|
-
y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
|
924
|
-
y_sorted_boxes_chunk[:, 1] < y_end
|
925
|
-
)
|
926
|
-
_recursive_xy_cut(
|
927
|
-
y_sorted_boxes_chunk[y_interval_indices],
|
928
|
-
y_sorted_indices_chunk[y_interval_indices],
|
929
|
-
res,
|
930
|
-
)
|
931
|
-
|
932
|
-
|
933
|
-
def sort_by_xycut(
|
934
|
-
block_bboxes: Union[np.ndarray, List[List[int]]],
|
935
|
-
direction: int = 0,
|
936
|
-
min_gap: int = 1,
|
937
|
-
) -> List[int]:
|
938
|
-
"""
|
939
|
-
Sort bounding boxes using recursive XY cut method based on the specified direction.
|
940
|
-
|
941
|
-
Args:
|
942
|
-
block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
|
943
|
-
where each box is represented as
|
944
|
-
[x_min, y_min, x_max, y_max].
|
945
|
-
direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
|
946
|
-
Defaults to 0.
|
947
|
-
min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
|
309
|
+
bool: True if the char is non-breaking punctuation
|
310
|
+
"""
|
311
|
+
non_breaking_punctuations = {
|
312
|
+
",",
|
313
|
+
",",
|
314
|
+
"、",
|
315
|
+
";",
|
316
|
+
";",
|
317
|
+
":",
|
318
|
+
":",
|
319
|
+
"-",
|
320
|
+
"'",
|
321
|
+
'"',
|
322
|
+
"“",
|
323
|
+
}
|
948
324
|
|
949
|
-
|
950
|
-
List[int]: A list of indices representing the order of sorted bounding boxes.
|
951
|
-
"""
|
952
|
-
block_bboxes = np.asarray(block_bboxes).astype(int)
|
953
|
-
res = []
|
954
|
-
if direction == 1:
|
955
|
-
_recursive_yx_cut(
|
956
|
-
block_bboxes,
|
957
|
-
np.arange(len(block_bboxes)).tolist(),
|
958
|
-
res,
|
959
|
-
min_gap,
|
960
|
-
)
|
961
|
-
else:
|
962
|
-
_recursive_xy_cut(
|
963
|
-
block_bboxes,
|
964
|
-
np.arange(len(block_bboxes)).tolist(),
|
965
|
-
res,
|
966
|
-
min_gap,
|
967
|
-
)
|
968
|
-
return res
|
325
|
+
return char in non_breaking_punctuations
|
969
326
|
|
970
327
|
|
971
328
|
def gather_imgs(original_img, layout_det_objs):
|
972
329
|
imgs_in_doc = []
|
973
330
|
for det_obj in layout_det_objs:
|
974
|
-
if det_obj["label"] in
|
331
|
+
if det_obj["label"] in BLOCK_LABEL_MAP["image_labels"]:
|
332
|
+
label = det_obj["label"]
|
975
333
|
x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"]))
|
976
|
-
img_path = f"imgs/
|
334
|
+
img_path = f"imgs/img_in_{label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
|
977
335
|
img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1])
|
978
336
|
imgs_in_doc.append(
|
979
337
|
{
|
@@ -1007,10 +365,10 @@ def _get_minbox_if_overlap_by_ratio(
|
|
1007
365
|
The selected bounding box or None if the overlap ratio is not exceeded.
|
1008
366
|
"""
|
1009
367
|
# Calculate the areas of both bounding boxes
|
1010
|
-
area1 = (bbox1
|
1011
|
-
area2 = (bbox2
|
368
|
+
area1 = caculate_bbox_area(bbox1)
|
369
|
+
area2 = caculate_bbox_area(bbox2)
|
1012
370
|
# Calculate the overlap ratio using a helper function
|
1013
|
-
overlap_ratio =
|
371
|
+
overlap_ratio = calculate_overlap_ratio(bbox1, bbox2, mode="small")
|
1014
372
|
# Check if the overlap ratio exceeds the threshold
|
1015
373
|
if overlap_ratio > ratio:
|
1016
374
|
if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
|
@@ -1020,7 +378,7 @@ def _get_minbox_if_overlap_by_ratio(
|
|
1020
378
|
return None
|
1021
379
|
|
1022
380
|
|
1023
|
-
def
|
381
|
+
def remove_overlap_blocks(
|
1024
382
|
blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
|
1025
383
|
) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
|
1026
384
|
"""
|
@@ -1035,13 +393,13 @@ def _remove_overlap_blocks(
|
|
1035
393
|
Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
|
1036
394
|
A tuple containing the updated list of blocks and a list of dropped blocks.
|
1037
395
|
"""
|
1038
|
-
dropped_blocks = []
|
1039
396
|
dropped_indexes = set()
|
1040
|
-
|
397
|
+
blocks = deepcopy(blocks)
|
398
|
+
overlap_image_blocks = []
|
1041
399
|
# Iterate over each pair of blocks to find overlaps
|
1042
|
-
for i, block1 in enumerate(blocks):
|
1043
|
-
for j in range(i + 1, len(blocks)):
|
1044
|
-
block2 = blocks[j]
|
400
|
+
for i, block1 in enumerate(blocks["boxes"]):
|
401
|
+
for j in range(i + 1, len(blocks["boxes"])):
|
402
|
+
block2 = blocks["boxes"][j]
|
1045
403
|
# Skip blocks that are already marked for removal
|
1046
404
|
if i in dropped_indexes or j in dropped_indexes:
|
1047
405
|
continue
|
@@ -1053,1332 +411,337 @@ def _remove_overlap_blocks(
|
|
1053
411
|
smaller=smaller,
|
1054
412
|
)
|
1055
413
|
if overlap_box_index is not None:
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
414
|
+
is_block1_image = block1["label"] == "image"
|
415
|
+
is_block2_image = block2["label"] == "image"
|
416
|
+
|
417
|
+
if is_block1_image != is_block2_image:
|
418
|
+
# 如果只有一个块在视觉标签中,删除在视觉标签中的那个块
|
419
|
+
drop_index = i if is_block1_image else j
|
420
|
+
overlap_image_blocks.append(blocks["boxes"][drop_index])
|
1059
421
|
else:
|
1060
|
-
|
422
|
+
# 如果两个块都在或都不在视觉标签中,根据 overlap_box_index 决定删除哪个块
|
423
|
+
drop_index = i if overlap_box_index == 1 else j
|
424
|
+
|
1061
425
|
dropped_indexes.add(drop_index)
|
1062
426
|
|
1063
427
|
# Remove marked blocks from the original list
|
1064
428
|
for index in sorted(dropped_indexes, reverse=True):
|
1065
|
-
|
1066
|
-
del blocks[index]
|
1067
|
-
|
1068
|
-
return blocks, dropped_blocks
|
1069
|
-
|
1070
|
-
|
1071
|
-
def _get_text_median_width(blocks: List[Dict[str, any]]) -> float:
|
1072
|
-
"""
|
1073
|
-
Calculate the median width of blocks labeled as "text".
|
1074
|
-
|
1075
|
-
Args:
|
1076
|
-
blocks (List[Dict[str, any]]): List of block dictionaries, each containing a 'block_bbox' and 'label'.
|
1077
|
-
|
1078
|
-
Returns:
|
1079
|
-
float: The median width of text blocks, or infinity if no text blocks are found.
|
1080
|
-
"""
|
1081
|
-
widths = [
|
1082
|
-
block["block_bbox"][2] - block["block_bbox"][0]
|
1083
|
-
for block in blocks
|
1084
|
-
if block.get("block_label") == "text"
|
1085
|
-
]
|
1086
|
-
return np.median(widths) if widths else float("inf")
|
1087
|
-
|
1088
|
-
|
1089
|
-
def _get_layout_property(
|
1090
|
-
blocks: List[Dict[str, any]],
|
1091
|
-
median_width: float,
|
1092
|
-
no_mask_labels: List[str],
|
1093
|
-
threshold: float = 0.8,
|
1094
|
-
) -> Tuple[List[Dict[str, any]], bool]:
|
1095
|
-
"""
|
1096
|
-
Determine the layout (single or double column) of text blocks.
|
1097
|
-
|
1098
|
-
Args:
|
1099
|
-
blocks (List[Dict[str, any]]): List of block dictionaries containing 'label' and 'block_bbox'.
|
1100
|
-
median_width (float): Median width of text blocks.
|
1101
|
-
no_mask_labels (List[str]): Labels of blocks to be considered for layout analysis.
|
1102
|
-
threshold (float): Threshold for determining layout overlap.
|
429
|
+
del blocks["boxes"][index]
|
1103
430
|
|
1104
|
-
|
1105
|
-
Tuple[List[Dict[str, any]], bool]: Updated list of blocks with layout information and a boolean
|
1106
|
-
indicating if the double layout area is greater than the single layout area.
|
1107
|
-
"""
|
1108
|
-
blocks.sort(
|
1109
|
-
key=lambda x: (
|
1110
|
-
x["block_bbox"][0],
|
1111
|
-
(x["block_bbox"][2] - x["block_bbox"][0]),
|
1112
|
-
),
|
1113
|
-
)
|
1114
|
-
check_single_layout = {}
|
1115
|
-
page_min_x, page_max_x = float("inf"), 0
|
1116
|
-
double_label_area = 0
|
1117
|
-
single_label_area = 0
|
1118
|
-
|
1119
|
-
for i, block in enumerate(blocks):
|
1120
|
-
page_min_x = min(page_min_x, block["block_bbox"][0])
|
1121
|
-
page_max_x = max(page_max_x, block["block_bbox"][2])
|
1122
|
-
page_width = page_max_x - page_min_x
|
1123
|
-
|
1124
|
-
for i, block in enumerate(blocks):
|
1125
|
-
if block["block_label"] not in no_mask_labels:
|
1126
|
-
continue
|
1127
|
-
|
1128
|
-
x_min_i, _, x_max_i, _ = block["block_bbox"]
|
1129
|
-
layout_length = x_max_i - x_min_i
|
1130
|
-
cover_count, cover_with_threshold_count = 0, 0
|
1131
|
-
match_block_with_threshold_indexes = []
|
1132
|
-
|
1133
|
-
for j, other_block in enumerate(blocks):
|
1134
|
-
if i == j or other_block["block_label"] not in no_mask_labels:
|
1135
|
-
continue
|
1136
|
-
|
1137
|
-
x_min_j, _, x_max_j, _ = other_block["block_bbox"]
|
1138
|
-
x_match_min, x_match_max = max(
|
1139
|
-
x_min_i,
|
1140
|
-
x_min_j,
|
1141
|
-
), min(x_max_i, x_max_j)
|
1142
|
-
match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)
|
1143
|
-
|
1144
|
-
if match_block_iou > 0:
|
1145
|
-
cover_count += 1
|
1146
|
-
if match_block_iou > threshold:
|
1147
|
-
cover_with_threshold_count += 1
|
1148
|
-
match_block_with_threshold_indexes.append(
|
1149
|
-
(j, match_block_iou),
|
1150
|
-
)
|
1151
|
-
x_min_i = x_match_max
|
1152
|
-
if x_min_i >= x_max_i:
|
1153
|
-
break
|
1154
|
-
|
1155
|
-
if (
|
1156
|
-
layout_length > median_width * 1.3
|
1157
|
-
and (cover_with_threshold_count >= 2 or cover_count >= 2)
|
1158
|
-
) or layout_length > 0.6 * page_width:
|
1159
|
-
# if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
|
1160
|
-
block["layout"] = "double"
|
1161
|
-
double_label_area += (block["block_bbox"][2] - block["block_bbox"][0]) * (
|
1162
|
-
block["block_bbox"][3] - block["block_bbox"][1]
|
1163
|
-
)
|
1164
|
-
else:
|
1165
|
-
block["layout"] = "single"
|
1166
|
-
check_single_layout[i] = match_block_with_threshold_indexes
|
1167
|
-
|
1168
|
-
# Check single-layout block
|
1169
|
-
for i, single_layout in check_single_layout.items():
|
1170
|
-
if single_layout:
|
1171
|
-
index, match_iou = single_layout[-1]
|
1172
|
-
if match_iou > 0.9 and blocks[index]["layout"] == "double":
|
1173
|
-
blocks[i]["layout"] = "double"
|
1174
|
-
double_label_area += (
|
1175
|
-
blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
|
1176
|
-
) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
|
1177
|
-
else:
|
1178
|
-
single_label_area += (
|
1179
|
-
blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
|
1180
|
-
) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
|
431
|
+
return blocks
|
1181
432
|
|
1182
|
-
return blocks, (double_label_area > single_label_area)
|
1183
433
|
|
1184
|
-
|
1185
|
-
-def _get_bbox_direction(input_bbox: List[float], ratio: float = 1.0) -> bool:
+def get_bbox_intersection(bbox1, bbox2, return_format="bbox"):
     """
-
+    Compute the intersection of two bounding boxes, supporting both 4-coordinate and 8-coordinate formats.

     Args:
-
-
+        bbox1 (tuple): The first bounding box, either in 4-coordinate format (x_min, y_min, x_max, y_max)
+            or 8-coordinate format (x1, y1, x2, y2, x3, y3, x4, y4).
+        bbox2 (tuple): The second bounding box in the same format as bbox1.
+        return_format (str): The format of the output intersection, either 'bbox' or 'poly'.

     Returns:
-
-    """
- … (removed lines 1196-1219 are not shown in this diff rendering)
+        tuple or None: The intersection bounding box in the specified format, or None if there is no intersection.
+    """
+    bbox1 = np.array(bbox1)
+    bbox2 = np.array(bbox2)
+    # Convert both bounding boxes to rectangles
+    rect1 = bbox1 if len(bbox1.shape) == 1 else convert_points_to_boxes([bbox1])[0]
+    rect2 = bbox2 if len(bbox2.shape) == 1 else convert_points_to_boxes([bbox2])[0]
+
+    # Calculate the intersection rectangle
+
+    x_min_inter = max(rect1[0], rect2[0])
+    y_min_inter = max(rect1[1], rect2[1])
+    x_max_inter = min(rect1[2], rect2[2])
+    y_max_inter = min(rect1[3], rect2[3])
+
+    # Check if there is an intersection
+    if x_min_inter >= x_max_inter or y_min_inter >= y_max_inter:
+        return None
+
+    if return_format == "bbox":
+        return np.array([x_min_inter, y_min_inter, x_max_inter, y_max_inter])
+    elif return_format == "poly":
+        return np.array(
+            [
+                [x_min_inter, y_min_inter],
+                [x_max_inter, y_min_inter],
+                [x_max_inter, y_max_inter],
+                [x_min_inter, y_max_inter],
+            ],
+            dtype=np.int16,
+        )
     else:
-
-        y_match_max = min(input_bbox[3], match_bbox[3])
-        overlap = max(0, y_match_max - y_match_min)
-        input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
+        raise ValueError("return_format must be either 'bbox' or 'poly'.")

-    return overlap / input_width if input_width > 0 else 0.0

-
-
-
-
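For orientation, the added get_bbox_intersection helper reduces 4-point or 8-point inputs to axis-aligned rectangles and intersects them. Below is an illustrative, self-contained sketch of the 4-coordinate case only; it is not part of the package, and the function name is made up for the example.

import numpy as np

def bbox_intersection_4pt(bbox1, bbox2):
    # Intersect two axis-aligned boxes given as (x_min, y_min, x_max, y_max).
    r1, r2 = np.asarray(bbox1, dtype=float), np.asarray(bbox2, dtype=float)
    x_min, y_min = max(r1[0], r2[0]), max(r1[1], r2[1])
    x_max, y_max = min(r1[2], r2[2]), min(r1[3], r2[3])
    if x_min >= x_max or y_min >= y_max:
        return None  # boxes do not overlap
    return np.array([x_min, y_min, x_max, y_max])

assert bbox_intersection_4pt((0, 0, 10, 10), (5, 5, 20, 20)).tolist() == [5.0, 5.0, 10.0, 10.0]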
+def shrink_supplement_region_bbox(
+    supplement_region_bbox,
+    ref_region_bbox,
+    image_width,
+    image_height,
+    block_idxes_set,
+    block_bboxes,
+) -> List:
     """
-
+    Shrink the supplement region bbox according to the reference region bbox and match the block bboxes.

     Args:
-
-
+        supplement_region_bbox (list): The supplement region bbox.
+        ref_region_bbox (list): The reference region bbox.
+        image_width (int): The width of the image.
+        image_height (int): The height of the image.
+        block_idxes_set (set): The indexes of the blocks that intersect with the region bbox.
+        block_bboxes (dict): The dictionary of block bboxes.

     Returns:
|
1240
|
-
|
1241
|
-
|
1242
|
-
|
1243
|
-
|
1244
|
-
|
1245
|
-
|
1246
|
-
|
1247
|
-
|
1248
|
-
|
1249
|
-
|
1250
|
-
|
1251
|
-
|
1252
|
-
|
1253
|
-
|
1254
|
-
|
1255
|
-
|
1256
|
-
|
1257
|
-
|
1258
|
-
|
1259
|
-
|
1260
|
-
|
1261
|
-
|
1262
|
-
|
1263
|
-
|
1264
|
-
max_y = max(block["block_bbox"][3] for block in blocks)
|
1265
|
-
region_bbox = (min_x, min_y, max_x, max_y)
|
1266
|
-
region_x_center = (region_bbox[0] + region_bbox[2]) / 2
|
1267
|
-
region_y_center = (region_bbox[1] + region_bbox[3]) / 2
|
1268
|
-
region_width = region_bbox[2] - region_bbox[0]
|
1269
|
-
region_height = region_bbox[3] - region_bbox[1]
|
1270
|
-
|
1271
|
-
pre_cuts = {}
|
1272
|
-
|
1273
|
-
for i, block1 in enumerate(blocks):
|
1274
|
-
block1.setdefault("title_text", [])
|
1275
|
-
block1.setdefault("sub_title", [])
|
1276
|
-
block1.setdefault("vision_footnote", [])
|
1277
|
-
block1.setdefault("sub_label", block1["block_label"])
|
1278
|
-
|
1279
|
-
if block1["block_label"] not in all_labels:
|
1280
|
-
continue
|
1281
|
-
|
1282
|
-
bbox1 = block1["block_bbox"]
|
1283
|
-
x1, y1, x2, y2 = bbox1
|
1284
|
-
is_horizontal_1 = _get_bbox_direction(block1["block_bbox"])
|
1285
|
-
left_up_title_text_distance = float("inf")
|
1286
|
-
left_up_title_text_index = -1
|
1287
|
-
left_up_title_text_direction = None
|
1288
|
-
right_down_title_text_distance = float("inf")
|
1289
|
-
right_down_title_text_index = -1
|
1290
|
-
right_down_title_text_direction = None
|
1291
|
-
|
1292
|
-
# pre-cuts
|
1293
|
-
# Condition 1: Length is greater than half of the layout region
|
1294
|
-
if is_horizontal_1:
|
1295
|
-
block_length = x2 - x1
|
1296
|
-
required_length = region_width / 2
|
1297
|
-
else:
|
1298
|
-
block_length = y2 - y1
|
1299
|
-
required_length = region_height / 2
|
1300
|
-
if block1["block_label"] in special_pre_cut_labels:
|
1301
|
-
length_condition = True
|
1302
|
-
else:
|
1303
|
-
length_condition = block_length > required_length
|
1304
|
-
|
1305
|
-
# Condition 2: Centered check (must be within ±20 in both horizontal and vertical directions)
|
1306
|
-
block_x_center = (x1 + x2) / 2
|
1307
|
-
block_y_center = (y1 + y2) / 2
|
1308
|
-
tolerance_len = block_length // 5
|
1309
|
-
if block1["block_label"] in special_pre_cut_labels:
|
1310
|
-
tolerance_len = block_length // 10
|
1311
|
-
if is_horizontal_1:
|
1312
|
-
is_centered = abs(block_x_center - region_x_center) <= tolerance_len
|
1313
|
-
else:
|
1314
|
-
is_centered = abs(block_y_center - region_y_center) <= tolerance_len
|
1315
|
-
|
1316
|
-
# Condition 3: Check for surrounding text
|
1317
|
-
has_left_text = False
|
1318
|
-
has_right_text = False
|
1319
|
-
has_above_text = False
|
1320
|
-
has_below_text = False
|
1321
|
-
for block2 in blocks:
|
1322
|
-
if block2["block_label"] != "text":
|
1323
|
-
continue
|
1324
|
-
bbox2 = block2["block_bbox"]
|
1325
|
-
x1_2, y1_2, x2_2, y2_2 = bbox2
|
1326
|
-
if is_horizontal_1:
|
1327
|
-
if x2_2 <= x1 and not (y2_2 <= y1 or y1_2 >= y2):
|
1328
|
-
has_left_text = True
|
1329
|
-
if x1_2 >= x2 and not (y2_2 <= y1 or y1_2 >= y2):
|
1330
|
-
has_right_text = True
|
1331
|
-
else:
|
1332
|
-
if y2_2 <= y1 and not (x2_2 <= x1 or x1_2 >= x2):
|
1333
|
-
has_above_text = True
|
1334
|
-
if y1_2 >= y2 and not (x2_2 <= x1 or x1_2 >= x2):
|
1335
|
-
has_below_text = True
|
1336
|
-
|
1337
|
-
if (is_horizontal_1 and has_left_text and has_right_text) or (
|
1338
|
-
not is_horizontal_1 and has_above_text and has_below_text
|
1339
|
-
):
|
1340
|
-
break
|
1341
|
-
|
1342
|
-
no_text_on_sides = (
|
1343
|
-
not (has_left_text or has_right_text)
|
1344
|
-
if is_horizontal_1
|
1345
|
-
else not (has_above_text or has_below_text)
|
1346
|
-
)
|
1347
|
-
|
1348
|
-
# Add coordinates if all conditions are met
|
1349
|
-
if is_centered and length_condition and no_text_on_sides:
|
1350
|
-
if is_horizontal_1:
|
1351
|
-
pre_cuts.setdefault("y", []).append(y1)
|
1352
|
-
else:
|
1353
|
-
pre_cuts.setdefault("x", []).append(x1)
|
1354
|
-
|
1355
|
-
for j, block2 in enumerate(blocks):
|
1356
|
-
if i == j:
|
1357
|
-
continue
|
1358
|
-
|
1359
|
-
bbox2 = block2["block_bbox"]
|
1360
|
-
x1_prime, y1_prime, x2_prime, y2_prime = bbox2
|
1361
|
-
is_horizontal_2 = _get_bbox_direction(bbox2)
|
1362
|
-
match_block_iou = _get_projection_iou(
|
1363
|
-
bbox2,
|
1364
|
-
bbox1,
|
1365
|
-
-                is_horizontal_1,
+        list: The new region bbox and the matched block idxes.
+    """
+    x1, y1, x2, y2 = supplement_region_bbox
+    x1_prime, y1_prime, x2_prime, y2_prime = ref_region_bbox
+    index_conversion_map = {0: 2, 1: 3, 2: 0, 3: 1}
+    edge_distance_list = [
+        (x1_prime - x1) / image_width,
+        (y1_prime - y1) / image_height,
+        (x2 - x2_prime) / image_width,
+        (y2 - y2_prime) / image_height,
+    ]
+    edge_distance_list_tmp = deepcopy(edge_distance_list)
+    min_distance = min(edge_distance_list)
+    src_index = index_conversion_map[edge_distance_list.index(min_distance)]
+    if len(block_idxes_set) == 0:
+        return supplement_region_bbox, []
+    for _ in range(3):
+        dst_index = index_conversion_map[src_index]
+        tmp_region_bbox = supplement_region_bbox[:]
+        tmp_region_bbox[dst_index] = ref_region_bbox[src_index]
+        iner_block_idxes, split_block_idxes = [], []
+        for block_idx in block_idxes_set:
+            overlap_ratio = calculate_overlap_ratio(
+                tmp_region_bbox, block_bboxes[block_idx], mode="small"
             )
|
1367
|
-
|
1368
|
-
|
1369
|
-
if is_horizontal:
|
1370
|
-
if is_left_up:
|
1371
|
-
return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
|
1372
|
-
else:
|
1373
|
-
return (y1_prime - y2 + 2) // 5 + x1_prime / 5000
|
1374
|
-
|
1375
|
-
else:
|
1376
|
-
if is_left_up:
|
1377
|
-
return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
|
1378
|
-
else:
|
1379
|
-
return (x1_prime - x2 + 2) // 5 + y1_prime / 5000
|
1380
|
-
|
1381
|
-
block_iou_threshold = 0.1
|
1382
|
-
if block1["block_label"] in sub_title_labels:
|
1383
|
-
block_iou_threshold = 0.5
|
1384
|
-
|
1385
|
-
if is_horizontal_1:
|
1386
|
-
if match_block_iou >= block_iou_threshold:
|
1387
|
-
left_up_distance = distance_(True, True)
|
1388
|
-
right_down_distance = distance_(True, False)
|
1389
|
-
if (
|
1390
|
-
y2_prime <= y1
|
1391
|
-
and left_up_distance <= left_up_title_text_distance
|
1392
|
-
):
|
1393
|
-
left_up_title_text_distance = left_up_distance
|
1394
|
-
left_up_title_text_index = j
|
1395
|
-
left_up_title_text_direction = is_horizontal_2
|
1396
|
-
elif (
|
1397
|
-
y1_prime > y2
|
1398
|
-
and right_down_distance < right_down_title_text_distance
|
1399
|
-
):
|
1400
|
-
right_down_title_text_distance = right_down_distance
|
1401
|
-
right_down_title_text_index = j
|
1402
|
-
right_down_title_text_direction = is_horizontal_2
|
1403
|
-
else:
|
1404
|
-
if match_block_iou >= block_iou_threshold:
|
1405
|
-
left_up_distance = distance_(False, True)
|
1406
|
-
right_down_distance = distance_(False, False)
|
1407
|
-
if (
|
1408
|
-
x2_prime <= x1
|
1409
|
-
and left_up_distance <= left_up_title_text_distance
|
1410
|
-
):
|
1411
|
-
left_up_title_text_distance = left_up_distance
|
1412
|
-
left_up_title_text_index = j
|
1413
|
-
left_up_title_text_direction = is_horizontal_2
|
1414
|
-
elif (
|
1415
|
-
x1_prime > x2
|
1416
|
-
and right_down_distance < right_down_title_text_distance
|
1417
|
-
):
|
1418
|
-
right_down_title_text_distance = right_down_distance
|
1419
|
-
right_down_title_text_index = j
|
1420
|
-
right_down_title_text_direction = is_horizontal_2
|
1421
|
-
|
1422
|
-
height = bbox1[3] - bbox1[1]
|
1423
|
-
width = bbox1[2] - bbox1[0]
|
1424
|
-
title_text_weight = [0.8, 0.8]
|
1425
|
-
|
1426
|
-
title_text, sub_title, vision_footnote = [], [], []
|
1427
|
-
|
1428
|
-
def get_sub_category_(
|
1429
|
-
title_text_direction,
|
1430
|
-
title_text_index,
|
1431
|
-
label,
|
1432
|
-
is_left_up=True,
|
1433
|
-
):
|
1434
|
-
direction_ = [1, 3] if is_left_up else [2, 4]
|
1435
|
-
if (
|
1436
|
-
title_text_direction == is_horizontal_1
|
1437
|
-
and title_text_index != -1
|
1438
|
-
and (label == "text" or label == "paragraph_title")
|
525
|
+
if overlap_ratio > REGION_SETTINGS.get(
|
526
|
+
"match_block_overlap_ratio_threshold", 0.8
|
1439
527
|
):
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1443
|
-
width1 = bbox2[2] - bbox2[0]
|
1444
|
-
if label == "text":
|
1445
|
-
if (
|
1446
|
-
_nearest_edge_distance(bbox1, bbox2)[0] <= 15
|
1447
|
-
and block1["block_label"] in vision_labels
|
1448
|
-
and width1 < width
|
1449
|
-
and height1 < 0.5 * height
|
1450
|
-
):
|
1451
|
-
blocks[title_text_index]["sub_label"] = "vision_footnote"
|
1452
|
-
vision_footnote.append(bbox2)
|
1453
|
-
elif (
|
1454
|
-
height1 < height * title_text_weight[0]
|
1455
|
-
and (width1 < width or width1 > 1.5 * width)
|
1456
|
-
and block1["block_label"] in title_labels
|
1457
|
-
):
|
1458
|
-
blocks[title_text_index]["sub_label"] = "title_text"
|
1459
|
-
title_text.append((direction_[0], bbox2))
|
1460
|
-
elif (
|
1461
|
-
label == "paragraph_title"
|
1462
|
-
and block1["block_label"] in sub_title_labels
|
1463
|
-
):
|
1464
|
-
sub_title.append(bbox2)
|
1465
|
-
else:
|
1466
|
-
height1 = bbox2[3] - bbox2[1]
|
1467
|
-
width1 = bbox2[2] - bbox2[0]
|
1468
|
-
if label == "text":
|
1469
|
-
if (
|
1470
|
-
_nearest_edge_distance(bbox1, bbox2)[0] <= 15
|
1471
|
-
and block1["block_label"] in vision_labels
|
1472
|
-
and height1 < height
|
1473
|
-
and width1 < 0.5 * width
|
1474
|
-
):
|
1475
|
-
blocks[title_text_index]["sub_label"] = "vision_footnote"
|
1476
|
-
vision_footnote.append(bbox2)
|
1477
|
-
elif (
|
1478
|
-
width1 < width * title_text_weight[1]
|
1479
|
-
and block1["block_label"] in title_labels
|
1480
|
-
):
|
1481
|
-
blocks[title_text_index]["sub_label"] = "title_text"
|
1482
|
-
title_text.append((direction_[1], bbox2))
|
1483
|
-
elif (
|
1484
|
-
label == "paragraph_title"
|
1485
|
-
and block1["block_label"] in sub_title_labels
|
1486
|
-
):
|
1487
|
-
sub_title.append(bbox2)
|
1488
|
-
|
1489
|
-
if (
|
1490
|
-
is_horizontal_1
|
1491
|
-
and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
|
1492
|
-
> height
|
1493
|
-
) or (
|
1494
|
-
not is_horizontal_1
|
1495
|
-
and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
|
1496
|
-
> width
|
1497
|
-
):
|
1498
|
-
if left_up_title_text_distance < right_down_title_text_distance:
|
1499
|
-
get_sub_category_(
|
1500
|
-
left_up_title_text_direction,
|
1501
|
-
left_up_title_text_index,
|
1502
|
-
blocks[left_up_title_text_index]["block_label"],
|
1503
|
-
True,
|
1504
|
-
)
|
1505
|
-
else:
|
1506
|
-
get_sub_category_(
|
1507
|
-
right_down_title_text_direction,
|
1508
|
-
right_down_title_text_index,
|
1509
|
-
blocks[right_down_title_text_index]["block_label"],
|
1510
|
-
False,
|
1511
|
-
)
|
1512
|
-
else:
|
1513
|
-
get_sub_category_(
|
1514
|
-
left_up_title_text_direction,
|
1515
|
-
left_up_title_text_index,
|
1516
|
-
blocks[left_up_title_text_index]["block_label"],
|
1517
|
-
True,
|
1518
|
-
)
|
1519
|
-
get_sub_category_(
|
1520
|
-
right_down_title_text_direction,
|
1521
|
-
right_down_title_text_index,
|
1522
|
-
blocks[right_down_title_text_index]["block_label"],
|
1523
|
-
False,
|
1524
|
-
)
|
1525
|
-
|
1526
|
-
if block1["block_label"] in title_labels:
|
1527
|
-
if blocks[i].get("title_text") == []:
|
1528
|
-
blocks[i]["title_text"] = title_text
|
1529
|
-
|
1530
|
-
if block1["block_label"] in sub_title_labels:
|
1531
|
-
if blocks[i].get("sub_title") == []:
|
1532
|
-
blocks[i]["sub_title"] = sub_title
|
1533
|
-
|
1534
|
-
if block1["block_label"] in vision_labels:
|
1535
|
-
if blocks[i].get("vision_footnote") == []:
|
1536
|
-
blocks[i]["vision_footnote"] = vision_footnote
|
1537
|
-
|
1538
|
-
return blocks, pre_cuts
|
1539
|
-
|
1540
|
-
|
1541
|
-
def get_layout_ordering(
|
1542
|
-
parsing_res_list: List[Dict[str, Any]],
|
1543
|
-
no_mask_labels: List[str] = [],
|
1544
|
-
) -> None:
|
1545
|
-
"""
|
1546
|
-
Process layout parsing results to remove overlapping bounding boxes
|
1547
|
-
and assign an ordering index based on their positions.
|
1548
|
-
|
1549
|
-
Modifies:
|
1550
|
-
The 'parsing_res_list' list by adding an 'index' to each block.
|
1551
|
-
|
1552
|
-
Args:
|
1553
|
-
parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
|
1554
|
-
no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
|
1555
|
-
"""
|
1556
|
-
title_text_labels = ["doc_title"]
|
1557
|
-
title_labels = ["doc_title", "paragraph_title"]
|
1558
|
-
vision_labels = ["image", "table", "seal", "chart", "figure"]
|
1559
|
-
vision_title_labels = ["table_title", "chart_title", "figure_title"]
|
1560
|
-
|
1561
|
-
parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
|
1562
|
-
|
1563
|
-
parsing_res_by_pre_cuts_list = []
|
1564
|
-
if len(pre_cuts) > 0:
|
1565
|
-
block_bboxes = [block["block_bbox"] for block in parsing_res_list]
|
1566
|
-
for axis, cuts in pre_cuts.items():
|
1567
|
-
axis_index = 1 if axis == "y" else 0
|
1568
|
-
|
1569
|
-
max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
|
1570
|
-
|
1571
|
-
intervals = []
|
1572
|
-
prev = 0
|
1573
|
-
for cut in sorted(cuts):
|
1574
|
-
intervals.append((prev, cut))
|
1575
|
-
prev = cut
|
1576
|
-
intervals.append((prev, max_val))
|
1577
|
-
|
1578
|
-
for start, end in intervals:
|
1579
|
-
mask = [
|
1580
|
-
(bbox[axis_index] >= start) and (bbox[axis_index] < end)
|
1581
|
-
for bbox in block_bboxes
|
1582
|
-
]
|
1583
|
-
parsing_res_by_pre_cuts_list.append(
|
1584
|
-
[parsing_res_list[i] for i, m in enumerate(mask) if m]
|
1585
|
-
)
|
1586
|
-
else:
|
1587
|
-
parsing_res_by_pre_cuts_list = [parsing_res_list]
|
1588
|
-
|
1589
|
-
final_parsing_res_list = []
|
1590
|
-
num_index = 0
|
1591
|
-
num_sub_index = 0
|
1592
|
-
for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
|
1593
|
-
|
1594
|
-
doc_flag = False
|
1595
|
-
median_width = _get_text_median_width(parsing_res_by_pre_cuts)
|
1596
|
-
parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
|
1597
|
-
parsing_res_by_pre_cuts,
|
1598
|
-
median_width,
|
1599
|
-
no_mask_labels=no_mask_labels,
|
1600
|
-
threshold=0.3,
|
1601
|
-
)
|
1602
|
-
# Convert bounding boxes to float and remove overlaps
|
1603
|
-
(
|
1604
|
-
double_text_blocks,
|
1605
|
-
title_text_blocks,
|
1606
|
-
title_blocks,
|
1607
|
-
vision_blocks,
|
1608
|
-
vision_title_blocks,
|
1609
|
-
vision_footnote_blocks,
|
1610
|
-
other_blocks,
|
1611
|
-
) = ([], [], [], [], [], [], [])
|
1612
|
-
|
1613
|
-
drop_indexes = []
|
1614
|
-
|
1615
|
-
for index, block in enumerate(parsing_res_by_pre_cuts):
|
1616
|
-
label = block["sub_label"]
|
1617
|
-
block["block_bbox"] = list(map(int, block["block_bbox"]))
|
1618
|
-
|
1619
|
-
if label == "doc_title":
|
1620
|
-
doc_flag = True
|
1621
|
-
|
1622
|
-
if label in no_mask_labels:
|
1623
|
-
if block["layout"] == "double":
|
1624
|
-
double_text_blocks.append(block)
|
1625
|
-
drop_indexes.append(index)
|
1626
|
-
elif label == "title_text":
|
1627
|
-
title_text_blocks.append(block)
|
1628
|
-
drop_indexes.append(index)
|
1629
|
-
elif label == "vision_footnote":
|
1630
|
-
vision_footnote_blocks.append(block)
|
1631
|
-
drop_indexes.append(index)
|
1632
|
-
elif label in vision_title_labels:
|
1633
|
-
vision_title_blocks.append(block)
|
1634
|
-
drop_indexes.append(index)
|
1635
|
-
elif label in title_labels:
|
1636
|
-
title_blocks.append(block)
|
1637
|
-
drop_indexes.append(index)
|
1638
|
-
elif label in vision_labels:
|
1639
|
-
vision_blocks.append(block)
|
1640
|
-
drop_indexes.append(index)
|
1641
|
-
else:
|
1642
|
-
other_blocks.append(block)
|
1643
|
-
drop_indexes.append(index)
|
1644
|
-
|
1645
|
-
for index in sorted(drop_indexes, reverse=True):
|
1646
|
-
del parsing_res_by_pre_cuts[index]
|
1647
|
-
|
1648
|
-
if len(parsing_res_by_pre_cuts) > 0:
|
1649
|
-
# single text label
|
1650
|
-
if (
|
1651
|
-
len(double_text_blocks) > len(parsing_res_by_pre_cuts)
|
1652
|
-
-                or projection_direction
+                iner_block_idxes.append(block_idx)
+            elif overlap_ratio > REGION_SETTINGS.get(
+                "split_block_overlap_ratio_threshold", 0.4
             ):
|
1654
|
-
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1659
|
-
|
1660
|
-
|
1661
|
-
|
1662
|
-
|
1663
|
-
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
|
1672
|
-
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1676
|
-
|
1677
|
-
|
1678
|
-
|
1679
|
-
block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
|
1680
|
-
block["sub_index"] = (
|
1681
|
-
num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
|
1682
|
-
)
|
1683
|
-
|
1684
|
-
def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
|
1685
|
-
for block in input_blocks:
|
1686
|
-
bbox = block["block_bbox"]
|
1687
|
-
min_distance = float("inf")
|
1688
|
-
min_distance_config = [
|
1689
|
-
[float("inf"), float("inf")],
|
1690
|
-
float("inf"),
|
1691
|
-
float("inf"),
|
1692
|
-
] # for double text
|
1693
|
-
nearest_gt_index = 0
|
1694
|
-
for match_block in parsing_res_by_pre_cuts:
|
1695
|
-
match_bbox = match_block["block_bbox"]
|
1696
|
-
if distance_type == "nearest_iou_edge_distance":
|
1697
|
-
distance, min_distance_config = _nearest_iou_edge_distance(
|
1698
|
-
bbox,
|
1699
|
-
match_bbox,
|
1700
|
-
block["sub_label"],
|
1701
|
-
vision_labels=vision_labels,
|
1702
|
-
no_mask_labels=no_mask_labels,
|
1703
|
-
median_width=median_width,
|
1704
|
-
title_labels=title_labels,
|
1705
|
-
title_text=block["title_text"],
|
1706
|
-
sub_title=block["sub_title"],
|
1707
|
-
min_distance_config=min_distance_config,
|
1708
|
-
tolerance_len=10,
|
1709
|
-
)
|
1710
|
-
elif distance_type == "title_text":
|
1711
|
-
if (
|
1712
|
-
match_block["block_label"] in title_labels + ["abstract"]
|
1713
|
-
and match_block["title_text"] != []
|
1714
|
-
):
|
1715
|
-
iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
|
1716
|
-
bbox,
|
1717
|
-
match_block["title_text"][0][1],
|
1718
|
-
)
|
1719
|
-
iou_right_down = (
|
1720
|
-
_calculate_overlap_area_div_minbox_area_ratio(
|
1721
|
-
bbox,
|
1722
|
-
match_block["title_text"][-1][1],
|
1723
|
-
)
|
1724
|
-
)
|
1725
|
-
iou = 1 - max(iou_left_up, iou_right_down)
|
1726
|
-
distance = _manhattan_distance(bbox, match_bbox) * iou
|
1727
|
-
else:
|
1728
|
-
distance = float("inf")
|
1729
|
-
elif distance_type == "manhattan":
|
1730
|
-
distance = _manhattan_distance(bbox, match_bbox)
|
1731
|
-
elif distance_type == "vision_footnote":
|
1732
|
-
if (
|
1733
|
-
match_block["block_label"] in vision_labels
|
1734
|
-
and match_block["vision_footnote"] != []
|
1735
|
-
):
|
1736
|
-
iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
|
1737
|
-
bbox,
|
1738
|
-
match_block["vision_footnote"][0],
|
1739
|
-
)
|
1740
|
-
iou_right_down = (
|
1741
|
-
_calculate_overlap_area_div_minbox_area_ratio(
|
1742
|
-
bbox,
|
1743
|
-
match_block["vision_footnote"][-1],
|
1744
|
-
)
|
1745
|
-
)
|
1746
|
-
iou = 1 - max(iou_left_up, iou_right_down)
|
1747
|
-
distance = _manhattan_distance(bbox, match_bbox) * iou
|
1748
|
-
else:
|
1749
|
-
distance = float("inf")
|
1750
|
-
elif distance_type == "vision_body":
|
1751
|
-
if (
|
1752
|
-
match_block["block_label"] in vision_title_labels
|
1753
|
-
and block["vision_footnote"] != []
|
1754
|
-
):
|
1755
|
-
iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
|
1756
|
-
match_bbox,
|
1757
|
-
block["vision_footnote"][0],
|
1758
|
-
)
|
1759
|
-
iou_right_down = (
|
1760
|
-
_calculate_overlap_area_div_minbox_area_ratio(
|
1761
|
-
match_bbox,
|
1762
|
-
block["vision_footnote"][-1],
|
1763
|
-
)
|
1764
|
-
)
|
1765
|
-
iou = 1 - max(iou_left_up, iou_right_down)
|
1766
|
-
distance = _manhattan_distance(bbox, match_bbox) * iou
|
1767
|
-
else:
|
1768
|
-
distance = float("inf")
|
1769
|
-
# when reference block cross mulitple columns, its order should be after the blocks above it.
|
1770
|
-
elif distance_type == "append":
|
1771
|
-
if match_bbox[3] <= bbox[1]:
|
1772
|
-
distance = -(match_bbox[2] * 10 + match_bbox[3])
|
1773
|
-
else:
|
1774
|
-
distance = float("inf")
|
1775
|
-
else:
|
1776
|
-
raise NotImplementedError
|
1777
|
-
|
1778
|
-
if distance < min_distance:
|
1779
|
-
min_distance = distance
|
1780
|
-
if is_add_index:
|
1781
|
-
nearest_gt_index = match_block.get("index", 999)
|
1782
|
-
else:
|
1783
|
-
nearest_gt_index = match_block.get("sub_index", 999)
|
1784
|
-
|
1785
|
-
if is_add_index:
|
1786
|
-
block["index"] = nearest_gt_index
|
1787
|
-
else:
|
1788
|
-
block["sub_index"] = nearest_gt_index
|
1789
|
-
|
1790
|
-
parsing_res_by_pre_cuts.append(block)
|
1791
|
-
|
1792
|
-
# double text label
|
1793
|
-
double_text_blocks.sort(
|
1794
|
-
key=lambda x: (
|
1795
|
-
x["block_bbox"][1] // 10,
|
1796
|
-
x["block_bbox"][0] // median_width,
|
1797
|
-
x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
|
1798
|
-
),
|
1799
|
-
)
|
1800
|
-
# filter the reference blocks from all blocks that cross mulitple columns.
|
1801
|
-
# they should be ordered using "append".
|
1802
|
-
double_text_reference_blocks = []
|
1803
|
-
i = 0
|
1804
|
-
while i < len(double_text_blocks):
|
1805
|
-
if double_text_blocks[i]["block_label"] == "reference":
|
1806
|
-
double_text_reference_blocks.append(double_text_blocks.pop(i))
|
1807
|
-
else:
|
1808
|
-
i += 1
|
1809
|
-
nearest_match_(
|
1810
|
-
double_text_blocks,
|
1811
|
-
distance_type="nearest_iou_edge_distance",
|
1812
|
-
)
|
1813
|
-
nearest_match_(
|
1814
|
-
double_text_reference_blocks,
|
1815
|
-
distance_type="append",
|
1816
|
-
)
|
1817
|
-
parsing_res_by_pre_cuts.sort(
|
1818
|
-
key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
|
1819
|
-
)
|
1820
|
-
|
1821
|
-
for idx, block in enumerate(parsing_res_by_pre_cuts):
|
1822
|
-
block["index"] = num_index + idx + 1
|
1823
|
-
block["sub_index"] = num_sub_index + idx + 1
|
1824
|
-
|
1825
|
-
# title label
|
1826
|
-
title_blocks.sort(
|
1827
|
-
key=lambda x: (
|
1828
|
-
x["block_bbox"][1] // 10,
|
1829
|
-
x["block_bbox"][0] // median_width,
|
1830
|
-
x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
|
1831
|
-
),
|
1832
|
-
)
|
1833
|
-
nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
|
1834
|
-
|
1835
|
-
if doc_flag:
|
1836
|
-
text_sort_labels = ["doc_title"]
|
1837
|
-
text_label_priority = {
|
1838
|
-
label: priority for priority, label in enumerate(text_sort_labels)
|
1839
|
-
}
|
1840
|
-
doc_titles = []
|
1841
|
-
for i, block in enumerate(parsing_res_by_pre_cuts):
|
1842
|
-
if block["block_label"] == "doc_title":
|
1843
|
-
doc_titles.append(
|
1844
|
-
(i, block["block_bbox"][1], block["block_bbox"][0]),
|
532
|
+
split_block_idxes.append(block_idx)
|
533
|
+
|
534
|
+
if len(iner_block_idxes) > 0:
|
535
|
+
if len(split_block_idxes) > 0:
|
536
|
+
for split_block_idx in split_block_idxes:
|
537
|
+
split_block_bbox = block_bboxes[split_block_idx]
|
538
|
+
x1, y1, x2, y2 = tmp_region_bbox
|
539
|
+
x1_prime, y1_prime, x2_prime, y2_prime = split_block_bbox
|
540
|
+
edge_distance_list = [
|
541
|
+
(x1_prime - x1) / image_width,
|
542
|
+
(y1_prime - y1) / image_height,
|
543
|
+
(x2 - x2_prime) / image_width,
|
544
|
+
(y2 - y2_prime) / image_height,
|
545
|
+
]
|
546
|
+
max_distance = max(edge_distance_list)
|
547
|
+
src_index = edge_distance_list.index(max_distance)
|
548
|
+
dst_index = index_conversion_map[src_index]
|
549
|
+
tmp_region_bbox[dst_index] = split_block_bbox[src_index]
|
550
|
+
tmp_region_bbox, iner_idxes = shrink_supplement_region_bbox(
|
551
|
+
tmp_region_bbox,
|
552
|
+
ref_region_bbox,
|
553
|
+
image_width,
|
554
|
+
image_height,
|
555
|
+
iner_block_idxes,
|
556
|
+
block_bboxes,
|
1845
557
|
)
|
1846
|
-
|
1847
|
-
|
1848
|
-
|
1849
|
-
|
1850
|
-
|
1851
|
-
x["index"],
|
1852
|
-
-                        text_label_priority.get(x["block_label"], 9999),
-                        x["block_bbox"][1],
-                        x["block_bbox"][0],
-                    ),
-                )
+                    if len(iner_idxes) == 0:
+                        continue
+            matched_bboxes = [block_bboxes[idx] for idx in iner_block_idxes]
+            supplement_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
+            break
         else:
- … (removed lines 1858-1861 are not shown in this diff rendering)
-                        x["block_bbox"][0],
-                    ),
-                )
+            edge_distance_list_tmp.remove(min_distance)
+            min_distance = min(edge_distance_list_tmp)
+            src_index = index_conversion_map[edge_distance_list.index(min_distance)]
+    return supplement_region_bbox, iner_block_idxes
|
1865
568
|
|
1866
|
-
-            for idx, block in enumerate(parsing_res_by_pre_cuts):
-                block["index"] = num_index + idx + 1
-                block["sub_index"] = num_sub_index + idx + 1

-
-
+def update_region_box(bbox, region_box):
+    """Update region box with bbox"""
+    if region_box is None:
+        return bbox

-
-
-    is_horizontal = _get_bbox_direction(input_bbox)
-    if is_horizontal:
-        return input_bbox[1]
-    else:
-        return input_bbox[0]
+    x1, y1, x2, y2 = bbox
+    x1_region, y1_region, x2_region, y2_region = region_box

-
-
-
+    x1_region = int(min(x1, x1_region))
+    y1_region = int(min(y1, y1_region))
+    x2_region = int(max(x2, x2_region))
+    y2_region = int(max(y2, y2_region))

-
-                block["index"] = num_index + idx + 1
-                block["sub_index"] = num_sub_index + idx + 1
+    region_box = [x1_region, y1_region, x2_region, y2_region]

-
-            nearest_match_(
-                vision_blocks,
-                distance_type="nearest_iou_edge_distance",
-                is_add_index=False,
-            )
-            parsing_res_by_pre_cuts.sort(
-                key=lambda x: (
-                    x["sub_index"],
-                    x["block_bbox"][1],
-                    x["block_bbox"][0],
-                ),
-            )
+    return region_box

-            for idx, block in enumerate(parsing_res_by_pre_cuts):
-                block["sub_index"] = num_sub_index + idx + 1
|
1905
587
|
|
1906
|
-
|
1907
|
-
|
1908
|
-
vision_title_blocks,
|
1909
|
-
distance_type="nearest_iou_edge_distance",
|
1910
|
-
is_add_index=False,
|
1911
|
-
)
|
1912
|
-
parsing_res_by_pre_cuts.sort(
|
1913
|
-
key=lambda x: (
|
1914
|
-
x["sub_index"],
|
1915
|
-
x["block_bbox"][1],
|
1916
|
-
x["block_bbox"][0],
|
1917
|
-
),
|
1918
|
-
)
|
1919
|
-
|
1920
|
-
for idx, block in enumerate(parsing_res_by_pre_cuts):
|
1921
|
-
block["sub_index"] = num_sub_index + idx + 1
|
1922
|
-
|
1923
|
-
# vision footnote label
|
1924
|
-
nearest_match_(
|
1925
|
-
vision_footnote_blocks,
|
1926
|
-
distance_type="vision_footnote",
|
1927
|
-
is_add_index=False,
|
1928
|
-
)
|
1929
|
-
text_label_priority = {"vision_footnote": 9999}
|
1930
|
-
parsing_res_by_pre_cuts.sort(
|
1931
|
-
key=lambda x: (
|
1932
|
-
x["sub_index"],
|
1933
|
-
text_label_priority.get(x["sub_label"], 0),
|
1934
|
-
x["block_bbox"][1],
|
1935
|
-
x["block_bbox"][0],
|
1936
|
-
),
|
1937
|
-
)
|
1938
|
-
|
1939
|
-
for idx, block in enumerate(parsing_res_by_pre_cuts):
|
1940
|
-
block["sub_index"] = num_sub_index + idx + 1
|
1941
|
-
|
1942
|
-
# header、footnote、header_image... label
|
1943
|
-
nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
|
1944
|
-
|
1945
|
-
# add all parsing result
|
1946
|
-
final_parsing_res_list.extend(parsing_res_by_pre_cuts)
|
1947
|
-
|
1948
|
-
# update num index
|
1949
|
-
num_sub_index += len(parsing_res_by_pre_cuts)
|
1950
|
-
for parsing_res in parsing_res_by_pre_cuts:
|
1951
|
-
if parsing_res.get("index"):
|
1952
|
-
num_index += 1
|
1953
|
-
|
1954
|
-
parsing_res_list = [
|
1955
|
-
{
|
1956
|
-
"block_label": parsing_res["block_label"],
|
1957
|
-
"block_content": parsing_res["block_content"],
|
1958
|
-
"block_bbox": parsing_res["block_bbox"],
|
1959
|
-
"block_image": parsing_res.get("block_image", None),
|
1960
|
-
"sub_label": parsing_res["sub_label"],
|
1961
|
-
"sub_index": parsing_res["sub_index"],
|
1962
|
-
"index": parsing_res.get("index", None),
|
1963
|
-
"seg_start_coordinate": parsing_res.get(
|
1964
|
-
"seg_start_coordinate", float("inf")
|
1965
|
-
),
|
1966
|
-
"seg_end_coordinate": parsing_res.get("seg_end_coordinate", float("-inf")),
|
1967
|
-
"num_of_lines": parsing_res.get("num_of_lines", 1),
|
1968
|
-
}
|
1969
|
-
for parsing_res in final_parsing_res_list
|
1970
|
-
]
|
1971
|
-
|
1972
|
-
return parsing_res_list
|
1973
|
-
|
1974
|
-
|
1975
|
-
def _manhattan_distance(
|
1976
|
-
point1: Tuple[float, float],
|
1977
|
-
point2: Tuple[float, float],
|
1978
|
-
weight_x: float = 1.0,
|
1979
|
-
weight_y: float = 1.0,
|
1980
|
-
) -> float:
|
1981
|
-
"""
|
1982
|
-
-    Calculate the weighted Manhattan distance between two points.
+def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
+    """Convert formula result to OCR result format

     Args:
-
-
-        weight_x (float): The weight for the x-axis distance. Default is 1.0.
-        weight_y (float): The weight for the y-axis distance. Default is 1.0.
-
+        formula_res_list (List): Formula results
+        ocr_res (dict): OCR result
     Returns:
-
-    """
-
-
+        ocr_res (dict): Updated OCR result
+    """
+    for formula_res in formula_res_list:
+        x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+        poly_points = [
+            (x_min, y_min),
+            (x_max, y_min),
+            (x_max, y_max),
+            (x_min, y_max),
+        ]
+        ocr_res["dt_polys"].append(poly_points)
+        formula_res_text: str = formula_res["rec_formula"]
+        ocr_res["rec_texts"].append(formula_res_text)
+        if ocr_res["rec_boxes"].size == 0:
+            ocr_res["rec_boxes"] = np.array(formula_res["dt_polys"])
+        else:
+            ocr_res["rec_boxes"] = np.vstack(
+                (ocr_res["rec_boxes"], [formula_res["dt_polys"]])
+            )
+        ocr_res["rec_labels"].append("formula")
+        ocr_res["rec_polys"].append(poly_points)
+        ocr_res["rec_scores"].append(1)
|
1995
617
|
|
1996
|
-
def _calculate_horizontal_distance(
|
1997
|
-
input_bbox: List[int],
|
1998
|
-
match_bbox: List[int],
|
1999
|
-
height: int,
|
2000
|
-
disperse: int,
|
2001
|
-
title_text: List[Tuple[int, List[int]]],
|
2002
|
-
) -> float:
|
2003
|
-
"""
|
2004
|
-
Calculate the horizontal distance between two bounding boxes, considering title text adjustments.
|
2005
618
|
|
2006
|
-
|
2007
|
-
|
2008
|
-
|
2009
|
-
|
2010
|
-
|
2011
|
-
title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
|
2012
|
-
-            Format: [(position_indicator, [x1, y1, x2, y2]), ...].
+def caculate_bbox_area(bbox):
+    """Calculate bounding box area"""
+    x1, y1, x2, y2 = map(float, bbox)
+    area = abs((x2 - x1) * (y2 - y1))
+    return area
|
2013
624
|
|
2014
|
-
Returns:
|
2015
|
-
float: The calculated horizontal distance taking into account the title text adjustments.
|
2016
|
-
"""
|
2017
|
-
x1, y1, x2, y2 = input_bbox
|
2018
|
-
x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
|
2019
|
-
|
2020
|
-
# Determine vertical distance adjustment based on title text
|
2021
|
-
if y2 < y1_prime:
|
2022
|
-
if title_text and title_text[-1][0] == 2:
|
2023
|
-
y2 += title_text[-1][1][3] - title_text[-1][1][1]
|
2024
|
-
vertical_adjustment = (y1_prime - y2) * 0.5
|
2025
|
-
else:
|
2026
|
-
if title_text and title_text[0][0] == 1:
|
2027
|
-
y1 -= title_text[0][1][3] - title_text[0][1][1]
|
2028
|
-
vertical_adjustment = y1 - y2_prime
|
2029
|
-
|
2030
|
-
-    # Calculate horizontal distance with adjustments
-    horizontal_distance = (
-        abs(x2_prime - x1) // disperse
-        + vertical_adjustment // height
-        + vertical_adjustment / 5000
-    )

-
+def caculate_euclidean_dist(point1, point2):
+    """Calculate euclidean distance between two points"""
+    x1, y1 = point1
+    x2, y2 = point2
+    return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
|
2038
631
|
|
2039
632
|
|
2040
|
-
def
|
2041
|
-
|
2042
|
-
match_bbox: List[int],
|
2043
|
-
width: int,
|
2044
|
-
disperse: int,
|
2045
|
-
title_text: List[Tuple[int, List[int]]],
|
2046
|
-
) -> float:
|
2047
|
-
"""
|
2048
|
-
-    Calculate the vertical distance between two bounding boxes, considering title text adjustments.
+def get_seg_flag(block, prev_block):
+    """Get segment start flag and end flag based on previous block

     Args:
|
2051
|
-
|
2052
|
-
|
2053
|
-
width (int): The width of the input bounding box used for normalization.
|
2054
|
-
disperse (int): The dispersion factor used to normalize the vertical distance.
|
2055
|
-
title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
|
2056
|
-
-            Format: [(position_indicator, [x1, y1, x2, y2]), ...].
+        block (Block): Current block
+        prev_block (Block): Previous block

     Returns:
-
+        seg_start_flag (bool): Segment start flag
+        seg_end_flag (bool): Segment end flag
     """
|
2061
|
-
x1, y1, x2, y2 = input_bbox
|
2062
|
-
x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
|
2063
|
-
|
2064
|
-
# Determine horizontal distance adjustment based on title text
|
2065
|
-
if x1 > x2_prime:
|
2066
|
-
if title_text and title_text[0][0] == 3:
|
2067
|
-
x1 -= title_text[0][1][2] - title_text[0][1][0]
|
2068
|
-
horizontal_adjustment = (x1 - x2_prime) * 0.5
|
2069
|
-
else:
|
2070
|
-
if title_text and title_text[-1][0] == 4:
|
2071
|
-
x2 += title_text[-1][1][2] - title_text[-1][1][0]
|
2072
|
-
horizontal_adjustment = x1_prime - x2
|
2073
|
-
|
2074
|
-
# Calculate vertical distance with adjustments
|
2075
|
-
vertical_distance = (
|
2076
|
-
abs(y2_prime - y1) // disperse
|
2077
|
-
+ horizontal_adjustment // width
|
2078
|
-
-        + horizontal_adjustment / 5000
-    )

-
+    seg_start_flag = True
+    seg_end_flag = True

+    context_left_coordinate = block.start_coordinate
+    context_right_coordinate = block.end_coordinate
+    seg_start_coordinate = block.seg_start_coordinate
+    seg_end_coordinate = block.seg_end_coordinate
|
2083
652
|
|
2084
|
-
|
2085
|
-
|
2086
|
-
|
2087
|
-
|
2088
|
-
|
2089
|
-
|
2090
|
-
|
2091
|
-
tolerance_len: float = 10.0,
|
2092
|
-
) -> Tuple[float, List[float]]:
|
2093
|
-
"""
|
2094
|
-
-    Calculate the nearest edge distance between two bounding boxes, considering directional weights.
+    if prev_block is not None:
+        num_of_prev_lines = prev_block.num_of_lines
+        pre_block_seg_end_coordinate = prev_block.seg_end_coordinate
+        prev_end_space_small = (
+            abs(prev_block.end_coordinate - pre_block_seg_end_coordinate) < 10
+        )
+        prev_lines_more_than_one = num_of_prev_lines > 1
|
2095
660
|
|
2096
|
-
|
2097
|
-
|
2098
|
-
|
2099
|
-
|
2100
|
-
label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
|
2101
|
-
no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
|
2102
|
-
min_edge_distance_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
|
2103
|
-
Defaults to [float('inf'), float('inf')].
|
2104
|
-
-        tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.
+        overlap_blocks = (
+            context_left_coordinate < prev_block.end_coordinate
+            and context_right_coordinate > prev_block.start_coordinate
+        )
|
2105
665
|
|
2106
|
-
|
2107
|
-
|
2108
|
-
|
2109
|
-
|
2110
|
-
|
2111
|
-
|
2112
|
-
|
2113
|
-
|
2114
|
-
|
2115
|
-
|
2116
|
-
|
2117
|
-
|
2118
|
-
if not min_edge_distance_config:
|
2119
|
-
min_edge_distance_config = [float("inf"), float("inf")]
|
2120
|
-
min_edge_distance_x, min_edge_distance_y = min_edge_distance_config
|
2121
|
-
|
2122
|
-
x1, y1, x2, y2 = input_bbox
|
2123
|
-
x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
|
2124
|
-
|
2125
|
-
direction_num = 0
|
2126
|
-
distance_x = float("inf")
|
2127
|
-
distance_y = float("inf")
|
2128
|
-
distance = [float("inf")] * 4
|
2129
|
-
|
2130
|
-
# input_bbox is to the left of match_bbox
|
2131
|
-
if x2 < x1_prime:
|
2132
|
-
direction_num += 1
|
2133
|
-
distance[0] = x1_prime - x2
|
2134
|
-
if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
|
2135
|
-
distance_x = min_edge_distance_x * weight[0]
|
2136
|
-
else:
|
2137
|
-
distance_x = distance[0] * weight[0]
|
2138
|
-
# input_bbox is to the right of match_bbox
|
2139
|
-
elif x1 > x2_prime:
|
2140
|
-
direction_num += 1
|
2141
|
-
distance[1] = x1 - x2_prime
|
2142
|
-
if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
|
2143
|
-
distance_x = min_edge_distance_x * weight[1]
|
2144
|
-
else:
|
2145
|
-
distance_x = distance[1] * weight[1]
|
2146
|
-
elif match_bbox_iou > 0:
|
2147
|
-
distance[0] = 0
|
2148
|
-
distance_x = 0
|
2149
|
-
|
2150
|
-
# input_bbox is above match_bbox
|
2151
|
-
if y2 < y1_prime:
|
2152
|
-
direction_num += 1
|
2153
|
-
distance[2] = y1_prime - y2
|
2154
|
-
if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
|
2155
|
-
distance_y = min_edge_distance_y * weight[2]
|
2156
|
-
else:
|
2157
|
-
distance_y = distance[2] * weight[2]
|
2158
|
-
if label in no_mask_labels:
|
2159
|
-
distance_y = max(0.1, distance_y) * 10 # for abstract
|
2160
|
-
# input_bbox is below match_bbox
|
2161
|
-
elif y1 > y2_prime:
|
2162
|
-
direction_num += 1
|
2163
|
-
distance[3] = y1 - y2_prime
|
2164
|
-
if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
|
2165
|
-
-            distance_y = min_edge_distance_y * weight[3]
+        # update context_left_coordinate and context_right_coordinate
+        if overlap_blocks:
+            context_left_coordinate = min(
+                prev_block.start_coordinate, context_left_coordinate
+            )
+            context_right_coordinate = max(
+                prev_block.end_coordinate, context_right_coordinate
+            )
+            prev_end_space_small = (
+                abs(context_right_coordinate - pre_block_seg_end_coordinate) < 10
+            )
+            edge_distance = 0
         else:
|
2167
|
-
|
2168
|
-
elif match_bbox_iou > 0:
|
2169
|
-
distance[2] = 0
|
2170
|
-
distance_y = 0
|
2171
|
-
|
2172
|
-
if direction_num == 2:
|
2173
|
-
return (distance_x + distance_y), [
|
2174
|
-
min(distance[0], distance[1]),
|
2175
|
-
min(distance[2], distance[3]),
|
2176
|
-
]
|
2177
|
-
else:
|
2178
|
-
return min(distance_x, distance_y), [
|
2179
|
-
min(distance[0], distance[1]),
|
2180
|
-
min(distance[2], distance[3]),
|
2181
|
-
]
|
679
|
+
edge_distance = abs(block.start_coordinate - prev_block.end_coordinate)
|
2182
680
|
|
681
|
+
current_start_space_small = seg_start_coordinate - context_left_coordinate < 10
|
2183
682
|
|
2184
|
-
|
2185
|
-
|
2186
|
-
|
2187
|
-
|
2188
|
-
|
2189
|
-
)
|
2190
|
-
|
2191
|
-
"paragraph_title",
|
2192
|
-
"table_title",
|
2193
|
-
"abstract",
|
2194
|
-
"image",
|
2195
|
-
"seal",
|
2196
|
-
"chart",
|
2197
|
-
"figure",
|
2198
|
-
]:
|
2199
|
-
-        return [1, 1, 0.1, 1]  # down
+    if (
+        prev_end_space_small
+        and current_start_space_small
+        and prev_lines_more_than_one
+        and edge_distance < max(prev_block.width, block.width)
+    ):
+        seg_start_flag = False
     else:
|
2201
|
-
|
2202
|
-
|
2203
|
-
|
2204
|
-
|
2205
|
-
|
2206
|
-
|
2207
|
-
|
2208
|
-
|
2209
|
-
|
2210
|
-
|
2211
|
-
|
2212
|
-
|
2213
|
-
|
2214
|
-
|
2215
|
-
|
2216
|
-
|
2217
|
-
|
2218
|
-
|
2219
|
-
|
2220
|
-
|
2221
|
-
|
2222
|
-
|
2223
|
-
|
2224
|
-
match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
|
2225
|
-
label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
|
2226
|
-
vision_labels (List[str]): List of labels for vision-related objects (e.g., images, icons).
|
2227
|
-
no_mask_labels (List[str]): Labels for which no masking is applied when calculating edge distances.
|
2228
|
-
median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
|
2229
|
-
title_labels (List[str], optional): Labels that indicate the object is a title. Defaults to an empty list.
|
2230
|
-
title_text (List[Tuple[int, List[int]]], optional): Text content associated with title labels, in the format [(position_indicator, [x1, y1, x2, y2]), ...].
|
2231
|
-
sub_title (List[List[int]], optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
|
2232
|
-
min_distance_config (List[float], optional): Configuration for minimum distances [min_edge_distance_config, up_edge_distances_config, total_distance].
|
2233
|
-
tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.0.
|
2234
|
-
|
2235
|
-
Returns:
|
2236
|
-
Tuple[float, List[float]]: A tuple containing:
|
2237
|
-
- The calculated distance considering IOU and adjustments.
|
2238
|
-
- The updated minimum distance configuration.
|
2239
|
-
"""
|
2240
|
-
|
2241
|
-
x1, y1, x2, y2 = input_bbox
|
2242
|
-
x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
|
2243
|
-
|
2244
|
-
min_edge_distance_config, up_edge_distances_config, total_distance = (
|
2245
|
-
min_distance_config
|
2246
|
-
)
|
2247
|
-
|
2248
|
-
iou_distance = 0
|
2249
|
-
|
2250
|
-
if label in vision_labels:
|
2251
|
-
-        horizontal1 = horizontal2 = True
+    if seg_start_coordinate - context_left_coordinate < 10:
+        seg_start_flag = False
+
+    if context_right_coordinate - seg_end_coordinate < 10:
+        seg_end_flag = False
+
+    return seg_start_flag, seg_end_flag
+
+
+def get_show_color(label: str, order_label=False) -> Tuple:
+    if order_label:
+        label_colors = {
+            "doc_title": (255, 248, 220, 100),  # Cornsilk
+            "doc_title_text": (255, 239, 213, 100),
+            "paragraph_title": (102, 102, 255, 100),
+            "sub_paragraph_title": (102, 178, 255, 100),
+            "vision": (153, 255, 51, 100),
+            "vision_title": (144, 238, 144, 100),  # Light Green
+            "vision_footnote": (144, 238, 144, 100),  # Light Green
+            "normal_text": (153, 0, 76, 100),
+            "cross_layout": (53, 218, 207, 100),  # Thistle
+            "cross_reference": (221, 160, 221, 100),  # Floral White
+        }
     else:
|
2253
|
-
|
2254
|
-
|
2255
|
-
|
2256
|
-
|
2257
|
-
|
2258
|
-
|
2259
|
-
|
2260
|
-
|
2261
|
-
|
2262
|
-
|
2263
|
-
|
2264
|
-
|
2265
|
-
|
2266
|
-
|
2267
|
-
|
2268
|
-
|
2269
|
-
|
2270
|
-
|
2271
|
-
|
2272
|
-
|
2273
|
-
|
2274
|
-
|
2275
|
-
|
2276
|
-
)
|
2277
|
-
|
2278
|
-
|
2279
|
-
|
2280
|
-
|
2281
|
-
|
2282
|
-
|
2283
|
-
|
2284
|
-
min(x1, x1_),
|
2285
|
-
min(y1, y1_),
|
2286
|
-
min(x2, x2_),
|
2287
|
-
max(y2, y2_),
|
2288
|
-
)
|
2289
|
-
else:
|
2290
|
-
x1, y1, x2, y2 = (
|
2291
|
-
min(x1, x1_),
|
2292
|
-
min(y1, y1_),
|
2293
|
-
max(x2, x2_),
|
2294
|
-
min(y2, y2_),
|
2295
|
-
)
|
2296
|
-
input_bbox = [x1, y1, x2, y2]
|
2297
|
-
|
2298
|
-
# Calculate edge distance
|
2299
|
-
weight = _get_weights(label, horizontal1)
|
2300
|
-
if label == "abstract":
|
2301
|
-
tolerance_len *= 2
|
2302
|
-
|
2303
|
-
edge_distance, edge_distance_config = _nearest_edge_distance(
|
2304
|
-
input_bbox,
|
2305
|
-
match_bbox,
|
2306
|
-
weight,
|
2307
|
-
label=label,
|
2308
|
-
no_mask_labels=no_mask_labels,
|
2309
|
-
min_edge_distance_config=min_edge_distance_config,
|
2310
|
-
tolerance_len=tolerance_len,
|
2311
|
-
)
|
2312
|
-
|
2313
|
-
# Weights for combining distances
|
2314
|
-
iou_edge_weight = [10**8, 10**4, 1, 0.0001]
|
2315
|
-
|
2316
|
-
# Calculate up and left edge distances
|
2317
|
-
up_edge_distance = y1_prime
|
2318
|
-
left_edge_distance = x1_prime
|
2319
|
-
if (
|
2320
|
-
label in no_mask_labels or label in title_labels or label in vision_labels
|
2321
|
-
) and y1 > y2_prime:
|
2322
|
-
up_edge_distance = -y2_prime
|
2323
|
-
left_edge_distance = -x2_prime
|
2324
|
-
|
2325
|
-
min_up_edge_distance = up_edge_distances_config
|
2326
|
-
if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
|
2327
|
-
up_edge_distance = min_up_edge_distance
|
2328
|
-
|
2329
|
-
# Calculate total distance
|
2330
|
-
distance = (
|
2331
|
-
iou_distance * iou_edge_weight[0]
|
2332
|
-
+ edge_distance * iou_edge_weight[1]
|
2333
|
-
+ up_edge_distance * iou_edge_weight[2]
|
2334
|
-
+ left_edge_distance * iou_edge_weight[3]
|
2335
|
-
)
|
2336
|
-
|
2337
|
-
# Update minimum distance configuration if a smaller distance is found
|
2338
|
-
if total_distance > distance:
|
2339
|
-
edge_distance_config = [
|
2340
|
-
edge_distance_config[0],
|
2341
|
-
edge_distance_config[1],
|
2342
|
-
]
|
2343
|
-
min_distance_config = [
|
2344
|
-
edge_distance_config,
|
2345
|
-
up_edge_distance,
|
2346
|
-
distance,
|
2347
|
-
]
|
2348
|
-
|
2349
|
-
return distance, min_distance_config
|
2350
|
-
|
2351
|
-
|
2352
|
-
def get_show_color(label: str) -> Tuple:
|
2353
|
-
label_colors = {
|
2354
|
-
# Medium Blue (from 'titles_list')
|
2355
|
-
"paragraph_title": (102, 102, 255, 100),
|
2356
|
-
"doc_title": (255, 248, 220, 100), # Cornsilk
|
2357
|
-
# Light Yellow (from 'tables_caption_list')
|
2358
|
-
"table_title": (255, 255, 102, 100),
|
2359
|
-
# Sky Blue (from 'imgs_caption_list')
|
2360
|
-
"figure_title": (102, 178, 255, 100),
|
2361
|
-
"chart_title": (221, 160, 221, 100), # Plum
|
2362
|
-
"vision_footnote": (144, 238, 144, 100), # Light Green
|
2363
|
-
# Deep Purple (from 'texts_list')
|
2364
|
-
"text": (153, 0, 76, 100),
|
2365
|
-
# Bright Green (from 'interequations_list')
|
2366
|
-
"formula": (0, 255, 0, 100),
|
2367
|
-
"abstract": (255, 239, 213, 100), # Papaya Whip
|
2368
|
-
# Medium Green (from 'lists_list' and 'indexs_list')
|
2369
|
-
"content": (40, 169, 92, 100),
|
2370
|
-
# Neutral Gray (from 'dropped_bbox_list')
|
2371
|
-
"seal": (158, 158, 158, 100),
|
2372
|
-
# Olive Yellow (from 'tables_body_list')
|
2373
|
-
"table": (204, 204, 0, 100),
|
2374
|
-
# Bright Green (from 'imgs_body_list')
|
2375
|
-
"image": (153, 255, 51, 100),
|
2376
|
-
# Bright Green (from 'imgs_body_list')
|
2377
|
-
"figure": (153, 255, 51, 100),
|
2378
|
-
"chart": (216, 191, 216, 100), # Thistle
|
2379
|
-
# Pale Yellow-Green (from 'tables_footnote_list')
|
2380
|
-
"reference": (229, 255, 204, 100),
|
2381
|
-
"algorithm": (255, 250, 240, 100), # Floral White
|
2382
|
-
}
|
715
|
+
label_colors = {
|
716
|
+
# Medium Blue (from 'titles_list')
|
717
|
+
"paragraph_title": (102, 102, 255, 100),
|
718
|
+
"doc_title": (255, 248, 220, 100), # Cornsilk
|
719
|
+
# Light Yellow (from 'tables_caption_list')
|
720
|
+
"table_title": (255, 255, 102, 100),
|
721
|
+
# Sky Blue (from 'imgs_caption_list')
|
722
|
+
"figure_title": (102, 178, 255, 100),
|
723
|
+
"chart_title": (221, 160, 221, 100), # Plum
|
724
|
+
"vision_footnote": (144, 238, 144, 100), # Light Green
|
725
|
+
# Deep Purple (from 'texts_list')
|
726
|
+
"text": (153, 0, 76, 100),
|
727
|
+
# Bright Green (from 'interequations_list')
|
728
|
+
"formula": (0, 255, 0, 100),
|
729
|
+
"abstract": (255, 239, 213, 100), # Papaya Whip
|
730
|
+
# Medium Green (from 'lists_list' and 'indexs_list')
|
731
|
+
"content": (40, 169, 92, 100),
|
732
|
+
# Neutral Gray (from 'dropped_bbox_list')
|
733
|
+
"seal": (158, 158, 158, 100),
|
734
|
+
# Olive Yellow (from 'tables_body_list')
|
735
|
+
"table": (204, 204, 0, 100),
|
736
|
+
# Bright Green (from 'imgs_body_list')
|
737
|
+
"image": (153, 255, 51, 100),
|
738
|
+
# Bright Green (from 'imgs_body_list')
|
739
|
+
"figure": (153, 255, 51, 100),
|
740
|
+
"chart": (216, 191, 216, 100), # Thistle
|
741
|
+
# Pale Yellow-Green (from 'tables_footnote_list')
|
742
|
+
"reference": (229, 255, 204, 100),
|
743
|
+
# "reference_content": (229, 255, 204, 100),
|
744
|
+
"algorithm": (255, 250, 240, 100), # Floral White
|
745
|
+
}
|
2383
746
|
default_color = (158, 158, 158, 100)
|
2384
747
|
return label_colors.get(label, default_color)
|
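The reworked get_show_color returns RGBA tuples whose alpha of 100 suits translucent layout overlays. A small usage sketch, assuming Pillow is available; the labels, boxes, and canvas size are invented for illustration:

from PIL import Image, ImageDraw

label_colors = {"text": (153, 0, 76, 100), "table": (204, 204, 0, 100)}
default_color = (158, 158, 158, 100)

canvas = Image.new("RGBA", (400, 300), (255, 255, 255, 255))
overlay = Image.new("RGBA", canvas.size, (0, 0, 0, 0))
draw = ImageDraw.Draw(overlay)
for label, box in [("text", (20, 20, 380, 120)), ("table", (20, 140, 380, 280))]:
    # Fill each layout box with the label's translucent color.
    draw.rectangle(box, fill=label_colors.get(label, default_color))
result = Image.alpha_composite(canvas, overlay)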