paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +11 -59
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  44. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  45. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  46. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  48. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  49. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  51. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  52. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  53. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  54. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  57. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  58. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  59. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  60. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  61. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  63. paddlex/inference/models/formula_recognition/predictor.py +7 -1
  64. paddlex/inference/models/formula_recognition/processors.py +92 -79
  65. paddlex/inference/models/formula_recognition/result.py +28 -27
  66. paddlex/inference/models/image_feature/processors.py +3 -4
  67. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  68. paddlex/inference/models/object_detection/predictor.py +2 -0
  69. paddlex/inference/models/object_detection/processors.py +28 -3
  70. paddlex/inference/models/object_detection/utils.py +2 -0
  71. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  72. paddlex/inference/models/text_detection/predictor.py +8 -0
  73. paddlex/inference/models/text_detection/processors.py +44 -10
  74. paddlex/inference/models/text_detection/result.py +0 -10
  75. paddlex/inference/pipelines/__init__.py +9 -5
  76. paddlex/inference/pipelines/_parallel.py +172 -0
  77. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  78. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  79. paddlex/inference/pipelines/base.py +14 -4
  80. paddlex/inference/pipelines/components/faisser.py +1 -1
  81. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  82. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  83. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  84. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  85. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  86. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  87. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  88. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  89. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
  90. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  91. paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
  92. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  93. paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
  94. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  95. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  96. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  97. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  98. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  99. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  100. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  101. paddlex/inference/pipelines/ocr/result.py +19 -16
  102. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  103. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  104. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  105. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  106. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
  107. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  108. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  109. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  110. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  112. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  113. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  114. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  115. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  116. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  117. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  118. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  119. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  120. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  121. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  122. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  123. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  127. paddlex/inference/serving/infra/utils.py +20 -22
  128. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  129. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  130. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  131. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  132. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  133. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  134. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  135. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  136. paddlex/inference/utils/hpi.py +8 -1
  137. paddlex/inference/utils/hpi_model_info_collection.json +81 -2
  138. paddlex/inference/utils/io/readers.py +12 -12
  139. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  140. paddlex/inference/utils/official_models.py +14 -0
  141. paddlex/inference/utils/pp_option.py +29 -8
  142. paddlex/model.py +2 -2
  143. paddlex/modules/__init__.py +1 -1
  144. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  145. paddlex/modules/base/__init__.py +1 -1
  146. paddlex/modules/base/evaluator.py +5 -5
  147. paddlex/modules/base/trainer.py +1 -1
  148. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  149. paddlex/modules/doc_vlm/evaluator.py +2 -2
  150. paddlex/modules/doc_vlm/exportor.py +2 -2
  151. paddlex/modules/doc_vlm/model_list.py +1 -1
  152. paddlex/modules/doc_vlm/trainer.py +2 -2
  153. paddlex/modules/face_recognition/evaluator.py +2 -2
  154. paddlex/modules/formula_recognition/evaluator.py +5 -2
  155. paddlex/modules/formula_recognition/model_list.py +3 -0
  156. paddlex/modules/formula_recognition/trainer.py +3 -0
  157. paddlex/modules/general_recognition/evaluator.py +1 -1
  158. paddlex/modules/image_classification/evaluator.py +2 -2
  159. paddlex/modules/image_classification/model_list.py +1 -0
  160. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  161. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  162. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  163. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  164. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  165. paddlex/modules/object_detection/evaluator.py +2 -2
  166. paddlex/modules/object_detection/model_list.py +2 -0
  167. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  168. paddlex/modules/table_recognition/evaluator.py +2 -2
  169. paddlex/modules/text_detection/evaluator.py +2 -2
  170. paddlex/modules/text_detection/model_list.py +2 -0
  171. paddlex/modules/text_recognition/evaluator.py +2 -2
  172. paddlex/modules/text_recognition/model_list.py +2 -0
  173. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  174. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  175. paddlex/modules/ts_classification/evaluator.py +2 -2
  176. paddlex/modules/ts_forecast/evaluator.py +2 -2
  177. paddlex/modules/video_classification/evaluator.py +2 -2
  178. paddlex/modules/video_detection/evaluator.py +2 -2
  179. paddlex/ops/__init__.py +2 -2
  180. paddlex/paddlex_cli.py +19 -13
  181. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  182. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  183. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  184. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  185. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  186. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  187. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  188. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  189. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  190. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  191. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  192. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  193. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  194. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  195. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  196. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  197. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  198. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  200. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  201. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  202. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  203. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  204. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  205. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  206. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  207. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  208. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  209. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  210. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  211. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  212. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  213. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  214. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  215. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  216. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  217. paddlex/repo_apis/base/config.py +1 -1
  218. paddlex/repo_manager/core.py +3 -3
  219. paddlex/repo_manager/meta.py +6 -2
  220. paddlex/repo_manager/repo.py +17 -16
  221. paddlex/utils/custom_device_list.py +26 -2
  222. paddlex/utils/deps.py +1 -1
  223. paddlex/utils/device.py +15 -8
  224. paddlex/utils/env.py +4 -0
  225. paddlex/utils/flags.py +2 -4
  226. paddlex/utils/fonts/__init__.py +34 -4
  227. paddlex/utils/misc.py +1 -1
  228. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
  229. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
  230. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  231. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  232. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
  233. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1144 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Tuple
16
+
17
+ import numpy as np
18
+
19
+ from ..result_v2 import LayoutParsingBlock, LayoutParsingRegion
20
+ from ..setting import BLOCK_LABEL_MAP, XYCUT_SETTINGS
21
+ from ..utils import calculate_projection_overlap_ratio
22
+
23
+
24
def get_nearest_edge_distance(
    bbox1: List[int],
    bbox2: List[int],
    weight: List[float] = None,
) -> float:
    """
    Calculate the weighted nearest-edge distance between two bounding boxes.

    The distance is 0.0 when the boxes overlap on both projections. Otherwise,
    for every axis on which the projections do not overlap, the smaller of the
    two edge-to-edge gaps is multiplied by a directional weight and the
    per-axis results are summed.

    Args:
        bbox1 (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
        bbox2 (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
        weight (list, optional): Directional weights [left, right, up, down].
            Index 0 is applied when ``bbox1`` ends before ``bbox2`` starts on
            the x axis (``x2 < x1'``), index 1 otherwise; index 2 is applied
            when ``y2 < y1'``, index 3 otherwise.
            Defaults to [1.0, 1.0, 1.0, 1.0] when omitted or None.

    Returns:
        float: The calculated minimum edge distance between the bounding boxes.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if weight is None:
        weight = [1.0, 1.0, 1.0, 1.0]

    x1, y1, x2, y2 = bbox1
    x1_prime, y1_prime, x2_prime, y2_prime = bbox2

    horizontal_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "horizontal")
    vertical_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "vertical")

    # Overlap on both projections: the boxes are treated as touching.
    if horizontal_iou > 0 and vertical_iou > 0:
        return 0.0

    min_x_distance, min_y_distance = 0, 0
    if horizontal_iou == 0:
        x_weight = weight[0] if x2 < x1_prime else weight[1]
        min_x_distance = min(abs(x1 - x2_prime), abs(x2 - x1_prime)) * x_weight
    if vertical_iou == 0:
        y_weight = weight[2] if y2 < y1_prime else weight[3]
        min_y_distance = min(abs(y1 - y2_prime), abs(y2 - y1_prime)) * y_weight

    return min_x_distance + min_y_distance
57
+
58
+
59
def projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
    """
    Generate a 1D projection histogram from bounding boxes along a specified axis.

    Each box contributes +1 to every integer position in
    ``[abs(min_coord), abs(max_coord))`` along the chosen axis. When any
    coordinate on that axis is negative, the histogram length is the absolute
    value of the smallest coordinate; otherwise it is the largest coordinate.

    Args:
        boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
            Non-integer coordinates are truncated to integers.
        axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).

    Returns:
        A 1D numpy array representing the projection histogram based on bounding
        box intervals. An empty array is returned when ``boxes`` is empty.
    """
    assert axis in [0, 1]

    boxes = np.asarray(boxes)
    # np.min / np.max raise on empty input, so return an empty profile instead.
    if boxes.size == 0:
        return np.zeros(0, dtype=int)

    # Columns (axis, axis + 2) hold the min and max coordinate along `axis`.
    intervals = boxes[:, axis::2]

    lowest = np.min(intervals)
    if lowest < 0:
        max_length = abs(lowest)
    else:
        max_length = np.max(intervals)

    # int() / astype(int) keep integer input unchanged and make float input usable
    # as an array length and as slice bounds.
    projection = np.zeros(int(max_length), dtype=int)

    # Increment projection histogram over the interval defined by each bounding box
    for start, end in np.abs(intervals).astype(int):
        projection[start:end] += 1

    return projection
86
+
87
+
88
def split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
    """
    Cut a projection profile into runs separated by sufficiently wide gaps.

    Args:
        arr_values: 1D array representing the projection profile.
        min_value: Positions whose value is strictly greater than this count as occupied.
        min_gap: Two consecutive occupied positions whose index difference is
            strictly greater than this start a new segment.

    Returns:
        ``None`` when no position is occupied; otherwise a tuple
        ``(segment_starts, segment_ends)`` of equally long index arrays.
    """
    occupied = np.nonzero(arr_values > min_value)[0]
    if len(occupied) == 0:
        return

    # Position k in `breaks` means: a gap lies between occupied[k] and occupied[k + 1].
    breaks = np.nonzero(np.diff(occupied) > min_gap)[0]

    # The first segment opens at the first occupied index; every further one
    # opens right after a gap.
    segment_starts = np.concatenate((occupied[:1], occupied[breaks + 1]))

    # NOTE(review): interior ends are the last occupied index itself, while only
    # the final end is shifted by one - confirm callers expect this asymmetry.
    segment_ends = np.concatenate((occupied[breaks], occupied[-1:] + 1))

    return segment_starts, segment_ends
121
+
122
+
123
def recursive_yx_cut(
    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
):
    """
    Recursively segment bounding boxes by projecting on the Y-axis first, then on the X-axis.

    Args:
        boxes: A (N, 4) array representing bounding boxes.
        indices: Original positions of the boxes, parallel to ``boxes``.
        res: Output list; indices of fully segmented boxes are appended to it.
        min_gap (int): Minimum gap width used for the X-axis split at this level.
            The recursive calls do not forward it and use the default. Defaults to 1.

    Returns:
        None: This function modifies the `res` list in place.
    """
    assert len(boxes) == len(
        indices
    ), "The length of boxes and indices must be the same."

    # Arrange boxes and their original indices by y_min.
    order_by_y = boxes[:, 1].argsort()
    boxes_by_y = boxes[order_by_y]
    ids_by_y = np.array(indices)[order_by_y]

    # Horizontal bands come from the Y projection (fixed gap threshold of 1).
    y_segments = split_projection_profile(
        projection_by_bboxes(boxes=boxes_by_y, axis=1), 0, 1
    )
    if not y_segments:
        return

    tops = boxes_by_y[:, 1]
    for band_top, band_bottom in zip(*y_segments):
        # A box belongs to the band when its y_min falls in [band_top, band_bottom).
        in_band = (band_top <= tops) & (tops < band_bottom)
        band_boxes = boxes_by_y[in_band]
        band_ids = ids_by_y[in_band]

        # Inside the band, arrange by x_min before projecting on X.
        order_by_x = band_boxes[:, 0].argsort()
        band_boxes = band_boxes[order_by_x]
        band_ids = band_ids[order_by_x]

        x_segments = split_projection_profile(
            projection_by_bboxes(boxes=band_boxes, axis=0), 0, min_gap
        )
        if not x_segments:
            continue

        # A single X segment means the band cannot be split any further.
        if len(x_segments[0]) == 1:
            res.extend(band_ids)
            continue

        lefts = band_boxes[:, 0]
        # Negative x coordinates: visit the X segments in reverse order.
        if np.min(lefts) < 0:
            x_segments = np.flip(x_segments, axis=1)

        abs_lefts = abs(lefts)
        for col_left, col_right in zip(*x_segments):
            in_column = (col_left <= abs_lefts) & (abs_lefts < col_right)
            recursive_yx_cut(
                band_boxes[in_column],
                band_ids[in_column],
                res,
            )
192
+
193
+
194
def recursive_xy_cut(
    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
):
    """
    Recursively segment bounding boxes by projecting on the X-axis first, then on the Y-axis.

    Args:
        boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
        indices: Original positions of the boxes, parallel to ``boxes``.
        res: Output list; indices of fully segmented boxes are appended to it.
        min_gap (int): Minimum gap width used for the Y-axis split at this level.
            The recursive calls do not forward it and use the default. Defaults to 1.

    Returns:
        None: This function modifies the `res` list in place.
    """
    assert len(boxes) == len(
        indices
    ), "The length of boxes and indices must be the same."

    # Arrange boxes and their original indices by x_min.
    order_by_x = boxes[:, 0].argsort()
    boxes_by_x = boxes[order_by_x]
    ids_by_x = np.array(indices)[order_by_x]

    # Vertical columns come from the X projection (fixed gap threshold of 1).
    x_segments = split_projection_profile(
        projection_by_bboxes(boxes=boxes_by_x, axis=0), 0, 1
    )
    if not x_segments:
        return

    lefts = boxes_by_x[:, 0]
    # Negative x coordinates: visit the X segments in reverse order.
    if np.min(lefts) < 0:
        x_segments = np.flip(x_segments, axis=1)

    abs_lefts = abs(lefts)
    for col_left, col_right in zip(*x_segments):
        # A box belongs to the column when |x_min| falls in [col_left, col_right).
        in_column = (col_left <= abs_lefts) & (abs_lefts < col_right)
        column_boxes = boxes_by_x[in_column]
        column_ids = ids_by_x[in_column]

        # Inside the column, arrange by y_min before projecting on Y.
        order_by_y = column_boxes[:, 1].argsort()
        column_boxes = column_boxes[order_by_y]
        column_ids = column_ids[order_by_y]

        y_segments = split_projection_profile(
            projection_by_bboxes(boxes=column_boxes, axis=1), 0, min_gap
        )
        if not y_segments:
            continue

        # A single Y segment means the column cannot be split any further.
        if len(y_segments[0]) == 1:
            res.extend(column_ids)
            continue

        tops = column_boxes[:, 1]
        for band_top, band_bottom in zip(*y_segments):
            in_band = (band_top <= tops) & (tops < band_bottom)
            recursive_xy_cut(
                column_boxes[in_band],
                column_ids[in_band],
                res,
            )
264
+
265
+
266
def reference_insert(
    block: LayoutParsingBlock,
    sorted_blocks: List[LayoutParsingBlock],
    **kwargs,
):
    """
    Insert a reference block right after the chosen anchor among the sorted blocks above it.

    Only sorted blocks whose bottom edge (``bbox[3]``) is not below the top edge
    (``bbox[1]``) of ``block`` are candidates. The candidate with the largest
    ``bbox[2] * 10 + bbox[3]`` wins (the first one on ties). When there is no
    candidate, index 0 is used as the anchor.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The sorted blocks where the new block will be inserted (modified in place).
        **kwargs: Accepted and ignored.

    Returns:
        sorted_blocks: The updated sorted blocks after insertion.
    """
    best_score = float("inf")
    anchor_idx = 0
    block_top = block.bbox[1]

    for idx, candidate in enumerate(sorted_blocks):
        if candidate.bbox[3] <= block_top:
            # Negated so that "smaller is better" selects the largest combined value.
            score = -(candidate.bbox[2] * 10 + candidate.bbox[3])
            if score < best_score:
                best_score, anchor_idx = score, idx

    sorted_blocks.insert(anchor_idx + 1, block)
    return sorted_blocks
294
+
295
+
296
def manhattan_insert(
    block: LayoutParsingBlock,
    sorted_blocks: List[LayoutParsingBlock],
    **kwargs,
):
    """
    Insert a block right after the sorted block with the smallest Manhattan distance to it.

    The distance is computed by ``_manhattan_distance`` on the two ``bbox``
    values; the first block wins on ties. With an empty list the anchor index
    stays 0.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The sorted blocks where the new block will be inserted (modified in place).
        **kwargs: Accepted and ignored.

    Returns:
        sorted_blocks: The updated sorted blocks after insertion.
    """
    best_distance = float("inf")
    anchor_idx = 0

    for idx, candidate in enumerate(sorted_blocks):
        candidate_distance = _manhattan_distance(block.bbox, candidate.bbox)
        if candidate_distance < best_distance:
            best_distance, anchor_idx = candidate_distance, idx

    sorted_blocks.insert(anchor_idx + 1, block)
    return sorted_blocks
323
+
324
+
325
def weighted_distance_insert(
    block: LayoutParsingBlock,
    sorted_blocks: List[LayoutParsingBlock],
    region: LayoutParsingRegion,
):
    """
    Insert a block into a sorted list of blocks based on the weighted distance between the block and the nearest sorted block.

    For every sorted block a score is built from three terms: the directional
    edge distance, an "up edge" term and a "left edge" term, each scaled by the
    matching entry of ``XYCUT_SETTINGS["distance_weight_map"]``. The block is
    inserted at (or right after) the position of the lowest-scoring sorted block.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The sorted blocks where the new block will be inserted (modified in place).
        region: Region the blocks belong to; its ``direction`` and
            ``text_line_width`` attributes are read here.

    Returns:
        sorted_blocks: The updated sorted blocks after insertion.
    """

    tolerance_len = XYCUT_SETTINGS["edge_distance_compare_tolerance_len"]
    x1, y1, x2, y2 = block.bbox
    min_weighted_distance, min_edge_distance, min_up_edge_distance = (
        float("inf"),
        float("inf"),
        float("inf"),
    )
    nearest_sorted_block_index = 0
    for sorted_block_idx, sorted_block in enumerate(sorted_blocks):

        x1_prime, y1_prime, x2_prime, y2_prime = sorted_block.bbox

        # Calculate edge distance
        weight = _get_weights(block.order_label, block.direction)
        edge_distance = get_nearest_edge_distance(block.bbox, sorted_block.bbox, weight)

        # Document titles: the tolerance is at least the region's text line width (min 1).
        if block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
            disperse = max(1, region.text_line_width)
            tolerance_len = max(tolerance_len, disperse)
        # NOTE(review): tolerance_len is not reset per iteration, so for "abstract"
        # blocks this doubling compounds with every sorted block - confirm intended.
        if block.label == "abstract":
            tolerance_len *= 2
            edge_distance = max(0.1, edge_distance) * 10

        # Calculate up edge distances
        # Horizontal regions use the top / left coordinate of the sorted block;
        # otherwise the negated right coordinate and the top coordinate are used.
        up_edge_distance = y1_prime if region.direction == "horizontal" else -x2_prime
        left_edge_distance = x1_prime if region.direction == "horizontal" else y1_prime
        # True when the sorted block ends before `block` starts along the region's
        # reading axis (above it for horizontal, to its right otherwise).
        is_below_sorted_block = (
            y2_prime < y1 if region.direction == "horizontal" else x1_prime > x2
        )

        # For these label groups, flip both positional terms when `block` lies
        # after the sorted block.
        if (
            block.label not in BLOCK_LABEL_MAP["unordered_labels"]
            or block.label in BLOCK_LABEL_MAP["doc_title_labels"]
            or block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]
            or block.label in BLOCK_LABEL_MAP["vision_labels"]
        ) and is_below_sorted_block:
            up_edge_distance = -up_edge_distance
            left_edge_distance = -left_edge_distance

        # Snap to the running minimum when within tolerance, so near-equal
        # up-edge values do not decide the ranking.
        if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
            up_edge_distance = min_up_edge_distance

        # Calculate weighted distance
        weighted_distance = (
            +edge_distance
            * XYCUT_SETTINGS["distance_weight_map"].get("edge_weight", 10**4)
            + up_edge_distance
            * XYCUT_SETTINGS["distance_weight_map"].get("up_edge_weight", 1)
            + left_edge_distance
            * XYCUT_SETTINGS["distance_weight_map"].get("left_edge_weight", 0.0001)
        )

        # NOTE(review): min_edge_distance is updated here but never read in this function.
        min_edge_distance = min(edge_distance, min_edge_distance)
        min_up_edge_distance = min(up_edge_distance, min_up_edge_distance)

        if weighted_distance < min_weighted_distance:
            nearest_sorted_block_index = sorted_block_idx
            min_weighted_distance = weighted_distance
            # Insert after the nearest block when `block` starts lower, or at the
            # same height but further to the right.
            if y1 > y1_prime or (y1 == y1_prime and x1 > x1_prime):
                nearest_sorted_block_index = sorted_block_idx + 1

    sorted_blocks.insert(nearest_sorted_block_index, block)
    return sorted_blocks
406
+
407
+
408
def insert_child_blocks(
    block: LayoutParsingBlock,
    block_idx: int,
    sorted_blocks: List[LayoutParsingBlock],
) -> List[LayoutParsingBlock]:
    """
    Expand a parent block in the sorted list into itself plus its child blocks.

    The parent is appended to the list returned by ``block.get_child_blocks()``,
    that list is ordered with ``sort_child_blocks`` (using the direction of its
    first element), and the result replaces the parent's slot starting at
    ``block_idx``. Blocks without children leave the list untouched.

    Args:
        block: The parent block whose child blocks need to be inserted.
        block_idx: Index at which the parent block exists in the sorted blocks list.
        sorted_blocks: Sorted blocks list where the child blocks are to be inserted (modified in place).

    Returns:
        sorted_blocks: Updated sorted blocks list after inserting child blocks.
    """
    if not block.child_blocks:
        return sorted_blocks

    family = block.get_child_blocks()
    family.append(block)
    family = sort_child_blocks(family, family[0].direction)

    # The first member takes over the parent's slot; the rest follow directly after it.
    sorted_blocks[block_idx] = family[0]
    for offset, member in enumerate(family[1:], start=1):
        sorted_blocks.insert(block_idx + offset, member)

    return sorted_blocks
433
+
434
+
435
def sort_child_blocks(blocks, direction="horizontal") -> List[LayoutParsingBlock]:
    """
    Order child blocks in place by the top-left corner of their bounding box.

    Args:
        blocks: List of block objects exposing a ``bbox`` sequence whose first
            two entries are used as x_min and y_min.
        direction: 'horizontal' sorts top-to-bottom then left-to-right; any
            other value sorts right-to-left then top-to-bottom.
    Returns:
        sorted_blocks: The same list object, now sorted.
    """

    def _top_to_bottom(item):
        x_min, y_min = item.bbox[0], item.bbox[1]
        # (y_min, x_min, squared distance to the origin)
        return (y_min, x_min, y_min**2 + x_min**2)

    def _right_to_left(item):
        x_min, y_min = item.bbox[0], item.bbox[1]
        # (-x_min, y_min, y_min^2 - x_min^2)
        return (-x_min, y_min, y_min**2 - x_min**2)

    ordering = _top_to_bottom if direction == "horizontal" else _right_to_left
    blocks.sort(key=ordering)
    return blocks
464
+
465
+
466
+ def _get_weights(label, direction="horizontal"):
467
+ """Define weights based on the label and direction."""
468
+ if label == "doc_title":
469
+ return (
470
+ [1, 0.1, 0.1, 1] if direction == "horizontal" else [0.2, 0.1, 1, 1]
471
+ ) # left-down , right-left
472
+ elif label in [
473
+ "paragraph_title",
474
+ "table_title",
475
+ "abstract",
476
+ "image",
477
+ "seal",
478
+ "chart",
479
+ "figure",
480
+ ]:
481
+ return [1, 1, 0.1, 1] # down
482
+ else:
483
+ return [1, 1, 1, 0.1] # up
484
+
485
+
486
+ def _manhattan_distance(
487
+ point1: Tuple[float, float],
488
+ point2: Tuple[float, float],
489
+ weight_x: float = 1.0,
490
+ weight_y: float = 1.0,
491
+ ) -> float:
492
+ """
493
+ Calculate the weighted Manhattan distance between two points.
494
+
495
+ Args:
496
+ point1 (Tuple[float, float]): The first point as (x, y).
497
+ point2 (Tuple[float, float]): The second point as (x, y).
498
+ weight_x (float): The weight for the x-axis distance. Default is 1.0.
499
+ weight_y (float): The weight for the y-axis distance. Default is 1.0.
500
+
501
+ Returns:
502
+ float: The weighted Manhattan distance between the two points.
503
+ """
504
+ return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
505
+
506
+
507
def sort_normal_blocks(blocks, text_line_height, text_line_width, region_direction):
    """
    Sort blocks in place into reading order using a coarse line/column grid.

    This function used to be defined twice in a row; the first definition was
    shadowed by the second and never ran. Only the effective (second)
    definition is kept here, so runtime behaviour is unchanged.

    Args:
        blocks: List of objects exposing ``bbox`` as
            ``[x_min, y_min, x_max, y_max]``. The list is sorted in place.
        text_line_height: Bucket size for y coordinates (floor division).
        text_line_width: Bucket size for x coordinates (floor division).
        region_direction: 'horizontal' sorts by row bucket, then column
            bucket; any other value sorts by column bucket from right to
            left, then by row bucket.

    Returns:
        The same ``blocks`` list, now sorted.
    """
    if region_direction == "horizontal":
        blocks.sort(
            key=lambda x: (
                x.bbox[1] // text_line_height,
                x.bbox[0] // text_line_width,
                # tie-break: squared distance of the top-left corner to (0, 0)
                x.bbox[1] ** 2 + x.bbox[0] ** 2,
            ),
        )
    else:
        blocks.sort(
            key=lambda x: (
                # unary minus binds tighter than //, so this is
                # floor(-x_min / width): larger x_min sorts first
                -x.bbox[0] // text_line_width,
                x.bbox[1] // text_line_height,
                # tie-break: larger x_max^2 + y_min^2 sorts first
                -(x.bbox[2] ** 2 + x.bbox[1] ** 2),
            ),
        )
    return blocks
545
+
546
+
547
def get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels=None):
    """
    Group blocks into consecutive bands delimited by the cut coordinates.

    Args:
        blocks (list): Blocks to be grouped. Each block must expose ``bbox``
            and ``order_label``. The list is sorted in place by the block end
            coordinate on the cut axis (``bbox[2]`` for "horizontal",
            ``bbox[3]`` otherwise).
        cut_direction (str): Cut direction, either "horizontal" or "vertical".
        cut_coordinates (list): Cut coordinates. Duplicates are ignored and
            the list is not modified.
        mask_labels (list, optional): ``order_label`` values to leave out of
            the groups. Defaults to no masking.

    Returns:
        list: A list of non-empty groups (lists of blocks). A block belongs
            to the first band whose cut coordinate is not smaller than the
            block's end coordinate; a final open-ended band collects the rest.
    """
    if mask_labels is None:
        mask_labels = ()

    # 0: horizontal, 1: vertical
    cut_axis = 0 if cut_direction == "horizontal" else 1
    end_index = cut_axis + 2
    blocks.sort(key=lambda x: x.bbox[end_index])

    # Build the boundaries on a copy so the caller's list is not polluted
    # with the trailing infinity sentinel.
    boundaries = sorted(set(cut_coordinates) | {float("inf")})

    cuted_list = []
    cut_idx = 0
    for boundary in boundaries:
        group_blocks = []
        block_idx = cut_idx
        while block_idx < len(blocks):
            block = blocks[block_idx]
            if block.bbox[end_index] > boundary:
                break
            if block.order_label not in mask_labels:
                group_blocks.append(block)
            block_idx += 1
        # resume the next band where this one stopped
        cut_idx = block_idx
        if group_blocks:
            cuted_list.append(group_blocks)

    return cuted_list
586
+
587
+
588
def add_split_block(
    blocks: List[LayoutParsingBlock], region_bbox: List[int]
) -> List[LayoutParsingBlock]:
    """
    Append "split" blocks that fill large vertical gaps between block groups.

    The block bounding boxes are projected with
    ``calculate_discontinuous_projection(..., direction="vertical")``. For
    every pair of consecutive projected intervals separated by more than 40
    units, a block labelled "split" is appended. It spans the region's
    x-range and the gap, shrunk by 5 units on both sides.

    Args:
        blocks: Blocks of the region; the list is extended in place.
        region_bbox: Region bounding box ``[x1, y1, x2, y2]``; only x1 and x2
            are used.

    Returns:
        The same ``blocks`` list. It was previously annotated but never
        returned; callers that ignore the return value are unaffected.
    """
    # Nothing to project; also avoids indexing into an empty array below.
    if not blocks:
        return blocks

    min_gap_len = 40
    margin = 5

    block_bboxes = np.array([block.bbox for block in blocks])
    discontinuous = calculate_discontinuous_projection(
        block_bboxes, direction="vertical"
    )
    current_interval = discontinuous[0]
    for interval in discontinuous[1:]:
        gap_len = interval[0] - current_interval[1]
        if gap_len > min_gap_len:
            x1, _, x2, __ = region_bbox
            y1 = current_interval[1] + margin
            y2 = interval[0] - margin
            split_block = LayoutParsingBlock(label="split", bbox=[x1, y1, x2, y2])
            blocks.append(split_block)
        current_interval = interval
    return blocks
606
+
607
+
608
def get_nearest_blocks(
    block: LayoutParsingBlock,
    ref_blocks: List[LayoutParsingBlock],
    overlap_threshold,
    direction="horizontal",
) -> List:
    """
    Collect the reference blocks that overlap ``block`` in projection, split
    into those starting at or before it and those starting after it.

    Args:
        block (LayoutParsingBlock): The current block.
        ref_blocks (List[LayoutParsingBlock]): Candidate neighbour blocks; the
            entry whose ``index`` equals ``block.index`` is ignored.
        overlap_threshold: A candidate is kept only when its projection
            overlap ratio (mode "small") is strictly greater than this value.
        direction (str): Direction passed to the overlap computation. For
            "horizontal" blocks are compared on ``bbox[1]``, otherwise on
            ``bbox[0]``.
    Returns:
        Tuple of two lists ``(prev_blocks, post_blocks)``: ``prev_blocks`` is
        sorted by descending start coordinate, ``post_blocks`` by ascending
        start coordinate.
    """
    axis = 1 if direction == "horizontal" else 0
    before: List[LayoutParsingBlock] = []
    after: List[LayoutParsingBlock] = []

    for candidate in ref_blocks:
        if candidate.index == block.index:
            continue
        ratio = calculate_projection_overlap_ratio(
            block.bbox, candidate.bbox, direction, mode="small"
        )
        if not (ratio > overlap_threshold):
            continue
        bucket = before if candidate.bbox[axis] <= block.bbox[axis] else after
        bucket.append(candidate)

    def _start_coordinate(item):
        return item.bbox[axis]

    before.sort(key=_start_coordinate, reverse=True)
    after.sort(key=_start_coordinate)

    return before, after
646
+
647
+
648
def get_adjacent_blocks_by_direction(
    blocks: List[LayoutParsingBlock],
    block_idx: int,
    ref_block_idxes: List[int],
    iou_threshold,
) -> List:
    """
    Find the nearest preceding and following reference blocks of a block.

    NOTE(review): the nesting of this body was reconstructed from a flattened
    listing; confirm the if/elif structure against the original file.

    Args:
        blocks (List[LayoutParsingBlock]): A list of all blocks.
        block_idx (int): Index into ``blocks`` of the current block.
        ref_block_idxes (List[int]): Indices into ``blocks`` of the reference
            blocks to examine.
        iou_threshold (float): Minimum projection overlap ratio (computed
            along the reference block's direction) for a reference block to
            be considered.
    Returns:
        Tuple ``(prev_block_index, post_block_index)``; each entry is an index
        into ``blocks`` or None when no suitable block was found.
    """
    min_prev_block_distance = float("inf")
    prev_block_index = None
    min_post_block_distance = float("inf")
    post_block_index = None
    block = blocks[block_idx]
    # reference blocks already carrying one of these order labels are skipped
    child_labels = [
        "vision_footnote",
        "sub_paragraph_title",
        "doc_title_text",
        "vision_title",
    ]

    # find the nearest text block with same direction to the current block
    for ref_block_idx in ref_block_idxes:
        ref_block = blocks[ref_block_idx]
        ref_block_direction = ref_block.direction
        if ref_block.order_label in child_labels:
            continue
        match_block_iou = calculate_projection_overlap_ratio(
            block.bbox,
            ref_block.bbox,
            ref_block_direction,
        )

        # slack allowed when deciding whether the reference lies before/after
        child_match_distance_tolerance_len = block.short_side_length / 10

        # maximum gap for the reference block to be accepted as a neighbour
        if block.order_label == "vision":
            if ref_block.num_of_lines == 1:
                gap_tolerance_len = ref_block.short_side_length * 2
            else:
                gap_tolerance_len = block.short_side_length / 10
        else:
            gap_tolerance_len = block.short_side_length * 2

        if match_block_iou >= iou_threshold:
            # gaps are bucketed in steps of 5; the small start_coordinate term
            # orders reference blocks that fall into the same bucket
            prev_distance = (
                block.secondary_direction_start_coordinate
                - ref_block.secondary_direction_end_coordinate
                + child_match_distance_tolerance_len
            ) // 5 + ref_block.start_coordinate / 5000
            next_distance = (
                ref_block.secondary_direction_start_coordinate
                - block.secondary_direction_end_coordinate
                + child_match_distance_tolerance_len
            ) // 5 + ref_block.start_coordinate / 5000
            if (
                ref_block.secondary_direction_end_coordinate
                <= block.secondary_direction_start_coordinate
                + child_match_distance_tolerance_len
                and prev_distance < min_prev_block_distance
            ):
                # closest preceding block so far; the minimum is updated even
                # when the gap is too large for the index to be recorded
                min_prev_block_distance = prev_distance
                if (
                    block.secondary_direction_start_coordinate
                    - ref_block.secondary_direction_end_coordinate
                    < gap_tolerance_len
                ):
                    prev_block_index = ref_block_idx
            elif (
                ref_block.secondary_direction_start_coordinate
                > block.secondary_direction_end_coordinate
                - child_match_distance_tolerance_len
                and next_distance < min_post_block_distance
            ):
                # closest following block so far (same rule as above)
                min_post_block_distance = next_distance
                if (
                    ref_block.secondary_direction_start_coordinate
                    - block.secondary_direction_end_coordinate
                    < gap_tolerance_len
                ):
                    post_block_index = ref_block_idx

    diff_dist = abs(min_prev_block_distance - min_post_block_distance)

    # if the difference in distance is too large, only consider the nearest one
    if diff_dist * 5 > block.short_side_length:
        if min_prev_block_distance < min_post_block_distance:
            post_block_index = None
        else:
            prev_block_index = None

    return prev_block_index, post_block_index
747
+
748
+
749
def update_doc_title_child_blocks(
    block: LayoutParsingBlock,
    region: LayoutParsingRegion,
) -> None:
    """
    Attach neighbouring text blocks to a document title block as children.

    Only the closest overlapping normal-text block before the title and the
    closest one after it are examined. A candidate becomes a child when all
    of the following hold:
        1. It has the same direction as the title block.
        2. Its label is one of the text labels.
        3. Its short side is shorter than 80% of the title's short side.
        4. Its long side is either shorter than the title's long side or
           longer than 1.5 times the title's long side.
        5. It has fewer than 3 lines.
        6. Its nearest edge distance to the title is below twice its own text
           line height.

    Args:
        block (LayoutParsingBlock): document title block.
        region (LayoutParsingRegion): region that owns the block map and the
            list of normal text block indices; accepted children are removed
            from that list.

    Returns:
        None

    """
    text_candidates = [
        region.block_map[idx] for idx in region.normal_text_block_idxes
    ]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    prev_blocks, post_blocks = get_nearest_blocks(
        block, text_candidates, overlap_threshold, block.direction
    )

    nearest_before = prev_blocks[0] if prev_blocks else None
    nearest_after = post_blocks[0] if post_blocks else None

    for candidate in (nearest_before, nearest_after):
        if candidate is None:
            continue

        same_direction = candidate.direction == block.direction
        short_side_ok = candidate.short_side_length < block.short_side_length * 0.8
        long_side_ok = (
            candidate.long_side_length < block.long_side_length
            or candidate.long_side_length > 1.5 * block.long_side_length
        )
        edge_gap = get_nearest_edge_distance(block.bbox, candidate.bbox)

        is_child = (
            same_direction
            and candidate.label in BLOCK_LABEL_MAP["text_labels"]
            and short_side_ok
            and long_side_ok
            and candidate.num_of_lines < 3
            and edge_gap < candidate.text_line_height * 2
        )
        if not is_child:
            continue

        candidate.order_label = "doc_title_text"
        block.append_child_block(candidate)
        region.normal_text_block_idxes.remove(candidate.index)
815
+
816
+
817
def update_paragraph_title_child_blocks(
    block: LayoutParsingBlock,
    region: LayoutParsingRegion,
) -> None:
    """
    Attach adjacent paragraph title blocks to a paragraph title as sub-titles.

    Neighbours before and after the block are walked from nearest to farthest.
    The walk on one side stops at the first neighbour whose label is not a
    paragraph title label. A neighbour becomes a child when:
        1. It has the same direction as the parent block.
        2. Its nearest edge distance to the parent is at most 1.5 times the
           smaller of the two text line heights.

    Blocks already marked as "sub_paragraph_title" are left untouched.

    Args:
        block (LayoutParsingBlock): paragraph title block.
        region (LayoutParsingRegion): region that owns the block map and the
            paragraph title / normal text index lists; accepted children are
            removed from the paragraph title index list.

    Returns:
        None

    """
    if block.order_label == "sub_paragraph_title":
        return

    candidate_idxes = (
        region.paragraph_title_block_idxes + region.normal_text_block_idxes
    )
    candidates = [region.block_map[idx] for idx in candidate_idxes]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    prev_blocks, post_blocks = get_nearest_blocks(
        block, candidates, overlap_threshold, block.direction
    )

    for side_blocks in (prev_blocks, post_blocks):
        for neighbour in side_blocks:
            if neighbour.label not in BLOCK_LABEL_MAP["paragraph_title_labels"]:
                break
            line_height = min(block.text_line_height, neighbour.text_line_height)
            edge_gap = get_nearest_edge_distance(block.bbox, neighbour.bbox)
            same_direction = neighbour.direction == block.direction
            if not (same_direction and edge_gap <= line_height * 1.5):
                continue
            neighbour.order_label = "sub_paragraph_title"
            block.append_child_block(neighbour)
            region.paragraph_title_block_idxes.remove(neighbour.index)
868
+
869
+
870
def update_vision_child_blocks(
    block: LayoutParsingBlock,
    region: LayoutParsingRegion,
) -> None:
    """
    Update the child blocks (titles and footnotes) of a vision block.

    NOTE(review): the nesting of this body, in particular the position of the
    ``break`` statements, was reconstructed from a flattened listing; confirm
    it against the original file.

    Candidates are the region's normal text blocks and vision title blocks
    that overlap the block in projection. They are examined first along the
    block's direction and then along its secondary direction; the second pass
    is skipped once a vision title has been attached.

    - vision_title: a candidate with a vision title label becomes a child when
      its nearest edge distance is at most twice its own text line height.
    - vision_footnote: at most one text candidate becomes a footnote. It must
      have the same direction as the block and satisfy size, distance and
      alignment checks (see the inline conditions, which differ slightly
      between preceding and following candidates).

    Args:
        block (LayoutParsingBlock): vision block.
        region (LayoutParsingRegion): region that owns the block map and the
            normal text / vision title index lists; accepted children are
            removed from the corresponding list.

    Returns:
        None

    """
    ref_blocks = [
        region.block_map[idx]
        for idx in region.normal_text_block_idxes + region.vision_title_block_idxes
    ]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    has_vision_footnote = False
    has_vision_title = False
    for direction in [block.direction, block.secondary_direction]:
        prev_blocks, post_blocks = get_nearest_blocks(
            block, ref_blocks, overlap_threshold, direction
        )
        # candidates that start at or before the vision block, nearest first
        for ref_block in prev_blocks:
            # stop at the first candidate that is neither text nor vision title
            if (
                ref_block.label
                not in BLOCK_LABEL_MAP["text_labels"]
                + BLOCK_LABEL_MAP["vision_title_labels"]
            ):
                break
            nearest_edge_distance = get_nearest_edge_distance(
                block.bbox, ref_block.bbox
            )
            block_center = block.get_centroid()
            ref_block_center = ref_block.get_centroid()
            if (
                ref_block.label in BLOCK_LABEL_MAP["vision_title_labels"]
                and nearest_edge_distance <= ref_block.text_line_height * 2
            ):
                has_vision_title = True
                ref_block.order_label = "vision_title"
                block.append_child_block(ref_block)
                region.vision_title_block_idxes.remove(ref_block.index)
            if ref_block.label in BLOCK_LABEL_MAP["text_labels"]:
                if (
                    not has_vision_footnote
                    and ref_block.direction == block.direction
                    and ref_block.long_side_length < block.long_side_length
                ):
                    # footnote if small, close and horizontally centred, or a
                    # single line whose left or right edge offset is below 10
                    if (
                        (
                            nearest_edge_distance <= block.text_line_height * 2
                            and ref_block.short_side_length < block.short_side_length
                            and ref_block.long_side_length
                            < 0.5 * block.long_side_length
                            and abs(block_center[0] - ref_block_center[0]) < 10
                        )
                        or (
                            block.bbox[0] - ref_block.bbox[0] < 10
                            and ref_block.num_of_lines == 1
                        )
                        or (
                            block.bbox[2] - ref_block.bbox[2] < 10
                            and ref_block.num_of_lines == 1
                        )
                    ):
                        has_vision_footnote = True
                        ref_block.order_label = "vision_footnote"
                        block.append_child_block(ref_block)
                        region.normal_text_block_idxes.remove(ref_block.index)
                # the first text candidate ends the scan on this side
                break
        # candidates that start after the vision block, nearest first
        for ref_block in post_blocks:
            # once a footnote exists, a text candidate ends the scan
            if (
                has_vision_footnote
                and ref_block.label in BLOCK_LABEL_MAP["text_labels"]
            ):
                break
            nearest_edge_distance = get_nearest_edge_distance(
                block.bbox, ref_block.bbox
            )
            block_center = block.get_centroid()
            ref_block_center = ref_block.get_centroid()
            if (
                ref_block.label in BLOCK_LABEL_MAP["vision_title_labels"]
                and nearest_edge_distance <= ref_block.text_line_height * 2
            ):
                has_vision_title = True
                ref_block.order_label = "vision_title"
                block.append_child_block(ref_block)
                region.vision_title_block_idxes.remove(ref_block.index)
            if ref_block.label in BLOCK_LABEL_MAP["text_labels"]:
                # here the size and distance checks are always required; only
                # the alignment check has the single-line alternatives
                if (
                    not has_vision_footnote
                    and nearest_edge_distance <= block.text_line_height * 2
                    and ref_block.short_side_length < block.short_side_length
                    and ref_block.long_side_length < 0.5 * block.long_side_length
                    and ref_block.direction == block.direction
                    and (
                        abs(block_center[0] - ref_block_center[0]) < 10
                        or (
                            block.bbox[0] - ref_block.bbox[0] < 10
                            and ref_block.num_of_lines == 1
                        )
                        or (
                            block.bbox[2] - ref_block.bbox[2] < 10
                            and ref_block.num_of_lines == 1
                        )
                    )
                ):
                    has_vision_footnote = True
                    ref_block.order_label = "vision_footnote"
                    block.append_child_block(ref_block)
                    region.normal_text_block_idxes.remove(ref_block.index)
                # the first text candidate ends the scan on this side
                break
        # a title found along the first direction makes the second pass unnecessary
        if has_vision_title:
            break
1005
+
1006
+
1007
def calculate_discontinuous_projection(
    boxes, direction="horizontal", return_num=False
) -> List:
    """
    Calculate the discontinuous projection of boxes along the specified direction.

    Each box is projected onto one axis and overlapping or touching
    projections are merged into maximal intervals.

    Args:
        boxes (ndarray): Array of bounding boxes represented by
            [[x_min, y_min, x_max, y_max]].
        direction (str): direction along which to perform the projection
            ('horizontal' uses x_min/x_max, 'vertical' uses y_min/y_max).
        return_num (bool): When True, also return how many boxes were merged
            into each interval.

    Returns:
        list: List of ``(start, end)`` tuples representing the merged
            intervals, ordered by start. With ``return_num=True`` a tuple
            ``(merged_intervals, num_list)`` is returned instead. Empty input
            yields empty list(s) rather than raising.

    Raises:
        ValueError: If ``direction`` is neither 'horizontal' nor 'vertical'.
    """
    if direction == "horizontal":
        columns = [0, 2]
    elif direction == "vertical":
        columns = [1, 3]
    else:
        raise ValueError("direction must be 'horizontal' or 'vertical'")

    boxes = np.array(boxes)
    # No boxes: there is nothing to project and nothing to index into.
    if boxes.size == 0:
        return ([], []) if return_num else []

    intervals = boxes[:, columns]
    intervals = intervals[np.argsort(intervals[:, 0])]

    merged_intervals = []
    num_list = []
    num = 1
    current_start, current_end = intervals[0]

    for start, end in intervals[1:]:
        if start <= current_end:
            # overlaps or touches the running interval: extend it
            num += 1
            current_end = max(current_end, end)
        else:
            num_list.append(num)
            merged_intervals.append((current_start, current_end))
            num = 1
            current_start, current_end = start, end

    # flush the last running interval
    num_list.append(num)
    merged_intervals.append((current_start, current_end))
    if return_num:
        return merged_intervals, num_list
    return merged_intervals
1050
+
1051
+
1052
def is_projection_consistent(blocks, intervals, direction="horizontal"):
    """
    Check the alignment of text blocks inside each projection interval.

    For every interval, the text-labelled blocks whose projection overlaps it
    are collected. The function returns False as soon as one interval has
    start coordinates spread over less than 5% of the interval length while
    the end coordinates are spread over at least 5% of it. Otherwise it
    returns True.

    Args:
        blocks: Iterable of blocks exposing ``bbox`` and ``label``.
        intervals: Iterable of ``(start, end)`` pairs.
        direction: 'horizontal' compares x coordinates; any other value
            compares y coordinates.

    Returns:
        bool: False if such an interval is found, True otherwise.
    """
    for interval in intervals:
        if direction == "horizontal":
            start_index, stop_index = 0, 2
            probe_box = [interval[0], 0, interval[1], 1]
        else:
            start_index, stop_index = 1, 3
            probe_box = [0, interval[0], 1, interval[1]]

        text_bboxes = [
            candidate.bbox
            for candidate in blocks
            if calculate_projection_overlap_ratio(
                probe_box, candidate.bbox, direction=direction
            )
            > 0
            and candidate.label in BLOCK_LABEL_MAP["text_labels"]
        ]
        if not text_bboxes:
            continue

        starts = [bbox[start_index] for bbox in text_bboxes]
        start_spread = max(starts) - min(starts)
        starts_aligned = not (start_spread >= abs(interval[0] - interval[1]) * 0.05)

        stops = [bbox[stop_index] for bbox in text_bboxes]
        stop_spread = max(stops) - min(stops)
        if stop_spread >= abs(interval[0] - interval[1]) * 0.05 and starts_aligned:
            return False
    return True
1088
+
1089
+
1090
def shrink_overlapping_boxes(
    boxes, direction="horizontal", min_threshold=0, max_threshold=0.1
) -> List:
    """
    Shrink consecutive boxes that slightly overlap or touch along a direction.

    Each block is compared with the one before it in ``boxes``. When the pair
    needs separating, both ``bbox`` attributes are rewritten so that a
    2-unit gap is left around the middle of their overlap.

    Args:
        boxes (list): Sequence of block objects exposing a mutable ``bbox``
            attribute ``[x_min, y_min, x_max, y_max]``, in processing order.
        direction (str): direction along which to perform the shrinking
            ('vertical' adjusts y coordinates; any other value adjusts x).
        min_threshold (float): Exclusive lower bound of the overlap ratio for
            shrinking. Default is 0.
        max_threshold (float): Exclusive upper bound of the overlap ratio for
            shrinking. Default is 0.1.

    Returns:
        list: The same ``boxes`` sequence, with ``bbox`` updated in place on
            the affected blocks. Empty input is returned unchanged.
    """
    # Nothing to compare; avoids indexing into an empty sequence.
    if len(boxes) == 0:
        return boxes

    cross_direction = "horizontal" if direction == "vertical" else "vertical"
    # positions of the min / max coordinate on the axis being shrunk
    lo = 1 if direction == "vertical" else 0
    hi = lo + 2

    current_block = boxes[0]
    for block in boxes[1:]:
        current_bbox = list(current_block.bbox)
        next_bbox = list(block.bbox)
        cut_iou = calculate_projection_overlap_ratio(
            current_block.bbox, block.bbox, direction=direction
        )
        match_iou = calculate_projection_overlap_ratio(
            current_block.bbox, block.bbox, direction=cross_direction
        )
        slightly_overlapping = (
            match_iou > 0 and cut_iou > min_threshold and cut_iou < max_threshold
        )
        touching = (
            current_bbox[hi] == next_bbox[lo]
            or abs(current_bbox[hi] - next_bbox[lo]) <= 3
        )
        if slightly_overlapping or touching:
            overlap_min = max(current_bbox[lo], next_bbox[lo])
            overlap_max = min(current_bbox[hi], next_bbox[hi])
            split = int((overlap_min + overlap_max) / 2)
            # leave one unit free on each side of the split line
            current_bbox[hi] = split - 1
            next_bbox[lo] = split + 1
            current_block.bbox = current_bbox
            block.bbox = next_bbox
        current_block = block
    return boxes