paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +29 -73
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/ts/funcs.py +19 -8
  44. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  45. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  46. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  48. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  49. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  51. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  52. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  53. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  54. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  57. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  58. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  59. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  60. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  61. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  63. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  64. paddlex/inference/models/formula_recognition/predictor.py +8 -2
  65. paddlex/inference/models/formula_recognition/processors.py +90 -77
  66. paddlex/inference/models/formula_recognition/result.py +28 -27
  67. paddlex/inference/models/image_feature/processors.py +3 -4
  68. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  69. paddlex/inference/models/object_detection/predictor.py +2 -0
  70. paddlex/inference/models/object_detection/processors.py +28 -3
  71. paddlex/inference/models/object_detection/utils.py +2 -0
  72. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  73. paddlex/inference/models/text_detection/predictor.py +8 -0
  74. paddlex/inference/models/text_detection/processors.py +44 -10
  75. paddlex/inference/models/text_detection/result.py +0 -10
  76. paddlex/inference/models/text_recognition/result.py +1 -1
  77. paddlex/inference/pipelines/__init__.py +9 -5
  78. paddlex/inference/pipelines/_parallel.py +172 -0
  79. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  80. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  81. paddlex/inference/pipelines/base.py +14 -4
  82. paddlex/inference/pipelines/components/faisser.py +1 -1
  83. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  84. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  85. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  86. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  87. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  88. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  89. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  90. paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
  91. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  92. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
  93. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  94. paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
  95. paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
  96. paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
  97. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  98. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
  99. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
  100. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  101. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  102. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  103. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  104. paddlex/inference/pipelines/ocr/result.py +21 -18
  105. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  106. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  107. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  108. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  109. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
  110. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  112. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  113. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  114. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  115. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  116. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  117. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  118. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  119. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  120. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  121. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  122. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  123. paddlex/inference/serving/basic_serving/_app.py +46 -13
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  127. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  128. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  129. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  130. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  131. paddlex/inference/serving/infra/utils.py +20 -22
  132. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  133. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  134. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  135. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  136. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  137. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  138. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  139. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  140. paddlex/inference/utils/hpi.py +30 -16
  141. paddlex/inference/utils/hpi_model_info_collection.json +666 -162
  142. paddlex/inference/utils/io/readers.py +12 -12
  143. paddlex/inference/utils/misc.py +20 -0
  144. paddlex/inference/utils/mkldnn_blocklist.py +59 -0
  145. paddlex/inference/utils/official_models.py +140 -5
  146. paddlex/inference/utils/pp_option.py +74 -9
  147. paddlex/model.py +2 -2
  148. paddlex/modules/__init__.py +1 -1
  149. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  150. paddlex/modules/base/__init__.py +1 -1
  151. paddlex/modules/base/evaluator.py +5 -5
  152. paddlex/modules/base/trainer.py +1 -1
  153. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  154. paddlex/modules/doc_vlm/evaluator.py +2 -2
  155. paddlex/modules/doc_vlm/exportor.py +2 -2
  156. paddlex/modules/doc_vlm/model_list.py +1 -1
  157. paddlex/modules/doc_vlm/trainer.py +2 -2
  158. paddlex/modules/face_recognition/evaluator.py +2 -2
  159. paddlex/modules/formula_recognition/evaluator.py +5 -2
  160. paddlex/modules/formula_recognition/model_list.py +3 -0
  161. paddlex/modules/formula_recognition/trainer.py +3 -0
  162. paddlex/modules/general_recognition/evaluator.py +1 -1
  163. paddlex/modules/image_classification/evaluator.py +2 -2
  164. paddlex/modules/image_classification/model_list.py +1 -0
  165. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  166. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  167. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  168. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  169. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  170. paddlex/modules/object_detection/evaluator.py +2 -2
  171. paddlex/modules/object_detection/model_list.py +2 -0
  172. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
  173. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  174. paddlex/modules/table_recognition/evaluator.py +2 -2
  175. paddlex/modules/text_detection/evaluator.py +2 -2
  176. paddlex/modules/text_detection/model_list.py +2 -0
  177. paddlex/modules/text_recognition/evaluator.py +2 -2
  178. paddlex/modules/text_recognition/model_list.py +2 -0
  179. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  180. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  181. paddlex/modules/ts_classification/evaluator.py +2 -2
  182. paddlex/modules/ts_forecast/evaluator.py +2 -2
  183. paddlex/modules/video_classification/evaluator.py +2 -2
  184. paddlex/modules/video_detection/evaluator.py +2 -2
  185. paddlex/ops/__init__.py +8 -5
  186. paddlex/paddlex_cli.py +19 -13
  187. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  188. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  189. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  190. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  191. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  192. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  193. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  194. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  195. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  196. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  197. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  198. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  200. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  201. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  202. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  203. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  204. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  205. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  206. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  207. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  208. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  209. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  210. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  211. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  212. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  213. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  214. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  215. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  216. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  217. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  218. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  219. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  220. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  221. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  222. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  223. paddlex/repo_apis/base/config.py +1 -1
  224. paddlex/repo_manager/core.py +3 -3
  225. paddlex/repo_manager/meta.py +6 -2
  226. paddlex/repo_manager/repo.py +17 -16
  227. paddlex/utils/custom_device_list.py +26 -2
  228. paddlex/utils/deps.py +3 -3
  229. paddlex/utils/device.py +5 -13
  230. paddlex/utils/env.py +4 -0
  231. paddlex/utils/flags.py +11 -4
  232. paddlex/utils/fonts/__init__.py +34 -4
  233. paddlex/utils/misc.py +1 -1
  234. paddlex/utils/subclass_register.py +2 -2
  235. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
  236. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
  237. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
  238. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
  239. {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
  240. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
@@ -14,21 +14,20 @@
14
14
 
15
15
  __all__ = [
16
16
  "get_sub_regions_ocr_res",
17
- "get_layout_ordering",
18
- "get_single_block_parsing_res",
19
17
  "get_show_color",
20
18
  "sorted_layout_boxes",
21
19
  ]
22
20
 
23
21
  import re
24
22
  from copy import deepcopy
25
- from typing import Any, Dict, List, Optional, Tuple, Union
23
+ from typing import Dict, List, Optional, Tuple, Union
26
24
 
27
25
  import numpy as np
28
26
  from PIL import Image
29
27
 
30
- from ...models.object_detection.result import DetResult
28
+ from ..components import convert_points_to_boxes
31
29
  from ..ocr.result import OCRResult
30
+ from .setting import BLOCK_LABEL_MAP, REGION_SETTINGS
32
31
 
33
32
 
34
33
  def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
@@ -172,808 +171,167 @@ def sorted_layout_boxes(res, w):
172
171
  return new_res
173
172
 
174
173
 
175
- def _calculate_overlap_area_div_minbox_area_ratio(
176
- bbox1: Union[list, tuple],
177
- bbox2: Union[list, tuple],
174
+ def calculate_projection_overlap_ratio(
175
+ bbox1: List[float],
176
+ bbox2: List[float],
177
+ direction: str = "horizontal",
178
+ mode="union",
178
179
  ) -> float:
179
180
  """
180
- Calculate the ratio of the overlap area between bbox1 and bbox2
181
- to the area of the smaller bounding box.
182
-
183
- Args:
184
- bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
185
- bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
186
-
187
- Returns:
188
- float: The ratio of the overlap area to the area of the smaller bounding box.
189
- """
190
- bbox1 = list(map(int, bbox1))
191
- bbox2 = list(map(int, bbox2))
192
-
193
- x_left = max(bbox1[0], bbox2[0])
194
- y_top = max(bbox1[1], bbox2[1])
195
- x_right = min(bbox1[2], bbox2[2])
196
- y_bottom = min(bbox1[3], bbox2[3])
197
-
198
- if x_right <= x_left or y_bottom <= y_top:
199
- return 0.0
200
-
201
- intersection_area = (x_right - x_left) * (y_bottom - y_top)
202
- area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
203
- area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
204
- min_box_area = min(area_bbox1, area_bbox2)
205
-
206
- if min_box_area <= 0:
207
- return 0.0
208
-
209
- return intersection_area / min_box_area
210
-
211
-
212
- def _whether_y_overlap_exceeds_threshold(
213
- bbox1: Union[list, tuple],
214
- bbox2: Union[list, tuple],
215
- overlap_ratio_threshold: float = 0.6,
216
- ) -> bool:
217
- """
218
- Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
219
-
220
- Args:
221
- bbox1 (list or tuple): The first bounding box defined as (left, top, right, bottom).
222
- bbox2 (list or tuple): The second bounding box defined as (left, top, right, bottom).
223
- overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
224
- Defaults to 0.6.
225
-
226
- Returns:
227
- bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
228
- exceeds the overlap_ratio_threshold, otherwise False.
229
- """
230
- _, y1_0, _, y1_1 = bbox1
231
- _, y2_0, _, y2_1 = bbox2
232
-
233
- overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
234
- min_height = min(y1_1 - y1_0, y2_1 - y2_0)
235
-
236
- return (overlap / min_height) > overlap_ratio_threshold
237
-
238
-
239
- def _adjust_span_text(span: List[str], prepend: bool = False, append: bool = False):
240
- """
241
- Adjust the text of a span by prepending or appending a newline.
242
-
243
- Args:
244
- span (list): A list where the second element is the text of the span.
245
- prepend (bool): If True, prepend a newline to the text.
246
- append (bool): If True, append a newline to the text.
247
-
248
- Returns:
249
- None: The function modifies the span in place.
250
- """
251
- if prepend:
252
- span[1] = "\n" + span[1]
253
- if append:
254
- span[1] = span[1] + "\n"
255
- return span
256
-
257
-
258
- def _format_line(
259
- line: List[List[Union[List[int], str]]],
260
- layout_min: int,
261
- layout_max: int,
262
- is_reference: bool = False,
263
- ) -> None:
264
- """
265
- Format a line of text spans based on layout constraints.
181
+ Calculate the IoU of lines between two bounding boxes.
266
182
 
267
183
  Args:
268
- line (list): A list of spans, where each span is a list containing a bounding box and text.
269
- layout_min (int): The minimum x-coordinate of the layout bounding box.
270
- layout_max (int): The maximum x-coordinate of the layout bounding box.
271
- is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.
184
+ bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
185
+ bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
186
+ direction (str): direction of the projection, "horizontal" or "vertical".
272
187
 
273
188
  Returns:
274
- None: The function modifies the line in place.
275
- """
276
- first_span = line[0]
277
- end_span = line[-1]
278
-
279
- if not is_reference:
280
- if first_span[0][0] - layout_min > 10:
281
- first_span = _adjust_span_text(first_span, prepend=True)
282
- if layout_max - end_span[0][2] > 10:
283
- end_span = _adjust_span_text(end_span, append=True)
284
- else:
285
- if first_span[0][0] - layout_min < 5:
286
- first_span = _adjust_span_text(first_span, prepend=True)
287
- if layout_max - end_span[0][2] > 20:
288
- end_span = _adjust_span_text(end_span, append=True)
289
-
290
- line[0] = first_span
291
- line[-1] = end_span
292
-
293
- return line
294
-
295
-
296
- def split_boxes_if_x_contained(boxes, offset=1e-5):
189
+ float: Line overlap ratio. Returns 0 if there is no overlap.
297
190
  """
298
- Check if there is any complete containment in the x-direction
299
- between the bounding boxes and split the containing box accordingly.
191
+ start_index, end_index = 1, 3
192
+ if direction == "horizontal":
193
+ start_index, end_index = 0, 2
300
194
 
301
- Args:
302
- boxes (list of lists): Each element is a list containing an ndarray of length 4, a description, and a label.
303
- offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
304
- Returns:
305
- A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
306
- """
195
+ intersection_start = max(bbox1[start_index], bbox2[start_index])
196
+ intersection_end = min(bbox1[end_index], bbox2[end_index])
197
+ overlap = intersection_end - intersection_start
198
+ if overlap <= 0:
199
+ return 0
307
200
 
308
- def is_x_contained(box_a, box_b):
309
- """Check if box_a completely contains box_b in the x-direction."""
310
- return box_a[0][0] <= box_b[0][0] and box_a[0][2] >= box_b[0][2]
311
-
312
- new_boxes = []
313
-
314
- for i in range(len(boxes)):
315
- box_a = boxes[i]
316
- is_split = False
317
- for j in range(len(boxes)):
318
- if i == j:
319
- continue
320
- box_b = boxes[j]
321
- if is_x_contained(box_a, box_b):
322
- is_split = True
323
- # Split box_a based on the x-coordinates of box_b
324
- if box_a[0][0] < box_b[0][0]:
325
- w = box_b[0][0] - offset - box_a[0][0]
326
- if w > 1:
327
- new_boxes.append(
328
- [
329
- np.array(
330
- [
331
- box_a[0][0],
332
- box_a[0][1],
333
- box_b[0][0] - offset,
334
- box_a[0][3],
335
- ]
336
- ),
337
- box_a[1],
338
- box_a[2],
339
- ]
340
- )
341
- if box_a[0][2] > box_b[0][2]:
342
- w = box_a[0][2] - box_b[0][2] + offset
343
- if w > 1:
344
- box_a = [
345
- np.array(
346
- [
347
- box_b[0][2] + offset,
348
- box_a[0][1],
349
- box_a[0][2],
350
- box_a[0][3],
351
- ]
352
- ),
353
- box_a[1],
354
- box_a[2],
355
- ]
356
- if j == len(boxes) - 1 and is_split:
357
- new_boxes.append(box_a)
358
- if not is_split:
359
- new_boxes.append(box_a)
360
-
361
- return new_boxes
362
-
363
-
364
- def _sort_line_by_x_projection(
365
- input_img: np.ndarray,
366
- general_ocr_pipeline: Any,
367
- line: List[List[Union[List[int], str]]],
368
- ) -> None:
369
- """
370
- Sort a line of text spans based on their vertical position within the layout bounding box.
371
-
372
- Args:
373
- input_img (ndarray): The input image used for OCR.
374
- general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
375
- line (list): A list of spans, where each span is a list containing a bounding box and text.
376
-
377
- Returns:
378
- list: The sorted line of text spans.
379
- """
380
- splited_boxes = split_boxes_if_x_contained(line)
381
- splited_lines = []
382
- if len(line) != len(splited_boxes):
383
- splited_boxes.sort(key=lambda span: span[0][0])
384
- text_rec_model = general_ocr_pipeline.text_rec_model
385
- for span in splited_boxes:
386
- if span[2] == "text":
387
- crop_img = input_img[
388
- int(span[0][1]) : int(span[0][3]),
389
- int(span[0][0]) : int(span[0][2]),
390
- ]
391
- span[1] = next(text_rec_model([crop_img]))["rec_text"]
392
- splited_lines.append(span)
201
+ if mode == "union":
202
+ ref_width = max(bbox1[end_index], bbox2[end_index]) - min(
203
+ bbox1[start_index], bbox2[start_index]
204
+ )
205
+ elif mode == "small":
206
+ ref_width = min(
207
+ bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
208
+ )
209
+ elif mode == "large":
210
+ ref_width = max(
211
+ bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
212
+ )
393
213
  else:
394
- splited_lines = line
214
+ raise ValueError(
215
+ f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
216
+ )
395
217
 
396
- return splited_lines
218
+ return overlap / ref_width if ref_width > 0 else 0.0
397
219
 
398
220
 
399
- def _sort_ocr_res_by_y_projection(
400
- input_img: np.ndarray,
401
- general_ocr_pipeline: Any,
402
- label: Any,
403
- block_bbox: Tuple[int, int, int, int],
404
- ocr_res: Dict[str, List[Any]],
405
- line_height_iou_threshold: float = 0.7,
406
- ) -> Dict[str, List[Any]]:
221
+ def calculate_overlap_ratio(
222
+ bbox1: Union[list, tuple], bbox2: Union[list, tuple], mode="union"
223
+ ) -> float:
407
224
  """
408
- Sorts OCR results based on their spatial arrangement, grouping them into lines and blocks.
225
+ Calculate the overlap ratio between two bounding boxes.
409
226
 
410
227
  Args:
411
- input_img (ndarray): The input image used for OCR.
412
- general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
413
- label (Any): The label associated with the OCR results. It's not used in the function but might be
414
- relevant for other parts of the calling context.
415
- block_bbox (Tuple[int, int, int, int]): A tuple representing the layout bounding box, defined as
416
- (left, top, right, bottom).
417
- ocr_res (Dict[str, List[Any]]): A dictionary containing OCR results with the following keys:
418
- - "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
419
- - "rec_texts": A corresponding list of recognized text strings for each box.
420
- line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
421
- the same line based on their vertical overlap. Defaults to 0.7.
228
+ bbox1 (list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
229
+ bbox2 (list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
230
+ mode (str): The mode of calculation, either 'union', 'small', or 'large'.
422
231
 
423
232
  Returns:
424
- Dict[str, List[Any]]: A dictionary with the same structure as `ocr_res`, but with boxes and texts sorted
425
- and grouped into lines and blocks.
233
+ float: The overlap ratio value between the two bounding boxes
426
234
  """
427
- assert (
428
- ocr_res["boxes"] and ocr_res["rec_texts"]
429
- ), "OCR results must contain 'boxes' and 'rec_texts'"
430
-
431
- boxes = ocr_res["boxes"]
432
- rec_texts = ocr_res["rec_texts"]
433
- rec_labels = ocr_res["rec_labels"]
235
+ x_min_inter = max(bbox1[0], bbox2[0])
236
+ y_min_inter = max(bbox1[1], bbox2[1])
237
+ x_max_inter = min(bbox1[2], bbox2[2])
238
+ y_max_inter = min(bbox1[3], bbox2[3])
434
239
 
435
- x_min, _, x_max, _ = block_bbox
436
- inline_x_min = min([box[0] for box in boxes])
437
- inline_x_max = max([box[2] for box in boxes])
240
+ inter_width = max(0, x_max_inter - x_min_inter)
241
+ inter_height = max(0, y_max_inter - y_min_inter)
438
242
 
439
- spans = list(zip(boxes, rec_texts, rec_labels))
243
+ inter_area = float(inter_width) * float(inter_height)
440
244
 
441
- spans.sort(key=lambda span: span[0][1])
442
- spans = [list(span) for span in spans]
245
+ bbox1_area = caculate_bbox_area(bbox1)
246
+ bbox2_area = caculate_bbox_area(bbox2)
443
247
 
444
- lines = []
445
- current_line = [spans[0]]
446
- current_y0, current_y1 = spans[0][0][1], spans[0][0][3]
447
-
448
- for span in spans[1:]:
449
- y0, y1 = span[0][1], span[0][3]
450
- if _whether_y_overlap_exceeds_threshold(
451
- (0, current_y0, 0, current_y1),
452
- (0, y0, 0, y1),
453
- line_height_iou_threshold,
454
- ):
455
- current_line.append(span)
456
- current_y0 = min(current_y0, y0)
457
- current_y1 = max(current_y1, y1)
458
- else:
459
- lines.append(current_line)
460
- current_line = [span]
461
- current_y0, current_y1 = y0, y1
462
-
463
- if current_line:
464
- lines.append(current_line)
465
-
466
- new_lines = []
467
- for line in lines:
468
- line.sort(key=lambda span: span[0][0])
469
-
470
- ocr_labels = [span[2] for span in line]
471
- if "formula" in ocr_labels:
472
- line = _sort_line_by_x_projection(input_img, general_ocr_pipeline, line)
473
- if label == "reference":
474
- line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
475
- elif label != "content":
476
- line = _format_line(line, x_min, x_max)
477
- new_lines.append(line)
478
-
479
- ocr_res["boxes"] = [span[0] for line in new_lines for span in line]
480
- if label == "content":
481
- ocr_res["rec_texts"] = [
482
- "".join(f"{span[1]} " for span in line).rstrip() for line in new_lines
483
- ]
248
+ if mode == "union":
249
+ ref_area = bbox1_area + bbox2_area - inter_area
250
+ elif mode == "small":
251
+ ref_area = min(bbox1_area, bbox2_area)
252
+ elif mode == "large":
253
+ ref_area = max(bbox1_area, bbox2_area)
484
254
  else:
485
- ocr_res["rec_texts"] = [span[1] + " " for line in new_lines for span in line]
486
- return ocr_res, len(new_lines)
487
-
488
-
489
- def _process_text(input_text: str) -> str:
490
- """
491
- Process the input text to handle spaces.
492
-
493
- The function removes multiple consecutive spaces between Chinese characters and ensures that
494
- only a single space is retained between Chinese and non-Chinese characters.
495
-
496
- Args:
497
- input_text (str): The text to be processed.
498
-
499
- Returns:
500
- str: The processed text with properly formatted spaces.
501
- """
502
-
503
- def handle_spaces_(text: str) -> str:
504
- """
505
- Handle spaces in the text by removing multiple consecutive spaces and inserting a single space
506
- between Chinese and non-Chinese characters.
507
-
508
- Args:
509
- text (str): The text to handle spaces for.
510
-
511
- Returns:
512
- str: The text with properly formatted spaces.
513
- """
514
- spaces = re.finditer(r"\s+", text)
515
- processed_text = list(text)
516
-
517
- for space in reversed(list(spaces)):
518
- start, end = space.span()
519
- prev_char = processed_text[start - 1] if start > 0 else ""
520
- next_char = processed_text[end] if end < len(processed_text) else ""
521
-
522
- is_prev_chinese = (
523
- re.match(r"[\u4e00-\u9fff]", prev_char) if prev_char else False
524
- )
525
- is_next_chinese = (
526
- re.match(r"[\u4e00-\u9fff]", next_char) if next_char else False
527
- )
528
-
529
- if is_prev_chinese and is_next_chinese:
530
- processed_text[start:end] = []
531
- else:
532
- processed_text[start:end] = [" "]
533
-
534
- return "".join(processed_text)
535
-
536
- text_without_spaces = handle_spaces_(input_text)
537
-
538
- final_text = re.sub(r"\s+", " ", text_without_spaces).strip()
539
- return final_text
540
-
541
-
542
- def get_single_block_parsing_res(
543
- general_ocr_pipeline: Any,
544
- overall_ocr_res: OCRResult,
545
- layout_det_res: DetResult,
546
- table_res_list: list,
547
- seal_res_list: list,
548
- ) -> OCRResult:
549
- """
550
- Extract structured information from OCR and layout detection results.
551
-
552
- Args:
553
- overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
554
- - "input_img": The image on which OCR was performed.
555
- - "dt_boxes": A list of detected text box coordinates.
556
- - "rec_texts": A list of recognized text corresponding to the detected boxes.
557
-
558
- layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
559
- - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
560
-
561
- table_res_list (list): A list of table detection results, where each item is a dictionary containing:
562
- - "block_bbox": The bounding box of the table layout.
563
- - "pred_html": The predicted HTML representation of the table.
564
-
565
- seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
566
-
567
- Returns:
568
- list: A list of structured boxes where each item is a dictionary containing:
569
- - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
570
- - The label as a key with either table HTML or image data and text.
571
- - "block_bbox": The coordinates of the layout box.
572
- """
573
-
574
- single_block_layout_parsing_res = []
575
- input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]
576
- seal_index = 0
577
- with_doc_title = False
578
- max_block_area = 0.0
579
- paragraph_title_indexs = []
580
-
581
- layout_det_res_list, _ = _remove_overlap_blocks(
582
- deepcopy(layout_det_res["boxes"]),
583
- threshold=0.5,
584
- smaller=True,
585
- )
586
-
587
- for box_idx, box_info in enumerate(layout_det_res_list):
588
- block_bbox = box_info["coordinate"]
589
- label = box_info["label"]
590
- rec_res = {"boxes": [], "rec_texts": [], "rec_labels": [], "flag": False}
591
- seg_start_coordinate = float("inf")
592
- seg_end_coordinate = float("-inf")
593
- num_of_lines = 1
594
-
595
- if label == "doc_title":
596
- with_doc_title = True
597
- elif label == "paragraph_title":
598
- paragraph_title_indexs.append(box_idx)
599
-
600
- block_area = (block_bbox[2] - block_bbox[0]) * (block_bbox[3] - block_bbox[1])
601
- max_block_area = max(max_block_area, block_area)
602
-
603
- if label == "table":
604
- for table_res in table_res_list:
605
- if len(table_res["cell_box_list"]) == 0:
606
- continue
607
- if (
608
- _calculate_overlap_area_div_minbox_area_ratio(
609
- block_bbox, table_res["cell_box_list"][0]
610
- )
611
- > 0.5
612
- ):
613
- single_block_layout_parsing_res.append(
614
- {
615
- "block_label": label,
616
- "block_content": table_res["pred_html"],
617
- "block_bbox": block_bbox,
618
- },
619
- )
620
- break
621
- elif label == "seal":
622
- if len(seal_res_list) > 0:
623
- single_block_layout_parsing_res.append(
624
- {
625
- "block_label": label,
626
- "block_content": _process_text(
627
- ", ".join(seal_res_list[seal_index]["rec_texts"])
628
- ),
629
- "block_bbox": block_bbox,
630
- },
631
- )
632
- seal_index += 1
633
- else:
634
- overall_text_boxes = overall_ocr_res["rec_boxes"]
635
- for box_no in range(len(overall_text_boxes)):
636
- if (
637
- _calculate_overlap_area_div_minbox_area_ratio(
638
- block_bbox, overall_text_boxes[box_no]
639
- )
640
- > 0.5
641
- ):
642
- rec_res["boxes"].append(overall_text_boxes[box_no])
643
- rec_res["rec_texts"].append(
644
- overall_ocr_res["rec_texts"][box_no],
645
- )
646
- rec_res["rec_labels"].append(
647
- overall_ocr_res["rec_labels"][box_no],
648
- )
649
- rec_res["flag"] = True
650
-
651
- if rec_res["flag"]:
652
- rec_res, num_of_lines = _sort_ocr_res_by_y_projection(
653
- input_img, general_ocr_pipeline, label, block_bbox, rec_res, 0.7
654
- )
655
- seg_start_coordinate = rec_res["boxes"][0][0]
656
- seg_end_coordinate = rec_res["boxes"][-1][2]
657
- if label == "formula":
658
- rec_res["rec_texts"] = [
659
- rec_res_text.replace("$", "")
660
- for rec_res_text in rec_res["rec_texts"]
661
- ]
662
-
663
- if label in ["chart", "image"]:
664
- x_min, y_min, x_max, y_max = list(map(int, block_bbox))
665
- img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
666
- img = Image.fromarray(input_img[y_min:y_max, x_min:x_max, ::-1])
667
- single_block_layout_parsing_res.append(
668
- {
669
- "block_label": label,
670
- "block_content": _process_text("".join(rec_res["rec_texts"])),
671
- "block_image": {img_path: img},
672
- "block_bbox": block_bbox,
673
- },
674
- )
675
- else:
676
- if label in ["doc_title"]:
677
- content = " ".join(rec_res["rec_texts"])
678
- elif label in ["content"]:
679
- content = "\n".join(rec_res["rec_texts"])
680
- else:
681
- content = "".join(rec_res["rec_texts"])
682
- if label != "reference":
683
- content = _process_text(content)
684
- single_block_layout_parsing_res.append(
685
- {
686
- "block_label": label,
687
- "block_content": content,
688
- "block_bbox": block_bbox,
689
- "seg_start_coordinate": seg_start_coordinate,
690
- "seg_end_coordinate": seg_end_coordinate,
691
- "num_of_lines": num_of_lines,
692
- "block_area": block_area,
693
- },
694
- )
695
-
696
- if (
697
- not with_doc_title
698
- and len(paragraph_title_indexs) == 1
699
- and single_block_layout_parsing_res[paragraph_title_indexs[0]].get(
700
- "block_area", 0
255
+ raise ValueError(
256
+ f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
701
257
  )
702
- > max_block_area * 0.3
703
- ):
704
- single_block_layout_parsing_res[paragraph_title_indexs[0]][
705
- "block_label"
706
- ] = "doc_title"
707
-
708
- if len(layout_det_res_list) == 0:
709
- for ocr_rec_box, ocr_rec_text in zip(
710
- overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"]
711
- ):
712
- single_block_layout_parsing_res.append(
713
- {
714
- "block_label": "text",
715
- "block_content": ocr_rec_text,
716
- "block_bbox": ocr_rec_box,
717
- "seg_start_coordinate": ocr_rec_box[0],
718
- "seg_end_coordinate": ocr_rec_box[2],
719
- },
720
- )
721
258
 
722
- single_block_layout_parsing_res = get_layout_ordering(
723
- single_block_layout_parsing_res,
724
- no_mask_labels=[
725
- "text",
726
- "formula",
727
- "algorithm",
728
- "reference",
729
- "content",
730
- "abstract",
731
- ],
732
- )
259
+ if ref_area == 0:
260
+ return 0.0
733
261
 
734
- return single_block_layout_parsing_res
262
+ return inter_area / ref_area
735
263
 
736
264
 
737
- def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
265
+ def calculate_minimum_enclosing_bbox(bboxes):
738
266
  """
739
- Generate a 1D projection histogram from bounding boxes along a specified axis.
267
+ Calculate the minimum enclosing bounding box for a list of bounding boxes.
740
268
 
741
269
  Args:
742
- boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
743
- axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
270
+ bboxes (list): A list of bounding boxes represented as lists of four integers [x1, y1, x2, y2].
744
271
 
745
272
  Returns:
746
- A 1D numpy array representing the projection histogram based on bounding box intervals.
273
+ list: The minimum enclosing bounding box represented as a list of four integers [x1, y1, x2, y2].
747
274
  """
748
- assert axis in [0, 1]
749
- max_length = np.max(boxes[:, axis::2])
750
- projection = np.zeros(max_length, dtype=int)
275
+ if not bboxes:
276
+ raise ValueError("The list of bounding boxes is empty.")
751
277
 
752
- # Increment projection histogram over the interval defined by each bounding box
753
- for start, end in boxes[:, axis::2]:
754
- projection[start:end] += 1
278
+ # Convert the list of bounding boxes to a NumPy array
279
+ bboxes_array = np.array(bboxes)
755
280
 
756
- return projection
281
+ # Compute the minimum and maximum values along the respective axes
282
+ min_x = np.min(bboxes_array[:, 0])
283
+ min_y = np.min(bboxes_array[:, 1])
284
+ max_x = np.max(bboxes_array[:, 2])
285
+ max_y = np.max(bboxes_array[:, 3])
757
286
 
287
+ # Return the minimum enclosing bounding box
288
+ return np.array([min_x, min_y, max_x, max_y])
758
289
 
759
- def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
760
- """
761
- Split the projection profile into segments based on specified thresholds.
762
290
 
763
- Args:
764
- arr_values: 1D array representing the projection profile.
765
- min_value: Minimum value threshold to consider a profile segment significant.
766
- min_gap: Minimum gap width to consider a separation between segments.
291
+ def is_english_letter(char):
292
+ """check if the char is english letter"""
293
+ return bool(re.match(r"^[A-Za-z]$", char))
767
294
 
768
- Returns:
769
- A tuple of start and end indices for each segment that meets the criteria.
770
- """
771
- # Identify indices where the projection exceeds the minimum value
772
- significant_indices = np.where(arr_values > min_value)[0]
773
- if not len(significant_indices):
774
- return
775
-
776
- # Calculate gaps between significant indices
777
- index_diffs = significant_indices[1:] - significant_indices[:-1]
778
- gap_indices = np.where(index_diffs > min_gap)[0]
779
-
780
- # Determine start and end indices of segments
781
- segment_starts = np.insert(
782
- significant_indices[gap_indices + 1],
783
- 0,
784
- significant_indices[0],
785
- )
786
- segment_ends = np.append(
787
- significant_indices[gap_indices],
788
- significant_indices[-1] + 1,
789
- )
790
295
 
791
- return segment_starts, segment_ends
296
+ def is_numeric(char):
297
+ """check if the char is numeric"""
298
+ return bool(re.match(r"^[\d]+$", char))
792
299
 
793
300
 
794
- def _recursive_yx_cut(
795
- boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
796
- ):
301
+ def is_non_breaking_punctuation(char):
797
302
  """
798
- Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
303
+ check if the char is non-breaking punctuation
799
304
 
800
305
  Args:
801
- boxes: A (N, 4) array representing bounding boxes.
802
- indices: List of indices indicating the original position of boxes.
803
- res: List to store indices of the final segmented bounding boxes.
804
- min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
306
+ char (str): character to check
805
307
 
806
308
  Returns:
807
- None: This function modifies the `res` list in place.
808
- """
809
- assert len(boxes) == len(
810
- indices
811
- ), "The length of boxes and indices must be the same."
812
-
813
- # Sort by y_min for Y-axis projection
814
- y_sorted_indices = boxes[:, 1].argsort()
815
- y_sorted_boxes = boxes[y_sorted_indices]
816
- y_sorted_indices = np.array(indices)[y_sorted_indices]
817
-
818
- # Perform Y-axis projection
819
- y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
820
- y_intervals = _split_projection_profile(y_projection, 0, 1)
821
-
822
- if not y_intervals:
823
- return
824
-
825
- # Process each segment defined by Y-axis projection
826
- for y_start, y_end in zip(*y_intervals):
827
- # Select boxes within the current y interval
828
- y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
829
- y_sorted_boxes[:, 1] < y_end
830
- )
831
- y_boxes_chunk = y_sorted_boxes[y_interval_indices]
832
- y_indices_chunk = y_sorted_indices[y_interval_indices]
833
-
834
- # Sort by x_min for X-axis projection
835
- x_sorted_indices = y_boxes_chunk[:, 0].argsort()
836
- x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
837
- x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
838
-
839
- # Perform X-axis projection
840
- x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
841
- x_intervals = _split_projection_profile(x_projection, 0, min_gap)
842
-
843
- if not x_intervals:
844
- continue
845
-
846
- # If X-axis cannot be further segmented, add current indices to results
847
- if len(x_intervals[0]) == 1:
848
- res.extend(x_sorted_indices_chunk)
849
- continue
850
-
851
- # Recursively process each segment defined by X-axis projection
852
- for x_start, x_end in zip(*x_intervals):
853
- x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
854
- x_sorted_boxes_chunk[:, 0] < x_end
855
- )
856
- _recursive_yx_cut(
857
- x_sorted_boxes_chunk[x_interval_indices],
858
- x_sorted_indices_chunk[x_interval_indices],
859
- res,
860
- )
861
-
862
-
863
- def _recursive_xy_cut(
864
- boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
865
- ):
866
- """
867
- Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
868
-
869
- Args:
870
- boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
871
- indices: A list of indices representing the position of boxes in the original data.
872
- res: A list to store indices of bounding boxes that meet the criteria.
873
- min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
874
-
875
- Returns:
876
- None: This function modifies the `res` list in place.
877
- """
878
- # Ensure boxes and indices have the same length
879
- assert len(boxes) == len(
880
- indices
881
- ), "The length of boxes and indices must be the same."
882
-
883
- # Sort by x_min to prepare for X-axis projection
884
- x_sorted_indices = boxes[:, 0].argsort()
885
- x_sorted_boxes = boxes[x_sorted_indices]
886
- x_sorted_indices = np.array(indices)[x_sorted_indices]
887
-
888
- # Perform X-axis projection
889
- x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
890
- x_intervals = _split_projection_profile(x_projection, 0, 1)
891
-
892
- if not x_intervals:
893
- return
894
-
895
- # Process each segment defined by X-axis projection
896
- for x_start, x_end in zip(*x_intervals):
897
- # Select boxes within the current x interval
898
- x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
899
- x_sorted_boxes[:, 0] < x_end
900
- )
901
- x_boxes_chunk = x_sorted_boxes[x_interval_indices]
902
- x_indices_chunk = x_sorted_indices[x_interval_indices]
903
-
904
- # Sort selected boxes by y_min to prepare for Y-axis projection
905
- y_sorted_indices = x_boxes_chunk[:, 1].argsort()
906
- y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
907
- y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
908
-
909
- # Perform Y-axis projection
910
- y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
911
- y_intervals = _split_projection_profile(y_projection, 0, min_gap)
912
-
913
- if not y_intervals:
914
- continue
915
-
916
- # If Y-axis cannot be further segmented, add current indices to results
917
- if len(y_intervals[0]) == 1:
918
- res.extend(y_sorted_indices_chunk)
919
- continue
920
-
921
- # Recursively process each segment defined by Y-axis projection
922
- for y_start, y_end in zip(*y_intervals):
923
- y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
924
- y_sorted_boxes_chunk[:, 1] < y_end
925
- )
926
- _recursive_xy_cut(
927
- y_sorted_boxes_chunk[y_interval_indices],
928
- y_sorted_indices_chunk[y_interval_indices],
929
- res,
930
- )
931
-
932
-
933
- def sort_by_xycut(
934
- block_bboxes: Union[np.ndarray, List[List[int]]],
935
- direction: int = 0,
936
- min_gap: int = 1,
937
- ) -> List[int]:
938
- """
939
- Sort bounding boxes using recursive XY cut method based on the specified direction.
940
-
941
- Args:
942
- block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
943
- where each box is represented as
944
- [x_min, y_min, x_max, y_max].
945
- direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
946
- Defaults to 0.
947
- min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
309
+ bool: True if the char is non-breaking punctuation
310
+ """
311
+ non_breaking_punctuations = {
312
+ ",",
313
+ "，",
314
+ "、",
315
+ ";",
316
+ "；",
317
+ ":",
318
+ "：",
319
+ "-",
320
+ "'",
321
+ '"',
322
+ "“",
323
+ }
948
324
 
949
- Returns:
950
- List[int]: A list of indices representing the order of sorted bounding boxes.
951
- """
952
- block_bboxes = np.asarray(block_bboxes).astype(int)
953
- res = []
954
- if direction == 1:
955
- _recursive_yx_cut(
956
- block_bboxes,
957
- np.arange(len(block_bboxes)).tolist(),
958
- res,
959
- min_gap,
960
- )
961
- else:
962
- _recursive_xy_cut(
963
- block_bboxes,
964
- np.arange(len(block_bboxes)).tolist(),
965
- res,
966
- min_gap,
967
- )
968
- return res
325
+ return char in non_breaking_punctuations
969
326
 
970
327
 
971
328
  def gather_imgs(original_img, layout_det_objs):
972
329
  imgs_in_doc = []
973
330
  for det_obj in layout_det_objs:
974
- if det_obj["label"] in ("image", "chart"):
331
+ if det_obj["label"] in BLOCK_LABEL_MAP["image_labels"]:
332
+ label = det_obj["label"]
975
333
  x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"]))
976
- img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
334
+ img_path = f"imgs/img_in_{label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
977
335
  img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1])
978
336
  imgs_in_doc.append(
979
337
  {
@@ -1007,10 +365,10 @@ def _get_minbox_if_overlap_by_ratio(
1007
365
  The selected bounding box or None if the overlap ratio is not exceeded.
1008
366
  """
1009
367
  # Calculate the areas of both bounding boxes
1010
- area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
1011
- area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
368
+ area1 = caculate_bbox_area(bbox1)
369
+ area2 = caculate_bbox_area(bbox2)
1012
370
  # Calculate the overlap ratio using a helper function
1013
- overlap_ratio = _calculate_overlap_area_div_minbox_area_ratio(bbox1, bbox2)
371
+ overlap_ratio = calculate_overlap_ratio(bbox1, bbox2, mode="small")
1014
372
  # Check if the overlap ratio exceeds the threshold
1015
373
  if overlap_ratio > ratio:
1016
374
  if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
@@ -1020,7 +378,7 @@ def _get_minbox_if_overlap_by_ratio(
1020
378
  return None
1021
379
 
1022
380
 
1023
- def _remove_overlap_blocks(
381
+ def remove_overlap_blocks(
1024
382
  blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
1025
383
  ) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
1026
384
  """
@@ -1035,13 +393,13 @@ def _remove_overlap_blocks(
1035
393
  Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
1036
394
  A tuple containing the updated list of blocks and a list of dropped blocks.
1037
395
  """
1038
- dropped_blocks = []
1039
396
  dropped_indexes = set()
1040
-
397
+ blocks = deepcopy(blocks)
398
+ overlap_image_blocks = []
1041
399
  # Iterate over each pair of blocks to find overlaps
1042
- for i, block1 in enumerate(blocks):
1043
- for j in range(i + 1, len(blocks)):
1044
- block2 = blocks[j]
400
+ for i, block1 in enumerate(blocks["boxes"]):
401
+ for j in range(i + 1, len(blocks["boxes"])):
402
+ block2 = blocks["boxes"][j]
1045
403
  # Skip blocks that are already marked for removal
1046
404
  if i in dropped_indexes or j in dropped_indexes:
1047
405
  continue
@@ -1053,1332 +411,337 @@ def _remove_overlap_blocks(
1053
411
  smaller=smaller,
1054
412
  )
1055
413
  if overlap_box_index is not None:
1056
- # Determine which block to remove based on overlap_box_index
1057
- if overlap_box_index == 1:
1058
- drop_index = i
414
+ is_block1_image = block1["label"] == "image"
415
+ is_block2_image = block2["label"] == "image"
416
+
417
+ if is_block1_image != is_block2_image:
418
+ # 如果只有一个块在视觉标签中,删除在视觉标签中的那个块
419
+ drop_index = i if is_block1_image else j
420
+ overlap_image_blocks.append(blocks["boxes"][drop_index])
1059
421
  else:
1060
- drop_index = j
422
+ # 如果两个块都在或都不在视觉标签中,根据 overlap_box_index 决定删除哪个块
423
+ drop_index = i if overlap_box_index == 1 else j
424
+
1061
425
  dropped_indexes.add(drop_index)
1062
426
 
1063
427
  # Remove marked blocks from the original list
1064
428
  for index in sorted(dropped_indexes, reverse=True):
1065
- dropped_blocks.append(blocks[index])
1066
- del blocks[index]
1067
-
1068
- return blocks, dropped_blocks
1069
-
1070
-
1071
- def _get_text_median_width(blocks: List[Dict[str, any]]) -> float:
1072
- """
1073
- Calculate the median width of blocks labeled as "text".
1074
-
1075
- Args:
1076
- blocks (List[Dict[str, any]]): List of block dictionaries, each containing a 'block_bbox' and 'label'.
1077
-
1078
- Returns:
1079
- float: The median width of text blocks, or infinity if no text blocks are found.
1080
- """
1081
- widths = [
1082
- block["block_bbox"][2] - block["block_bbox"][0]
1083
- for block in blocks
1084
- if block.get("block_label") == "text"
1085
- ]
1086
- return np.median(widths) if widths else float("inf")
1087
-
1088
-
1089
- def _get_layout_property(
1090
- blocks: List[Dict[str, any]],
1091
- median_width: float,
1092
- no_mask_labels: List[str],
1093
- threshold: float = 0.8,
1094
- ) -> Tuple[List[Dict[str, any]], bool]:
1095
- """
1096
- Determine the layout (single or double column) of text blocks.
1097
-
1098
- Args:
1099
- blocks (List[Dict[str, any]]): List of block dictionaries containing 'label' and 'block_bbox'.
1100
- median_width (float): Median width of text blocks.
1101
- no_mask_labels (List[str]): Labels of blocks to be considered for layout analysis.
1102
- threshold (float): Threshold for determining layout overlap.
429
+ del blocks["boxes"][index]
1103
430
 
1104
- Returns:
1105
- Tuple[List[Dict[str, any]], bool]: Updated list of blocks with layout information and a boolean
1106
- indicating if the double layout area is greater than the single layout area.
1107
- """
1108
- blocks.sort(
1109
- key=lambda x: (
1110
- x["block_bbox"][0],
1111
- (x["block_bbox"][2] - x["block_bbox"][0]),
1112
- ),
1113
- )
1114
- check_single_layout = {}
1115
- page_min_x, page_max_x = float("inf"), 0
1116
- double_label_area = 0
1117
- single_label_area = 0
1118
-
1119
- for i, block in enumerate(blocks):
1120
- page_min_x = min(page_min_x, block["block_bbox"][0])
1121
- page_max_x = max(page_max_x, block["block_bbox"][2])
1122
- page_width = page_max_x - page_min_x
1123
-
1124
- for i, block in enumerate(blocks):
1125
- if block["block_label"] not in no_mask_labels:
1126
- continue
1127
-
1128
- x_min_i, _, x_max_i, _ = block["block_bbox"]
1129
- layout_length = x_max_i - x_min_i
1130
- cover_count, cover_with_threshold_count = 0, 0
1131
- match_block_with_threshold_indexes = []
1132
-
1133
- for j, other_block in enumerate(blocks):
1134
- if i == j or other_block["block_label"] not in no_mask_labels:
1135
- continue
1136
-
1137
- x_min_j, _, x_max_j, _ = other_block["block_bbox"]
1138
- x_match_min, x_match_max = max(
1139
- x_min_i,
1140
- x_min_j,
1141
- ), min(x_max_i, x_max_j)
1142
- match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)
1143
-
1144
- if match_block_iou > 0:
1145
- cover_count += 1
1146
- if match_block_iou > threshold:
1147
- cover_with_threshold_count += 1
1148
- match_block_with_threshold_indexes.append(
1149
- (j, match_block_iou),
1150
- )
1151
- x_min_i = x_match_max
1152
- if x_min_i >= x_max_i:
1153
- break
1154
-
1155
- if (
1156
- layout_length > median_width * 1.3
1157
- and (cover_with_threshold_count >= 2 or cover_count >= 2)
1158
- ) or layout_length > 0.6 * page_width:
1159
- # if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
1160
- block["layout"] = "double"
1161
- double_label_area += (block["block_bbox"][2] - block["block_bbox"][0]) * (
1162
- block["block_bbox"][3] - block["block_bbox"][1]
1163
- )
1164
- else:
1165
- block["layout"] = "single"
1166
- check_single_layout[i] = match_block_with_threshold_indexes
1167
-
1168
- # Check single-layout block
1169
- for i, single_layout in check_single_layout.items():
1170
- if single_layout:
1171
- index, match_iou = single_layout[-1]
1172
- if match_iou > 0.9 and blocks[index]["layout"] == "double":
1173
- blocks[i]["layout"] = "double"
1174
- double_label_area += (
1175
- blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
1176
- ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
1177
- else:
1178
- single_label_area += (
1179
- blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
1180
- ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
431
+ return blocks
1181
432
 
1182
- return blocks, (double_label_area > single_label_area)
1183
433
 
1184
-
1185
- def _get_bbox_direction(input_bbox: List[float], ratio: float = 1.0) -> bool:
434
+ def get_bbox_intersection(bbox1, bbox2, return_format="bbox"):
1186
435
  """
1187
- Determine if a bounding box is horizontal or vertical.
436
+ Compute the intersection of two bounding boxes, supporting both 4-coordinate and 8-coordinate formats.
1188
437
 
1189
438
  Args:
1190
- input_bbox (List[float]): Bounding box [x_min, y_min, x_max, y_max].
1191
- ratio (float): Ratio for determining orientation. Default is 1.0.
439
+ bbox1 (tuple): The first bounding box, either in 4-coordinate format (x_min, y_min, x_max, y_max)
440
+ or 8-coordinate format (x1, y1, x2, y2, x3, y3, x4, y4).
441
+ bbox2 (tuple): The second bounding box in the same format as bbox1.
442
+ return_format (str): The format of the output intersection, either 'bbox' or 'poly'.
1192
443
 
1193
444
  Returns:
1194
- bool: True if the bounding box is considered horizontal, False if vertical.
1195
- """
1196
- width = input_bbox[2] - input_bbox[0]
1197
- height = input_bbox[3] - input_bbox[1]
1198
- return width * ratio >= height
1199
-
1200
-
1201
- def _get_projection_iou(
1202
- input_bbox: List[float], match_bbox: List[float], is_horizontal: bool = True
1203
- ) -> float:
1204
- """
1205
- Calculate the IoU of lines between two bounding boxes.
1206
-
1207
- Args:
1208
- input_bbox (List[float]): First bounding box [x_min, y_min, x_max, y_max].
1209
- match_bbox (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
1210
- is_horizontal (bool): Whether to compare horizontally or vertically.
1211
-
1212
- Returns:
1213
- float: Line IoU. Returns 0 if there is no overlap.
1214
- """
1215
- if is_horizontal:
1216
- x_match_min = max(input_bbox[0], match_bbox[0])
1217
- x_match_max = min(input_bbox[2], match_bbox[2])
1218
- overlap = max(0, x_match_max - x_match_min)
1219
- input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
445
+ tuple or None: The intersection bounding box in the specified format, or None if there is no intersection.
446
+ """
447
+ bbox1 = np.array(bbox1)
448
+ bbox2 = np.array(bbox2)
449
+ # Convert both bounding boxes to rectangles
450
+ rect1 = bbox1 if len(bbox1.shape) == 1 else convert_points_to_boxes([bbox1])[0]
451
+ rect2 = bbox2 if len(bbox2.shape) == 1 else convert_points_to_boxes([bbox2])[0]
452
+
453
+ # Calculate the intersection rectangle
454
+
455
+ x_min_inter = max(rect1[0], rect2[0])
456
+ y_min_inter = max(rect1[1], rect2[1])
457
+ x_max_inter = min(rect1[2], rect2[2])
458
+ y_max_inter = min(rect1[3], rect2[3])
459
+
460
+ # Check if there is an intersection
461
+ if x_min_inter >= x_max_inter or y_min_inter >= y_max_inter:
462
+ return None
463
+
464
+ if return_format == "bbox":
465
+ return np.array([x_min_inter, y_min_inter, x_max_inter, y_max_inter])
466
+ elif return_format == "poly":
467
+ return np.array(
468
+ [
469
+ [x_min_inter, y_min_inter],
470
+ [x_max_inter, y_min_inter],
471
+ [x_max_inter, y_max_inter],
472
+ [x_min_inter, y_max_inter],
473
+ ],
474
+ dtype=np.int16,
475
+ )
1220
476
  else:
1221
- y_match_min = max(input_bbox[1], match_bbox[1])
1222
- y_match_max = min(input_bbox[3], match_bbox[3])
1223
- overlap = max(0, y_match_max - y_match_min)
1224
- input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
477
+ raise ValueError("return_format must be either 'bbox' or 'poly'.")
1225
478
 
1226
- return overlap / input_width if input_width > 0 else 0.0
1227
479
 
1228
-
1229
- def _get_sub_category(
1230
- blocks: List[Dict[str, Any]], title_labels: List[str]
1231
- ) -> Tuple[List[Dict[str, Any]], List[float]]:
480
+ def shrink_supplement_region_bbox(
481
+ supplement_region_bbox,
482
+ ref_region_bbox,
483
+ image_width,
484
+ image_height,
485
+ block_idxes_set,
486
+ block_bboxes,
487
+ ) -> List:
1232
488
  """
1233
- Determine the layout of title and text blocks and collect pre_cuts.
489
+ Shrink the supplement region bbox according to the reference region bbox and match the block bboxes.
1234
490
 
1235
491
  Args:
1236
- blocks (List[Dict[str, Any]]): List of block dictionaries.
1237
- title_labels (List[str]): List of labels considered as titles.
492
+ supplement_region_bbox (list): The supplement region bbox.
493
+ ref_region_bbox (list): The reference region bbox.
494
+ image_width (int): The width of the image.
495
+ image_height (int): The height of the image.
496
+ block_idxes_set (set): The indexes of the blocks that intersect with the region bbox.
497
+ block_bboxes (dict): The dictionary of block bboxes.
1238
498
 
1239
499
  Returns:
1240
- List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
1241
- Dict[float]: Dict of pre_cuts coordinates.
1242
- """
1243
-
1244
- sub_title_labels = ["paragraph_title"]
1245
- vision_labels = ["image", "table", "chart", "figure"]
1246
- vision_title_labels = ["figure_title", "chart_title", "table_title"]
1247
- all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
1248
- special_pre_cut_labels = sub_title_labels
1249
-
1250
- # single doc title is irregular,pre cut not applicable
1251
- num_doc_title = 0
1252
- for block in blocks:
1253
- if block["block_label"] == "doc_title":
1254
- num_doc_title += 1
1255
- if num_doc_title == 2:
1256
- special_pre_cut_labels = title_labels + sub_title_labels
1257
- break
1258
- if len(blocks) == 0:
1259
- return blocks, {}
1260
-
1261
- min_x = min(block["block_bbox"][0] for block in blocks)
1262
- min_y = min(block["block_bbox"][1] for block in blocks)
1263
- max_x = max(block["block_bbox"][2] for block in blocks)
1264
- max_y = max(block["block_bbox"][3] for block in blocks)
1265
- region_bbox = (min_x, min_y, max_x, max_y)
1266
- region_x_center = (region_bbox[0] + region_bbox[2]) / 2
1267
- region_y_center = (region_bbox[1] + region_bbox[3]) / 2
1268
- region_width = region_bbox[2] - region_bbox[0]
1269
- region_height = region_bbox[3] - region_bbox[1]
1270
-
1271
- pre_cuts = {}
1272
-
1273
- for i, block1 in enumerate(blocks):
1274
- block1.setdefault("title_text", [])
1275
- block1.setdefault("sub_title", [])
1276
- block1.setdefault("vision_footnote", [])
1277
- block1.setdefault("sub_label", block1["block_label"])
1278
-
1279
- if block1["block_label"] not in all_labels:
1280
- continue
1281
-
1282
- bbox1 = block1["block_bbox"]
1283
- x1, y1, x2, y2 = bbox1
1284
- is_horizontal_1 = _get_bbox_direction(block1["block_bbox"])
1285
- left_up_title_text_distance = float("inf")
1286
- left_up_title_text_index = -1
1287
- left_up_title_text_direction = None
1288
- right_down_title_text_distance = float("inf")
1289
- right_down_title_text_index = -1
1290
- right_down_title_text_direction = None
1291
-
1292
- # pre-cuts
1293
- # Condition 1: Length is greater than half of the layout region
1294
- if is_horizontal_1:
1295
- block_length = x2 - x1
1296
- required_length = region_width / 2
1297
- else:
1298
- block_length = y2 - y1
1299
- required_length = region_height / 2
1300
- if block1["block_label"] in special_pre_cut_labels:
1301
- length_condition = True
1302
- else:
1303
- length_condition = block_length > required_length
1304
-
1305
- # Condition 2: Centered check (must be within ±20 in both horizontal and vertical directions)
1306
- block_x_center = (x1 + x2) / 2
1307
- block_y_center = (y1 + y2) / 2
1308
- tolerance_len = block_length // 5
1309
- if block1["block_label"] in special_pre_cut_labels:
1310
- tolerance_len = block_length // 10
1311
- if is_horizontal_1:
1312
- is_centered = abs(block_x_center - region_x_center) <= tolerance_len
1313
- else:
1314
- is_centered = abs(block_y_center - region_y_center) <= tolerance_len
1315
-
1316
- # Condition 3: Check for surrounding text
1317
- has_left_text = False
1318
- has_right_text = False
1319
- has_above_text = False
1320
- has_below_text = False
1321
- for block2 in blocks:
1322
- if block2["block_label"] != "text":
1323
- continue
1324
- bbox2 = block2["block_bbox"]
1325
- x1_2, y1_2, x2_2, y2_2 = bbox2
1326
- if is_horizontal_1:
1327
- if x2_2 <= x1 and not (y2_2 <= y1 or y1_2 >= y2):
1328
- has_left_text = True
1329
- if x1_2 >= x2 and not (y2_2 <= y1 or y1_2 >= y2):
1330
- has_right_text = True
1331
- else:
1332
- if y2_2 <= y1 and not (x2_2 <= x1 or x1_2 >= x2):
1333
- has_above_text = True
1334
- if y1_2 >= y2 and not (x2_2 <= x1 or x1_2 >= x2):
1335
- has_below_text = True
1336
-
1337
- if (is_horizontal_1 and has_left_text and has_right_text) or (
1338
- not is_horizontal_1 and has_above_text and has_below_text
1339
- ):
1340
- break
1341
-
1342
- no_text_on_sides = (
1343
- not (has_left_text or has_right_text)
1344
- if is_horizontal_1
1345
- else not (has_above_text or has_below_text)
1346
- )
1347
-
1348
- # Add coordinates if all conditions are met
1349
- if is_centered and length_condition and no_text_on_sides:
1350
- if is_horizontal_1:
1351
- pre_cuts.setdefault("y", []).append(y1)
1352
- else:
1353
- pre_cuts.setdefault("x", []).append(x1)
1354
-
1355
- for j, block2 in enumerate(blocks):
1356
- if i == j:
1357
- continue
1358
-
1359
- bbox2 = block2["block_bbox"]
1360
- x1_prime, y1_prime, x2_prime, y2_prime = bbox2
1361
- is_horizontal_2 = _get_bbox_direction(bbox2)
1362
- match_block_iou = _get_projection_iou(
1363
- bbox2,
1364
- bbox1,
1365
- is_horizontal_1,
500
+ list: The new region bbox and the matched block idxes.
501
+ """
502
+ x1, y1, x2, y2 = supplement_region_bbox
503
+ x1_prime, y1_prime, x2_prime, y2_prime = ref_region_bbox
504
+ index_conversion_map = {0: 2, 1: 3, 2: 0, 3: 1}
505
+ edge_distance_list = [
506
+ (x1_prime - x1) / image_width,
507
+ (y1_prime - y1) / image_height,
508
+ (x2 - x2_prime) / image_width,
509
+ (y2 - y2_prime) / image_height,
510
+ ]
511
+ edge_distance_list_tmp = deepcopy(edge_distance_list)
512
+ min_distance = min(edge_distance_list)
513
+ src_index = index_conversion_map[edge_distance_list.index(min_distance)]
514
+ if len(block_idxes_set) == 0:
515
+ return supplement_region_bbox, []
516
+ for _ in range(3):
517
+ dst_index = index_conversion_map[src_index]
518
+ tmp_region_bbox = supplement_region_bbox[:]
519
+ tmp_region_bbox[dst_index] = ref_region_bbox[src_index]
520
+ iner_block_idxes, split_block_idxes = [], []
521
+ for block_idx in block_idxes_set:
522
+ overlap_ratio = calculate_overlap_ratio(
523
+ tmp_region_bbox, block_bboxes[block_idx], mode="small"
1366
524
  )
1367
-
1368
- def distance_(is_horizontal, is_left_up):
1369
- if is_horizontal:
1370
- if is_left_up:
1371
- return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
1372
- else:
1373
- return (y1_prime - y2 + 2) // 5 + x1_prime / 5000
1374
-
1375
- else:
1376
- if is_left_up:
1377
- return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
1378
- else:
1379
- return (x1_prime - x2 + 2) // 5 + y1_prime / 5000
1380
-
1381
- block_iou_threshold = 0.1
1382
- if block1["block_label"] in sub_title_labels:
1383
- block_iou_threshold = 0.5
1384
-
1385
- if is_horizontal_1:
1386
- if match_block_iou >= block_iou_threshold:
1387
- left_up_distance = distance_(True, True)
1388
- right_down_distance = distance_(True, False)
1389
- if (
1390
- y2_prime <= y1
1391
- and left_up_distance <= left_up_title_text_distance
1392
- ):
1393
- left_up_title_text_distance = left_up_distance
1394
- left_up_title_text_index = j
1395
- left_up_title_text_direction = is_horizontal_2
1396
- elif (
1397
- y1_prime > y2
1398
- and right_down_distance < right_down_title_text_distance
1399
- ):
1400
- right_down_title_text_distance = right_down_distance
1401
- right_down_title_text_index = j
1402
- right_down_title_text_direction = is_horizontal_2
1403
- else:
1404
- if match_block_iou >= block_iou_threshold:
1405
- left_up_distance = distance_(False, True)
1406
- right_down_distance = distance_(False, False)
1407
- if (
1408
- x2_prime <= x1
1409
- and left_up_distance <= left_up_title_text_distance
1410
- ):
1411
- left_up_title_text_distance = left_up_distance
1412
- left_up_title_text_index = j
1413
- left_up_title_text_direction = is_horizontal_2
1414
- elif (
1415
- x1_prime > x2
1416
- and right_down_distance < right_down_title_text_distance
1417
- ):
1418
- right_down_title_text_distance = right_down_distance
1419
- right_down_title_text_index = j
1420
- right_down_title_text_direction = is_horizontal_2
1421
-
1422
- height = bbox1[3] - bbox1[1]
1423
- width = bbox1[2] - bbox1[0]
1424
- title_text_weight = [0.8, 0.8]
1425
-
1426
- title_text, sub_title, vision_footnote = [], [], []
1427
-
1428
- def get_sub_category_(
1429
- title_text_direction,
1430
- title_text_index,
1431
- label,
1432
- is_left_up=True,
1433
- ):
1434
- direction_ = [1, 3] if is_left_up else [2, 4]
1435
- if (
1436
- title_text_direction == is_horizontal_1
1437
- and title_text_index != -1
1438
- and (label == "text" or label == "paragraph_title")
525
+ if overlap_ratio > REGION_SETTINGS.get(
526
+ "match_block_overlap_ratio_threshold", 0.8
1439
527
  ):
1440
- bbox2 = blocks[title_text_index]["block_bbox"]
1441
- if is_horizontal_1:
1442
- height1 = bbox2[3] - bbox2[1]
1443
- width1 = bbox2[2] - bbox2[0]
1444
- if label == "text":
1445
- if (
1446
- _nearest_edge_distance(bbox1, bbox2)[0] <= 15
1447
- and block1["block_label"] in vision_labels
1448
- and width1 < width
1449
- and height1 < 0.5 * height
1450
- ):
1451
- blocks[title_text_index]["sub_label"] = "vision_footnote"
1452
- vision_footnote.append(bbox2)
1453
- elif (
1454
- height1 < height * title_text_weight[0]
1455
- and (width1 < width or width1 > 1.5 * width)
1456
- and block1["block_label"] in title_labels
1457
- ):
1458
- blocks[title_text_index]["sub_label"] = "title_text"
1459
- title_text.append((direction_[0], bbox2))
1460
- elif (
1461
- label == "paragraph_title"
1462
- and block1["block_label"] in sub_title_labels
1463
- ):
1464
- sub_title.append(bbox2)
1465
- else:
1466
- height1 = bbox2[3] - bbox2[1]
1467
- width1 = bbox2[2] - bbox2[0]
1468
- if label == "text":
1469
- if (
1470
- _nearest_edge_distance(bbox1, bbox2)[0] <= 15
1471
- and block1["block_label"] in vision_labels
1472
- and height1 < height
1473
- and width1 < 0.5 * width
1474
- ):
1475
- blocks[title_text_index]["sub_label"] = "vision_footnote"
1476
- vision_footnote.append(bbox2)
1477
- elif (
1478
- width1 < width * title_text_weight[1]
1479
- and block1["block_label"] in title_labels
1480
- ):
1481
- blocks[title_text_index]["sub_label"] = "title_text"
1482
- title_text.append((direction_[1], bbox2))
1483
- elif (
1484
- label == "paragraph_title"
1485
- and block1["block_label"] in sub_title_labels
1486
- ):
1487
- sub_title.append(bbox2)
1488
-
1489
- if (
1490
- is_horizontal_1
1491
- and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
1492
- > height
1493
- ) or (
1494
- not is_horizontal_1
1495
- and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
1496
- > width
1497
- ):
1498
- if left_up_title_text_distance < right_down_title_text_distance:
1499
- get_sub_category_(
1500
- left_up_title_text_direction,
1501
- left_up_title_text_index,
1502
- blocks[left_up_title_text_index]["block_label"],
1503
- True,
1504
- )
1505
- else:
1506
- get_sub_category_(
1507
- right_down_title_text_direction,
1508
- right_down_title_text_index,
1509
- blocks[right_down_title_text_index]["block_label"],
1510
- False,
1511
- )
1512
- else:
1513
- get_sub_category_(
1514
- left_up_title_text_direction,
1515
- left_up_title_text_index,
1516
- blocks[left_up_title_text_index]["block_label"],
1517
- True,
1518
- )
1519
- get_sub_category_(
1520
- right_down_title_text_direction,
1521
- right_down_title_text_index,
1522
- blocks[right_down_title_text_index]["block_label"],
1523
- False,
1524
- )
1525
-
1526
- if block1["block_label"] in title_labels:
1527
- if blocks[i].get("title_text") == []:
1528
- blocks[i]["title_text"] = title_text
1529
-
1530
- if block1["block_label"] in sub_title_labels:
1531
- if blocks[i].get("sub_title") == []:
1532
- blocks[i]["sub_title"] = sub_title
1533
-
1534
- if block1["block_label"] in vision_labels:
1535
- if blocks[i].get("vision_footnote") == []:
1536
- blocks[i]["vision_footnote"] = vision_footnote
1537
-
1538
- return blocks, pre_cuts
1539
-
1540
-
1541
- def get_layout_ordering(
1542
- parsing_res_list: List[Dict[str, Any]],
1543
- no_mask_labels: List[str] = [],
1544
- ) -> None:
1545
- """
1546
- Process layout parsing results to remove overlapping bounding boxes
1547
- and assign an ordering index based on their positions.
1548
-
1549
- Modifies:
1550
- The 'parsing_res_list' list by adding an 'index' to each block.
1551
-
1552
- Args:
1553
- parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
1554
- no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
1555
- """
1556
- title_text_labels = ["doc_title"]
1557
- title_labels = ["doc_title", "paragraph_title"]
1558
- vision_labels = ["image", "table", "seal", "chart", "figure"]
1559
- vision_title_labels = ["table_title", "chart_title", "figure_title"]
1560
-
1561
- parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
1562
-
1563
- parsing_res_by_pre_cuts_list = []
1564
- if len(pre_cuts) > 0:
1565
- block_bboxes = [block["block_bbox"] for block in parsing_res_list]
1566
- for axis, cuts in pre_cuts.items():
1567
- axis_index = 1 if axis == "y" else 0
1568
-
1569
- max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
1570
-
1571
- intervals = []
1572
- prev = 0
1573
- for cut in sorted(cuts):
1574
- intervals.append((prev, cut))
1575
- prev = cut
1576
- intervals.append((prev, max_val))
1577
-
1578
- for start, end in intervals:
1579
- mask = [
1580
- (bbox[axis_index] >= start) and (bbox[axis_index] < end)
1581
- for bbox in block_bboxes
1582
- ]
1583
- parsing_res_by_pre_cuts_list.append(
1584
- [parsing_res_list[i] for i, m in enumerate(mask) if m]
1585
- )
1586
- else:
1587
- parsing_res_by_pre_cuts_list = [parsing_res_list]
1588
-
1589
- final_parsing_res_list = []
1590
- num_index = 0
1591
- num_sub_index = 0
1592
- for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
1593
-
1594
- doc_flag = False
1595
- median_width = _get_text_median_width(parsing_res_by_pre_cuts)
1596
- parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
1597
- parsing_res_by_pre_cuts,
1598
- median_width,
1599
- no_mask_labels=no_mask_labels,
1600
- threshold=0.3,
1601
- )
1602
- # Convert bounding boxes to float and remove overlaps
1603
- (
1604
- double_text_blocks,
1605
- title_text_blocks,
1606
- title_blocks,
1607
- vision_blocks,
1608
- vision_title_blocks,
1609
- vision_footnote_blocks,
1610
- other_blocks,
1611
- ) = ([], [], [], [], [], [], [])
1612
-
1613
- drop_indexes = []
1614
-
1615
- for index, block in enumerate(parsing_res_by_pre_cuts):
1616
- label = block["sub_label"]
1617
- block["block_bbox"] = list(map(int, block["block_bbox"]))
1618
-
1619
- if label == "doc_title":
1620
- doc_flag = True
1621
-
1622
- if label in no_mask_labels:
1623
- if block["layout"] == "double":
1624
- double_text_blocks.append(block)
1625
- drop_indexes.append(index)
1626
- elif label == "title_text":
1627
- title_text_blocks.append(block)
1628
- drop_indexes.append(index)
1629
- elif label == "vision_footnote":
1630
- vision_footnote_blocks.append(block)
1631
- drop_indexes.append(index)
1632
- elif label in vision_title_labels:
1633
- vision_title_blocks.append(block)
1634
- drop_indexes.append(index)
1635
- elif label in title_labels:
1636
- title_blocks.append(block)
1637
- drop_indexes.append(index)
1638
- elif label in vision_labels:
1639
- vision_blocks.append(block)
1640
- drop_indexes.append(index)
1641
- else:
1642
- other_blocks.append(block)
1643
- drop_indexes.append(index)
1644
-
1645
- for index in sorted(drop_indexes, reverse=True):
1646
- del parsing_res_by_pre_cuts[index]
1647
-
1648
- if len(parsing_res_by_pre_cuts) > 0:
1649
- # single text label
1650
- if (
1651
- len(double_text_blocks) > len(parsing_res_by_pre_cuts)
1652
- or projection_direction
528
+ iner_block_idxes.append(block_idx)
529
+ elif overlap_ratio > REGION_SETTINGS.get(
530
+ "split_block_overlap_ratio_threshold", 0.4
1653
531
  ):
1654
- parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
1655
- title_blocks = []
1656
- double_text_blocks = []
1657
- block_bboxes = [
1658
- block["block_bbox"] for block in parsing_res_by_pre_cuts
1659
- ]
1660
- block_bboxes.sort(
1661
- key=lambda x: (
1662
- x[0] // max(20, median_width),
1663
- x[1],
1664
- ),
1665
- )
1666
- block_bboxes = np.array(block_bboxes)
1667
- sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
1668
- else:
1669
- block_bboxes = [
1670
- block["block_bbox"] for block in parsing_res_by_pre_cuts
1671
- ]
1672
- block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
1673
- block_bboxes = np.array(block_bboxes)
1674
- sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
1675
-
1676
- sorted_boxes = block_bboxes[sorted_indices].tolist()
1677
-
1678
- for block in parsing_res_by_pre_cuts:
1679
- block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
1680
- block["sub_index"] = (
1681
- num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
1682
- )
1683
-
1684
- def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
1685
- for block in input_blocks:
1686
- bbox = block["block_bbox"]
1687
- min_distance = float("inf")
1688
- min_distance_config = [
1689
- [float("inf"), float("inf")],
1690
- float("inf"),
1691
- float("inf"),
1692
- ] # for double text
1693
- nearest_gt_index = 0
1694
- for match_block in parsing_res_by_pre_cuts:
1695
- match_bbox = match_block["block_bbox"]
1696
- if distance_type == "nearest_iou_edge_distance":
1697
- distance, min_distance_config = _nearest_iou_edge_distance(
1698
- bbox,
1699
- match_bbox,
1700
- block["sub_label"],
1701
- vision_labels=vision_labels,
1702
- no_mask_labels=no_mask_labels,
1703
- median_width=median_width,
1704
- title_labels=title_labels,
1705
- title_text=block["title_text"],
1706
- sub_title=block["sub_title"],
1707
- min_distance_config=min_distance_config,
1708
- tolerance_len=10,
1709
- )
1710
- elif distance_type == "title_text":
1711
- if (
1712
- match_block["block_label"] in title_labels + ["abstract"]
1713
- and match_block["title_text"] != []
1714
- ):
1715
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1716
- bbox,
1717
- match_block["title_text"][0][1],
1718
- )
1719
- iou_right_down = (
1720
- _calculate_overlap_area_div_minbox_area_ratio(
1721
- bbox,
1722
- match_block["title_text"][-1][1],
1723
- )
1724
- )
1725
- iou = 1 - max(iou_left_up, iou_right_down)
1726
- distance = _manhattan_distance(bbox, match_bbox) * iou
1727
- else:
1728
- distance = float("inf")
1729
- elif distance_type == "manhattan":
1730
- distance = _manhattan_distance(bbox, match_bbox)
1731
- elif distance_type == "vision_footnote":
1732
- if (
1733
- match_block["block_label"] in vision_labels
1734
- and match_block["vision_footnote"] != []
1735
- ):
1736
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1737
- bbox,
1738
- match_block["vision_footnote"][0],
1739
- )
1740
- iou_right_down = (
1741
- _calculate_overlap_area_div_minbox_area_ratio(
1742
- bbox,
1743
- match_block["vision_footnote"][-1],
1744
- )
1745
- )
1746
- iou = 1 - max(iou_left_up, iou_right_down)
1747
- distance = _manhattan_distance(bbox, match_bbox) * iou
1748
- else:
1749
- distance = float("inf")
1750
- elif distance_type == "vision_body":
1751
- if (
1752
- match_block["block_label"] in vision_title_labels
1753
- and block["vision_footnote"] != []
1754
- ):
1755
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1756
- match_bbox,
1757
- block["vision_footnote"][0],
1758
- )
1759
- iou_right_down = (
1760
- _calculate_overlap_area_div_minbox_area_ratio(
1761
- match_bbox,
1762
- block["vision_footnote"][-1],
1763
- )
1764
- )
1765
- iou = 1 - max(iou_left_up, iou_right_down)
1766
- distance = _manhattan_distance(bbox, match_bbox) * iou
1767
- else:
1768
- distance = float("inf")
1769
- # when reference block cross mulitple columns, its order should be after the blocks above it.
1770
- elif distance_type == "append":
1771
- if match_bbox[3] <= bbox[1]:
1772
- distance = -(match_bbox[2] * 10 + match_bbox[3])
1773
- else:
1774
- distance = float("inf")
1775
- else:
1776
- raise NotImplementedError
1777
-
1778
- if distance < min_distance:
1779
- min_distance = distance
1780
- if is_add_index:
1781
- nearest_gt_index = match_block.get("index", 999)
1782
- else:
1783
- nearest_gt_index = match_block.get("sub_index", 999)
1784
-
1785
- if is_add_index:
1786
- block["index"] = nearest_gt_index
1787
- else:
1788
- block["sub_index"] = nearest_gt_index
1789
-
1790
- parsing_res_by_pre_cuts.append(block)
1791
-
1792
- # double text label
1793
- double_text_blocks.sort(
1794
- key=lambda x: (
1795
- x["block_bbox"][1] // 10,
1796
- x["block_bbox"][0] // median_width,
1797
- x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
1798
- ),
1799
- )
1800
- # filter the reference blocks from all blocks that cross mulitple columns.
1801
- # they should be ordered using "append".
1802
- double_text_reference_blocks = []
1803
- i = 0
1804
- while i < len(double_text_blocks):
1805
- if double_text_blocks[i]["block_label"] == "reference":
1806
- double_text_reference_blocks.append(double_text_blocks.pop(i))
1807
- else:
1808
- i += 1
1809
- nearest_match_(
1810
- double_text_blocks,
1811
- distance_type="nearest_iou_edge_distance",
1812
- )
1813
- nearest_match_(
1814
- double_text_reference_blocks,
1815
- distance_type="append",
1816
- )
1817
- parsing_res_by_pre_cuts.sort(
1818
- key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
1819
- )
1820
-
1821
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1822
- block["index"] = num_index + idx + 1
1823
- block["sub_index"] = num_sub_index + idx + 1
1824
-
1825
- # title label
1826
- title_blocks.sort(
1827
- key=lambda x: (
1828
- x["block_bbox"][1] // 10,
1829
- x["block_bbox"][0] // median_width,
1830
- x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
1831
- ),
1832
- )
1833
- nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
1834
-
1835
- if doc_flag:
1836
- text_sort_labels = ["doc_title"]
1837
- text_label_priority = {
1838
- label: priority for priority, label in enumerate(text_sort_labels)
1839
- }
1840
- doc_titles = []
1841
- for i, block in enumerate(parsing_res_by_pre_cuts):
1842
- if block["block_label"] == "doc_title":
1843
- doc_titles.append(
1844
- (i, block["block_bbox"][1], block["block_bbox"][0]),
532
+ split_block_idxes.append(block_idx)
533
+
534
+ if len(iner_block_idxes) > 0:
535
+ if len(split_block_idxes) > 0:
536
+ for split_block_idx in split_block_idxes:
537
+ split_block_bbox = block_bboxes[split_block_idx]
538
+ x1, y1, x2, y2 = tmp_region_bbox
539
+ x1_prime, y1_prime, x2_prime, y2_prime = split_block_bbox
540
+ edge_distance_list = [
541
+ (x1_prime - x1) / image_width,
542
+ (y1_prime - y1) / image_height,
543
+ (x2 - x2_prime) / image_width,
544
+ (y2 - y2_prime) / image_height,
545
+ ]
546
+ max_distance = max(edge_distance_list)
547
+ src_index = edge_distance_list.index(max_distance)
548
+ dst_index = index_conversion_map[src_index]
549
+ tmp_region_bbox[dst_index] = split_block_bbox[src_index]
550
+ tmp_region_bbox, iner_idxes = shrink_supplement_region_bbox(
551
+ tmp_region_bbox,
552
+ ref_region_bbox,
553
+ image_width,
554
+ image_height,
555
+ iner_block_idxes,
556
+ block_bboxes,
1845
557
  )
1846
- doc_titles.sort(key=lambda x: (x[1], x[2]))
1847
- first_doc_title_index = doc_titles[0][0]
1848
- parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
1849
- parsing_res_by_pre_cuts.sort(
1850
- key=lambda x: (
1851
- x["index"],
1852
- text_label_priority.get(x["block_label"], 9999),
1853
- x["block_bbox"][1],
1854
- x["block_bbox"][0],
1855
- ),
1856
- )
558
+ if len(iner_idxes) == 0:
559
+ continue
560
+ matched_bboxes = [block_bboxes[idx] for idx in iner_block_idxes]
561
+ supplement_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
562
+ break
1857
563
  else:
1858
- parsing_res_by_pre_cuts.sort(
1859
- key=lambda x: (
1860
- x["index"],
1861
- x["block_bbox"][1],
1862
- x["block_bbox"][0],
1863
- ),
1864
- )
564
+ edge_distance_list_tmp.remove(min_distance)
565
+ min_distance = min(edge_distance_list_tmp)
566
+ src_index = index_conversion_map[edge_distance_list.index(min_distance)]
567
+ return supplement_region_bbox, iner_block_idxes
1865
568
 
1866
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1867
- block["index"] = num_index + idx + 1
1868
- block["sub_index"] = num_sub_index + idx + 1
1869
569
 
1870
- # title-text label
1871
- nearest_match_(title_text_blocks, distance_type="title_text")
570
+ def update_region_box(bbox, region_box):
571
+ """Update region box with bbox"""
572
+ if region_box is None:
573
+ return bbox
1872
574
 
1873
- def hor_tb_and_ver_lr(x):
1874
- input_bbox = x["block_bbox"]
1875
- is_horizontal = _get_bbox_direction(input_bbox)
1876
- if is_horizontal:
1877
- return input_bbox[1]
1878
- else:
1879
- return input_bbox[0]
575
+ x1, y1, x2, y2 = bbox
576
+ x1_region, y1_region, x2_region, y2_region = region_box
1880
577
 
1881
- parsing_res_by_pre_cuts.sort(
1882
- key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
1883
- )
578
+ x1_region = int(min(x1, x1_region))
579
+ y1_region = int(min(y1, y1_region))
580
+ x2_region = int(max(x2, x2_region))
581
+ y2_region = int(max(y2, y2_region))
1884
582
 
1885
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1886
- block["index"] = num_index + idx + 1
1887
- block["sub_index"] = num_sub_index + idx + 1
583
+ region_box = [x1_region, y1_region, x2_region, y2_region]
1888
584
 
1889
- # image,figure,chart,seal label
1890
- nearest_match_(
1891
- vision_blocks,
1892
- distance_type="nearest_iou_edge_distance",
1893
- is_add_index=False,
1894
- )
1895
- parsing_res_by_pre_cuts.sort(
1896
- key=lambda x: (
1897
- x["sub_index"],
1898
- x["block_bbox"][1],
1899
- x["block_bbox"][0],
1900
- ),
1901
- )
585
+ return region_box
1902
586
 
1903
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1904
- block["sub_index"] = num_sub_index + idx + 1
1905
587
 
1906
- # image,figure,chart,seal title label
1907
- nearest_match_(
1908
- vision_title_blocks,
1909
- distance_type="nearest_iou_edge_distance",
1910
- is_add_index=False,
1911
- )
1912
- parsing_res_by_pre_cuts.sort(
1913
- key=lambda x: (
1914
- x["sub_index"],
1915
- x["block_bbox"][1],
1916
- x["block_bbox"][0],
1917
- ),
1918
- )
1919
-
1920
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1921
- block["sub_index"] = num_sub_index + idx + 1
1922
-
1923
- # vision footnote label
1924
- nearest_match_(
1925
- vision_footnote_blocks,
1926
- distance_type="vision_footnote",
1927
- is_add_index=False,
1928
- )
1929
- text_label_priority = {"vision_footnote": 9999}
1930
- parsing_res_by_pre_cuts.sort(
1931
- key=lambda x: (
1932
- x["sub_index"],
1933
- text_label_priority.get(x["sub_label"], 0),
1934
- x["block_bbox"][1],
1935
- x["block_bbox"][0],
1936
- ),
1937
- )
1938
-
1939
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1940
- block["sub_index"] = num_sub_index + idx + 1
1941
-
1942
- # header、footnote、header_image... label
1943
- nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
1944
-
1945
- # add all parsing result
1946
- final_parsing_res_list.extend(parsing_res_by_pre_cuts)
1947
-
1948
- # update num index
1949
- num_sub_index += len(parsing_res_by_pre_cuts)
1950
- for parsing_res in parsing_res_by_pre_cuts:
1951
- if parsing_res.get("index"):
1952
- num_index += 1
1953
-
1954
- parsing_res_list = [
1955
- {
1956
- "block_label": parsing_res["block_label"],
1957
- "block_content": parsing_res["block_content"],
1958
- "block_bbox": parsing_res["block_bbox"],
1959
- "block_image": parsing_res.get("block_image", None),
1960
- "sub_label": parsing_res["sub_label"],
1961
- "sub_index": parsing_res["sub_index"],
1962
- "index": parsing_res.get("index", None),
1963
- "seg_start_coordinate": parsing_res.get(
1964
- "seg_start_coordinate", float("inf")
1965
- ),
1966
- "seg_end_coordinate": parsing_res.get("seg_end_coordinate", float("-inf")),
1967
- "num_of_lines": parsing_res.get("num_of_lines", 1),
1968
- }
1969
- for parsing_res in final_parsing_res_list
1970
- ]
1971
-
1972
- return parsing_res_list
1973
-
1974
-
1975
- def _manhattan_distance(
1976
- point1: Tuple[float, float],
1977
- point2: Tuple[float, float],
1978
- weight_x: float = 1.0,
1979
- weight_y: float = 1.0,
1980
- ) -> float:
1981
- """
1982
- Calculate the weighted Manhattan distance between two points.
588
+ def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
589
+ """Convert formula result to OCR result format
1983
590
 
1984
591
  Args:
1985
- point1 (Tuple[float, float]): The first point as (x, y).
1986
- point2 (Tuple[float, float]): The second point as (x, y).
1987
- weight_x (float): The weight for the x-axis distance. Default is 1.0.
1988
- weight_y (float): The weight for the y-axis distance. Default is 1.0.
1989
-
592
+ formula_res_list (List): Formula results
593
+ ocr_res (dict): OCR result
1990
594
  Returns:
1991
- float: The weighted Manhattan distance between the two points.
1992
- """
1993
- return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
1994
-
595
+ ocr_res (dict): Updated OCR result
596
+ """
597
+ for formula_res in formula_res_list:
598
+ x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
599
+ poly_points = [
600
+ (x_min, y_min),
601
+ (x_max, y_min),
602
+ (x_max, y_max),
603
+ (x_min, y_max),
604
+ ]
605
+ ocr_res["dt_polys"].append(poly_points)
606
+ formula_res_text: str = formula_res["rec_formula"]
607
+ ocr_res["rec_texts"].append(formula_res_text)
608
+ if ocr_res["rec_boxes"].size == 0:
609
+ ocr_res["rec_boxes"] = np.array(formula_res["dt_polys"])
610
+ else:
611
+ ocr_res["rec_boxes"] = np.vstack(
612
+ (ocr_res["rec_boxes"], [formula_res["dt_polys"]])
613
+ )
614
+ ocr_res["rec_labels"].append("formula")
615
+ ocr_res["rec_polys"].append(poly_points)
616
+ ocr_res["rec_scores"].append(1)
1995
617
 
1996
- def _calculate_horizontal_distance(
1997
- input_bbox: List[int],
1998
- match_bbox: List[int],
1999
- height: int,
2000
- disperse: int,
2001
- title_text: List[Tuple[int, List[int]]],
2002
- ) -> float:
2003
- """
2004
- Calculate the horizontal distance between two bounding boxes, considering title text adjustments.
2005
618
 
2006
- Args:
2007
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2008
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2009
- height (int): The height of the input bounding box used for normalization.
2010
- disperse (int): The dispersion factor used to normalize the horizontal distance.
2011
- title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
2012
- Format: [(position_indicator, [x1, y1, x2, y2]), ...].
619
+ def caculate_bbox_area(bbox):
620
+ """Calculate bounding box area"""
621
+ x1, y1, x2, y2 = map(float, bbox)
622
+ area = abs((x2 - x1) * (y2 - y1))
623
+ return area
2013
624
 
2014
- Returns:
2015
- float: The calculated horizontal distance taking into account the title text adjustments.
2016
- """
2017
- x1, y1, x2, y2 = input_bbox
2018
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2019
-
2020
- # Determine vertical distance adjustment based on title text
2021
- if y2 < y1_prime:
2022
- if title_text and title_text[-1][0] == 2:
2023
- y2 += title_text[-1][1][3] - title_text[-1][1][1]
2024
- vertical_adjustment = (y1_prime - y2) * 0.5
2025
- else:
2026
- if title_text and title_text[0][0] == 1:
2027
- y1 -= title_text[0][1][3] - title_text[0][1][1]
2028
- vertical_adjustment = y1 - y2_prime
2029
-
2030
- # Calculate horizontal distance with adjustments
2031
- horizontal_distance = (
2032
- abs(x2_prime - x1) // disperse
2033
- + vertical_adjustment // height
2034
- + vertical_adjustment / 5000
2035
- )
2036
625
 
2037
- return horizontal_distance
626
+ def caculate_euclidean_dist(point1, point2):
627
+ """Calculate euclidean distance between two points"""
628
+ x1, y1 = point1
629
+ x2, y2 = point2
630
+ return ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5
2038
631
 
2039
632
 
2040
- def _calculate_vertical_distance(
2041
- input_bbox: List[int],
2042
- match_bbox: List[int],
2043
- width: int,
2044
- disperse: int,
2045
- title_text: List[Tuple[int, List[int]]],
2046
- ) -> float:
2047
- """
2048
- Calculate the vertical distance between two bounding boxes, considering title text adjustments.
633
+ def get_seg_flag(block, prev_block):
634
+ """Get segment start flag and end flag based on previous block
2049
635
 
2050
636
  Args:
2051
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2052
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2053
- width (int): The width of the input bounding box used for normalization.
2054
- disperse (int): The dispersion factor used to normalize the vertical distance.
2055
- title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
2056
- Format: [(position_indicator, [x1, y1, x2, y2]), ...].
637
+ block (Block): Current block
638
+ prev_block (Block): Previous block
2057
639
 
2058
640
  Returns:
2059
- float: The calculated vertical distance taking into account the title text adjustments.
641
+ seg_start_flag (bool): Segment start flag
642
+ seg_end_flag (bool): Segment end flag
2060
643
  """
2061
- x1, y1, x2, y2 = input_bbox
2062
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2063
-
2064
- # Determine horizontal distance adjustment based on title text
2065
- if x1 > x2_prime:
2066
- if title_text and title_text[0][0] == 3:
2067
- x1 -= title_text[0][1][2] - title_text[0][1][0]
2068
- horizontal_adjustment = (x1 - x2_prime) * 0.5
2069
- else:
2070
- if title_text and title_text[-1][0] == 4:
2071
- x2 += title_text[-1][1][2] - title_text[-1][1][0]
2072
- horizontal_adjustment = x1_prime - x2
2073
-
2074
- # Calculate vertical distance with adjustments
2075
- vertical_distance = (
2076
- abs(y2_prime - y1) // disperse
2077
- + horizontal_adjustment // width
2078
- + horizontal_adjustment / 5000
2079
- )
2080
644
 
2081
- return vertical_distance
645
+ seg_start_flag = True
646
+ seg_end_flag = True
2082
647
 
648
+ context_left_coordinate = block.start_coordinate
649
+ context_right_coordinate = block.end_coordinate
650
+ seg_start_coordinate = block.seg_start_coordinate
651
+ seg_end_coordinate = block.seg_end_coordinate
2083
652
 
2084
- def _nearest_edge_distance(
2085
- input_bbox: List[int],
2086
- match_bbox: List[int],
2087
- weight: List[float] = [1.0, 1.0, 1.0, 1.0],
2088
- label: str = "text",
2089
- no_mask_labels: List[str] = [],
2090
- min_edge_distance_config: List[float] = [],
2091
- tolerance_len: float = 10.0,
2092
- ) -> Tuple[float, List[float]]:
2093
- """
2094
- Calculate the nearest edge distance between two bounding boxes, considering directional weights.
653
+ if prev_block is not None:
654
+ num_of_prev_lines = prev_block.num_of_lines
655
+ pre_block_seg_end_coordinate = prev_block.seg_end_coordinate
656
+ prev_end_space_small = (
657
+ abs(prev_block.end_coordinate - pre_block_seg_end_coordinate) < 10
658
+ )
659
+ prev_lines_more_than_one = num_of_prev_lines > 1
2095
660
 
2096
- Args:
2097
- input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2098
- match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2099
- weight (list, optional): Directional weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
2100
- label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
2101
- no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
2102
- min_edge_distance_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
2103
- Defaults to [float('inf'), float('inf')].
2104
- tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.
661
+ overlap_blocks = (
662
+ context_left_coordinate < prev_block.end_coordinate
663
+ and context_right_coordinate > prev_block.start_coordinate
664
+ )
2105
665
 
2106
- Returns:
2107
- Tuple[float, List[float]]: A tuple containing:
2108
- - The calculated minimum edge distance between the bounding boxes.
2109
- - A list with the minimum edge distances in the x and y directions.
2110
- """
2111
- match_bbox_iou = _calculate_overlap_area_div_minbox_area_ratio(
2112
- input_bbox,
2113
- match_bbox,
2114
- )
2115
- if match_bbox_iou > 0 and label not in no_mask_labels:
2116
- return 0, [0, 0]
2117
-
2118
- if not min_edge_distance_config:
2119
- min_edge_distance_config = [float("inf"), float("inf")]
2120
- min_edge_distance_x, min_edge_distance_y = min_edge_distance_config
2121
-
2122
- x1, y1, x2, y2 = input_bbox
2123
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2124
-
2125
- direction_num = 0
2126
- distance_x = float("inf")
2127
- distance_y = float("inf")
2128
- distance = [float("inf")] * 4
2129
-
2130
- # input_bbox is to the left of match_bbox
2131
- if x2 < x1_prime:
2132
- direction_num += 1
2133
- distance[0] = x1_prime - x2
2134
- if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
2135
- distance_x = min_edge_distance_x * weight[0]
2136
- else:
2137
- distance_x = distance[0] * weight[0]
2138
- # input_bbox is to the right of match_bbox
2139
- elif x1 > x2_prime:
2140
- direction_num += 1
2141
- distance[1] = x1 - x2_prime
2142
- if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
2143
- distance_x = min_edge_distance_x * weight[1]
2144
- else:
2145
- distance_x = distance[1] * weight[1]
2146
- elif match_bbox_iou > 0:
2147
- distance[0] = 0
2148
- distance_x = 0
2149
-
2150
- # input_bbox is above match_bbox
2151
- if y2 < y1_prime:
2152
- direction_num += 1
2153
- distance[2] = y1_prime - y2
2154
- if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
2155
- distance_y = min_edge_distance_y * weight[2]
2156
- else:
2157
- distance_y = distance[2] * weight[2]
2158
- if label in no_mask_labels:
2159
- distance_y = max(0.1, distance_y) * 10 # for abstract
2160
- # input_bbox is below match_bbox
2161
- elif y1 > y2_prime:
2162
- direction_num += 1
2163
- distance[3] = y1 - y2_prime
2164
- if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
2165
- distance_y = min_edge_distance_y * weight[3]
666
+ # update context_left_coordinate and context_right_coordinate
667
+ if overlap_blocks:
668
+ context_left_coordinate = min(
669
+ prev_block.start_coordinate, context_left_coordinate
670
+ )
671
+ context_right_coordinate = max(
672
+ prev_block.end_coordinate, context_right_coordinate
673
+ )
674
+ prev_end_space_small = (
675
+ abs(context_right_coordinate - pre_block_seg_end_coordinate) < 10
676
+ )
677
+ edge_distance = 0
2166
678
  else:
2167
- distance_y = distance[3] * weight[3]
2168
- elif match_bbox_iou > 0:
2169
- distance[2] = 0
2170
- distance_y = 0
2171
-
2172
- if direction_num == 2:
2173
- return (distance_x + distance_y), [
2174
- min(distance[0], distance[1]),
2175
- min(distance[2], distance[3]),
2176
- ]
2177
- else:
2178
- return min(distance_x, distance_y), [
2179
- min(distance[0], distance[1]),
2180
- min(distance[2], distance[3]),
2181
- ]
679
+ edge_distance = abs(block.start_coordinate - prev_block.end_coordinate)
2182
680
 
681
+ current_start_space_small = seg_start_coordinate - context_left_coordinate < 10
2183
682
 
2184
- def _get_weights(label, horizontal):
2185
- """Define weights based on the label and orientation."""
2186
- if label == "doc_title":
2187
- return (
2188
- [1, 0.1, 0.1, 1] if horizontal else [0.2, 0.1, 1, 1]
2189
- ) # left-down , right-left
2190
- elif label in [
2191
- "paragraph_title",
2192
- "table_title",
2193
- "abstract",
2194
- "image",
2195
- "seal",
2196
- "chart",
2197
- "figure",
2198
- ]:
2199
- return [1, 1, 0.1, 1] # down
683
+ if (
684
+ prev_end_space_small
685
+ and current_start_space_small
686
+ and prev_lines_more_than_one
687
+ and edge_distance < max(prev_block.width, block.width)
688
+ ):
689
+ seg_start_flag = False
2200
690
  else:
2201
- return [1, 1, 1, 0.1] # up
2202
-
2203
-
2204
- def _nearest_iou_edge_distance(
2205
- input_bbox: List[int],
2206
- match_bbox: List[int],
2207
- label: str,
2208
- vision_labels: List[str],
2209
- no_mask_labels: List[str],
2210
- median_width: int = -1,
2211
- title_labels: List[str] = [],
2212
- title_text: List[Tuple[int, List[int]]] = [],
2213
- sub_title: List[List[int]] = [],
2214
- min_distance_config: List[float] = [],
2215
- tolerance_len: float = 10.0,
2216
- ) -> Tuple[float, List[float]]:
2217
- """
2218
- Calculate the nearest IOU edge distance between two bounding boxes, considering label types, title adjustments, and minimum distance configurations.
2219
- This function computes the edge distance between two bounding boxes while considering their overlap (IOU) and various adjustments based on label types,
2220
- title text, and subtitle information. It also applies minimum distance configurations and tolerance adjustments.
2221
-
2222
- Args:
2223
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2224
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2225
- label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
2226
- vision_labels (List[str]): List of labels for vision-related objects (e.g., images, icons).
2227
- no_mask_labels (List[str]): Labels for which no masking is applied when calculating edge distances.
2228
- median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
2229
- title_labels (List[str], optional): Labels that indicate the object is a title. Defaults to an empty list.
2230
- title_text (List[Tuple[int, List[int]]], optional): Text content associated with title labels, in the format [(position_indicator, [x1, y1, x2, y2]), ...].
2231
- sub_title (List[List[int]], optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
2232
- min_distance_config (List[float], optional): Configuration for minimum distances [min_edge_distance_config, up_edge_distances_config, total_distance].
2233
- tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.0.
2234
-
2235
- Returns:
2236
- Tuple[float, List[float]]: A tuple containing:
2237
- - The calculated distance considering IOU and adjustments.
2238
- - The updated minimum distance configuration.
2239
- """
2240
-
2241
- x1, y1, x2, y2 = input_bbox
2242
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2243
-
2244
- min_edge_distance_config, up_edge_distances_config, total_distance = (
2245
- min_distance_config
2246
- )
2247
-
2248
- iou_distance = 0
2249
-
2250
- if label in vision_labels:
2251
- horizontal1 = horizontal2 = True
691
+ if seg_start_coordinate - context_left_coordinate < 10:
692
+ seg_start_flag = False
693
+
694
+ if context_right_coordinate - seg_end_coordinate < 10:
695
+ seg_end_flag = False
696
+
697
+ return seg_start_flag, seg_end_flag
698
+
699
+
700
+ def get_show_color(label: str, order_label=False) -> Tuple:
701
+ if order_label:
702
+ label_colors = {
703
+ "doc_title": (255, 248, 220, 100), # Cornsilk
704
+ "doc_title_text": (255, 239, 213, 100),
705
+ "paragraph_title": (102, 102, 255, 100),
706
+ "sub_paragraph_title": (102, 178, 255, 100),
707
+ "vision": (153, 255, 51, 100),
708
+ "vision_title": (144, 238, 144, 100), # Light Green
709
+ "vision_footnote": (144, 238, 144, 100), # Light Green
710
+ "normal_text": (153, 0, 76, 100),
711
+ "cross_layout": (53, 218, 207, 100), # Thistle
712
+ "cross_reference": (221, 160, 221, 100), # Floral White
713
+ }
2252
714
  else:
2253
- horizontal1 = _get_bbox_direction(input_bbox)
2254
- horizontal2 = _get_bbox_direction(match_bbox, 3)
2255
-
2256
- if (
2257
- horizontal1 != horizontal2
2258
- or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
2259
- ):
2260
- iou_distance = 1
2261
-
2262
- if label == "doc_title":
2263
- # Calculate distance for titles
2264
- disperse = max(1, median_width)
2265
- tolerance_len = max(tolerance_len, disperse)
2266
-
2267
- # Adjust input_bbox based on sub_title
2268
- if sub_title:
2269
- for sub in sub_title:
2270
- x1_, y1_, x2_, y2_ = sub
2271
- x1, y1, x2, y2 = (
2272
- min(x1, x1_),
2273
- min(y1, y1_),
2274
- min(x2, x2_),
2275
- max(y2, y2_),
2276
- )
2277
- input_bbox = [x1, y1, x2, y2]
2278
-
2279
- if title_text:
2280
- for sub in title_text:
2281
- x1_, y1_, x2_, y2_ = sub[1]
2282
- if horizontal1:
2283
- x1, y1, x2, y2 = (
2284
- min(x1, x1_),
2285
- min(y1, y1_),
2286
- min(x2, x2_),
2287
- max(y2, y2_),
2288
- )
2289
- else:
2290
- x1, y1, x2, y2 = (
2291
- min(x1, x1_),
2292
- min(y1, y1_),
2293
- max(x2, x2_),
2294
- min(y2, y2_),
2295
- )
2296
- input_bbox = [x1, y1, x2, y2]
2297
-
2298
- # Calculate edge distance
2299
- weight = _get_weights(label, horizontal1)
2300
- if label == "abstract":
2301
- tolerance_len *= 2
2302
-
2303
- edge_distance, edge_distance_config = _nearest_edge_distance(
2304
- input_bbox,
2305
- match_bbox,
2306
- weight,
2307
- label=label,
2308
- no_mask_labels=no_mask_labels,
2309
- min_edge_distance_config=min_edge_distance_config,
2310
- tolerance_len=tolerance_len,
2311
- )
2312
-
2313
- # Weights for combining distances
2314
- iou_edge_weight = [10**8, 10**4, 1, 0.0001]
2315
-
2316
- # Calculate up and left edge distances
2317
- up_edge_distance = y1_prime
2318
- left_edge_distance = x1_prime
2319
- if (
2320
- label in no_mask_labels or label in title_labels or label in vision_labels
2321
- ) and y1 > y2_prime:
2322
- up_edge_distance = -y2_prime
2323
- left_edge_distance = -x2_prime
2324
-
2325
- min_up_edge_distance = up_edge_distances_config
2326
- if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
2327
- up_edge_distance = min_up_edge_distance
2328
-
2329
- # Calculate total distance
2330
- distance = (
2331
- iou_distance * iou_edge_weight[0]
2332
- + edge_distance * iou_edge_weight[1]
2333
- + up_edge_distance * iou_edge_weight[2]
2334
- + left_edge_distance * iou_edge_weight[3]
2335
- )
2336
-
2337
- # Update minimum distance configuration if a smaller distance is found
2338
- if total_distance > distance:
2339
- edge_distance_config = [
2340
- edge_distance_config[0],
2341
- edge_distance_config[1],
2342
- ]
2343
- min_distance_config = [
2344
- edge_distance_config,
2345
- up_edge_distance,
2346
- distance,
2347
- ]
2348
-
2349
- return distance, min_distance_config
2350
-
2351
-
2352
- def get_show_color(label: str) -> Tuple:
2353
- label_colors = {
2354
- # Medium Blue (from 'titles_list')
2355
- "paragraph_title": (102, 102, 255, 100),
2356
- "doc_title": (255, 248, 220, 100), # Cornsilk
2357
- # Light Yellow (from 'tables_caption_list')
2358
- "table_title": (255, 255, 102, 100),
2359
- # Sky Blue (from 'imgs_caption_list')
2360
- "figure_title": (102, 178, 255, 100),
2361
- "chart_title": (221, 160, 221, 100), # Plum
2362
- "vision_footnote": (144, 238, 144, 100), # Light Green
2363
- # Deep Purple (from 'texts_list')
2364
- "text": (153, 0, 76, 100),
2365
- # Bright Green (from 'interequations_list')
2366
- "formula": (0, 255, 0, 100),
2367
- "abstract": (255, 239, 213, 100), # Papaya Whip
2368
- # Medium Green (from 'lists_list' and 'indexs_list')
2369
- "content": (40, 169, 92, 100),
2370
- # Neutral Gray (from 'dropped_bbox_list')
2371
- "seal": (158, 158, 158, 100),
2372
- # Olive Yellow (from 'tables_body_list')
2373
- "table": (204, 204, 0, 100),
2374
- # Bright Green (from 'imgs_body_list')
2375
- "image": (153, 255, 51, 100),
2376
- # Bright Green (from 'imgs_body_list')
2377
- "figure": (153, 255, 51, 100),
2378
- "chart": (216, 191, 216, 100), # Thistle
2379
- # Pale Yellow-Green (from 'tables_footnote_list')
2380
- "reference": (229, 255, 204, 100),
2381
- "algorithm": (255, 250, 240, 100), # Floral White
2382
- }
715
+ label_colors = {
716
+ # Medium Blue (from 'titles_list')
717
+ "paragraph_title": (102, 102, 255, 100),
718
+ "doc_title": (255, 248, 220, 100), # Cornsilk
719
+ # Light Yellow (from 'tables_caption_list')
720
+ "table_title": (255, 255, 102, 100),
721
+ # Sky Blue (from 'imgs_caption_list')
722
+ "figure_title": (102, 178, 255, 100),
723
+ "chart_title": (221, 160, 221, 100), # Plum
724
+ "vision_footnote": (144, 238, 144, 100), # Light Green
725
+ # Deep Purple (from 'texts_list')
726
+ "text": (153, 0, 76, 100),
727
+ # Bright Green (from 'interequations_list')
728
+ "formula": (0, 255, 0, 100),
729
+ "abstract": (255, 239, 213, 100), # Papaya Whip
730
+ # Medium Green (from 'lists_list' and 'indexs_list')
731
+ "content": (40, 169, 92, 100),
732
+ # Neutral Gray (from 'dropped_bbox_list')
733
+ "seal": (158, 158, 158, 100),
734
+ # Olive Yellow (from 'tables_body_list')
735
+ "table": (204, 204, 0, 100),
736
+ # Bright Green (from 'imgs_body_list')
737
+ "image": (153, 255, 51, 100),
738
+ # Bright Green (from 'imgs_body_list')
739
+ "figure": (153, 255, 51, 100),
740
+ "chart": (216, 191, 216, 100), # Thistle
741
+ # Pale Yellow-Green (from 'tables_footnote_list')
742
+ "reference": (229, 255, 204, 100),
743
+ # "reference_content": (229, 255, 204, 100),
744
+ "algorithm": (255, 250, 240, 100), # Floral White
745
+ }
2383
746
  default_color = (158, 158, 158, 100)
2384
747
  return label_colors.get(label, default_color)