paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +29 -73
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/ts/funcs.py +19 -8
  44. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  45. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  46. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  48. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  49. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  51. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  52. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  53. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  54. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  57. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  58. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  59. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  60. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  61. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  63. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  64. paddlex/inference/models/formula_recognition/predictor.py +8 -2
  65. paddlex/inference/models/formula_recognition/processors.py +90 -77
  66. paddlex/inference/models/formula_recognition/result.py +28 -27
  67. paddlex/inference/models/image_feature/processors.py +3 -4
  68. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  69. paddlex/inference/models/object_detection/predictor.py +2 -0
  70. paddlex/inference/models/object_detection/processors.py +28 -3
  71. paddlex/inference/models/object_detection/utils.py +2 -0
  72. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  73. paddlex/inference/models/text_detection/predictor.py +8 -0
  74. paddlex/inference/models/text_detection/processors.py +44 -10
  75. paddlex/inference/models/text_detection/result.py +0 -10
  76. paddlex/inference/models/text_recognition/result.py +1 -1
  77. paddlex/inference/pipelines/__init__.py +9 -5
  78. paddlex/inference/pipelines/_parallel.py +172 -0
  79. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  80. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  81. paddlex/inference/pipelines/base.py +14 -4
  82. paddlex/inference/pipelines/components/faisser.py +1 -1
  83. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  84. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  85. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  86. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  87. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  88. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  89. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  90. paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
  91. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  92. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
  93. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  94. paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
  95. paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
  96. paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
  97. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  98. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
  99. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
  100. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  101. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  102. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  103. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  104. paddlex/inference/pipelines/ocr/result.py +21 -18
  105. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  106. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  107. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  108. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  109. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
  110. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  112. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  113. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  114. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  115. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  116. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  117. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  118. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  119. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  120. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  121. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  122. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  123. paddlex/inference/serving/basic_serving/_app.py +46 -13
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  127. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  128. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  129. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  130. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  131. paddlex/inference/serving/infra/utils.py +20 -22
  132. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  133. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  134. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  135. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  136. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  137. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  138. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  139. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  140. paddlex/inference/utils/hpi.py +30 -16
  141. paddlex/inference/utils/hpi_model_info_collection.json +666 -162
  142. paddlex/inference/utils/io/readers.py +12 -12
  143. paddlex/inference/utils/misc.py +20 -0
  144. paddlex/inference/utils/mkldnn_blocklist.py +59 -0
  145. paddlex/inference/utils/official_models.py +140 -5
  146. paddlex/inference/utils/pp_option.py +74 -9
  147. paddlex/model.py +2 -2
  148. paddlex/modules/__init__.py +1 -1
  149. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  150. paddlex/modules/base/__init__.py +1 -1
  151. paddlex/modules/base/evaluator.py +5 -5
  152. paddlex/modules/base/trainer.py +1 -1
  153. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  154. paddlex/modules/doc_vlm/evaluator.py +2 -2
  155. paddlex/modules/doc_vlm/exportor.py +2 -2
  156. paddlex/modules/doc_vlm/model_list.py +1 -1
  157. paddlex/modules/doc_vlm/trainer.py +2 -2
  158. paddlex/modules/face_recognition/evaluator.py +2 -2
  159. paddlex/modules/formula_recognition/evaluator.py +5 -2
  160. paddlex/modules/formula_recognition/model_list.py +3 -0
  161. paddlex/modules/formula_recognition/trainer.py +3 -0
  162. paddlex/modules/general_recognition/evaluator.py +1 -1
  163. paddlex/modules/image_classification/evaluator.py +2 -2
  164. paddlex/modules/image_classification/model_list.py +1 -0
  165. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  166. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  167. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  168. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  169. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  170. paddlex/modules/object_detection/evaluator.py +2 -2
  171. paddlex/modules/object_detection/model_list.py +2 -0
  172. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
  173. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  174. paddlex/modules/table_recognition/evaluator.py +2 -2
  175. paddlex/modules/text_detection/evaluator.py +2 -2
  176. paddlex/modules/text_detection/model_list.py +2 -0
  177. paddlex/modules/text_recognition/evaluator.py +2 -2
  178. paddlex/modules/text_recognition/model_list.py +2 -0
  179. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  180. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  181. paddlex/modules/ts_classification/evaluator.py +2 -2
  182. paddlex/modules/ts_forecast/evaluator.py +2 -2
  183. paddlex/modules/video_classification/evaluator.py +2 -2
  184. paddlex/modules/video_detection/evaluator.py +2 -2
  185. paddlex/ops/__init__.py +8 -5
  186. paddlex/paddlex_cli.py +19 -13
  187. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  188. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  189. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  190. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  191. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  192. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  193. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  194. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  195. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  196. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  197. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  198. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  200. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  201. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  202. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  203. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  204. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  205. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  206. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  207. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  208. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  209. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  210. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  211. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  212. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  213. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  214. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  215. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  216. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  217. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  218. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  219. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  220. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  221. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  222. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  223. paddlex/repo_apis/base/config.py +1 -1
  224. paddlex/repo_manager/core.py +3 -3
  225. paddlex/repo_manager/meta.py +6 -2
  226. paddlex/repo_manager/repo.py +17 -16
  227. paddlex/utils/custom_device_list.py +26 -2
  228. paddlex/utils/deps.py +3 -3
  229. paddlex/utils/device.py +5 -13
  230. paddlex/utils/env.py +4 -0
  231. paddlex/utils/flags.py +11 -4
  232. paddlex/utils/fonts/__init__.py +34 -4
  233. paddlex/utils/misc.py +1 -1
  234. paddlex/utils/subclass_register.py +2 -2
  235. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
  236. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
  237. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
  238. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
  239. {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
  240. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1199 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Tuple
16
+
17
+ import numpy as np
18
+
19
+ from ..layout_objects import LayoutBlock, LayoutRegion
20
+ from ..setting import BLOCK_LABEL_MAP, XYCUT_SETTINGS
21
+ from ..utils import (
22
+ calculate_overlap_ratio,
23
+ calculate_projection_overlap_ratio,
24
+ get_seg_flag,
25
+ )
26
+
27
+
28
def get_nearest_edge_distance(
    bbox1: List[int],
    bbox2: List[int],
    weight: List[float] = [1.0, 1.0, 1.0, 1.0],
) -> float:
    """
    Calculate the weighted nearest-edge distance between two bounding boxes.

    The distance is the sum of a horizontal and a vertical component. A
    component is only counted when the corresponding projection overlap
    ratio reported by ``calculate_projection_overlap_ratio`` is exactly 0;
    if both ratios are positive the boxes are treated as touching and the
    distance is 0.0.

    Args:
        bbox1 (list): Bounding box ``[x1, y1, x2, y2]`` of the input object.
        bbox2 (list): Bounding box ``[x1', y1', x2', y2']`` to match against.
        weight (list, optional): Four multipliers applied to the raw gaps.
            ``weight[0]`` scales the horizontal gap when ``x2 < x1'``
            (bbox1 ends before bbox2 starts), ``weight[1]`` otherwise;
            ``weight[2]`` scales the vertical gap when ``y2 < y1'``,
            ``weight[3]`` otherwise. Defaults to ``[1.0, 1.0, 1.0, 1.0]``.
            The default list is shared between calls; it is only read here.

    Returns:
        float: The weighted sum of the horizontal and vertical edge gaps
        (a single number, not a tuple).
    """
    x1, y1, x2, y2 = bbox1
    x1_prime, y1_prime, x2_prime, y2_prime = bbox2

    horizontal_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "horizontal")
    vertical_iou = calculate_projection_overlap_ratio(bbox1, bbox2, "vertical")

    # Overlap on both projections: the boxes intersect, so there is no gap.
    if horizontal_iou > 0 and vertical_iou > 0:
        return 0.0

    min_x_distance, min_y_distance = 0, 0
    if horizontal_iou == 0:
        # Smallest gap between facing vertical edges, scaled by the weight
        # for the side on which bbox2 lies.
        min_x_distance = min(abs(x1 - x2_prime), abs(x2 - x1_prime)) * (
            weight[0] if x2 < x1_prime else weight[1]
        )
    if vertical_iou == 0:
        # Same idea for the facing horizontal edges.
        min_y_distance = min(abs(y1 - y2_prime), abs(y2 - y1_prime)) * (
            weight[2] if y2 < y1_prime else weight[3]
        )

    return min_x_distance + min_y_distance
61
+
62
+
63
def projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
    """
    Build a 1D coverage histogram of bounding boxes along one axis.

    Args:
        boxes: A (N, 4) array of boxes given as [x_min, y_min, x_max, y_max].
        axis: 0 to project onto the x-axis, 1 to project onto the y-axis.

    Returns:
        A 1D integer array in which entry ``i`` counts the boxes whose
        ``[start, end)`` interval on the chosen axis contains ``i``.

    Notes:
        If the smallest coordinate on the chosen axis is negative, the
        histogram length is the absolute value of that minimum; otherwise it
        is the maximum coordinate. Interval bounds are always taken as
        absolute values before indexing.
    """
    assert axis in [0, 1]

    # Columns (axis, axis + 2) hold the min/max pair for the chosen axis.
    spans = boxes[:, axis::2]
    lowest = np.min(spans)
    length = abs(lowest) if lowest < 0 else np.max(spans)

    histogram = np.zeros(length, dtype=int)

    # Each box contributes one count to every position it covers.
    for lo, hi in abs(spans):
        histogram[lo:hi] += 1

    return histogram
90
+
91
+
92
def split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
    """
    Split a projection profile into segments separated by sufficiently wide gaps.

    Args:
        arr_values: 1D array representing the projection profile.
        min_value: Positions whose value is strictly greater than this
            threshold are considered significant.
        min_gap: Two consecutive significant positions belong to different
            segments when their index difference is strictly greater than
            this value.

    Returns:
        ``None`` when no position exceeds ``min_value``; otherwise a tuple
        ``(segment_starts, segment_ends)`` of equally long index arrays.
        Every segment is the half-open range ``[start, end)``, so
        ``end > start`` always holds, including for segments made of a
        single significant position.
    """
    # Indices where the projection exceeds the minimum value.
    significant_indices = np.where(arr_values > min_value)[0]
    if not len(significant_indices):
        return None

    # A jump larger than min_gap between neighbours marks a segment boundary.
    index_diffs = np.diff(significant_indices)
    gap_indices = np.where(index_diffs > min_gap)[0]

    # A segment starts at the first significant index and right after each gap.
    segment_starts = np.insert(
        significant_indices[gap_indices + 1],
        0,
        significant_indices[0],
    )
    # A segment ends one past its last significant index. The +1 is applied
    # to every end (not only the final one) so that a one-position segment
    # in the middle of the profile does not collapse to an empty range.
    segment_ends = (
        np.append(
            significant_indices[gap_indices],
            significant_indices[-1],
        )
        + 1
    )

    return segment_starts, segment_ends
125
+
126
+
127
def recursive_yx_cut(
    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
):
    """
    Recursively order boxes by cutting along the Y-axis first, then the X-axis.

    Args:
        boxes: A (N, 4) array of bounding boxes [x_min, y_min, x_max, y_max].
        indices: Original positions of the boxes; same length as ``boxes``.
        res: Output list, extended in place with indices in reading order.
        min_gap (int): Minimum gap used when splitting the X-axis projection
            of each horizontal band. Defaults to 1. The Y-axis split always
            uses a gap of 1, and nested recursive calls fall back to the
            default value.

    Returns:
        None: This function modifies the `res` list in place.
    """
    assert len(boxes) == len(
        indices
    ), "The length of boxes and indices must be the same."

    # Arrange boxes and their original indices by ascending y_min.
    order_by_y = boxes[:, 1].argsort()
    boxes_by_y = boxes[order_by_y]
    ids_by_y = np.array(indices)[order_by_y]

    # Split the page into horizontal bands using the Y-axis projection.
    row_bands = split_projection_profile(
        projection_by_bboxes(boxes=boxes_by_y, axis=1), 0, 1
    )
    if not row_bands:
        return

    tops = boxes_by_y[:, 1]
    for band_top, band_bottom in zip(*row_bands):
        # Boxes whose y_min falls inside the current band.
        in_band = (band_top <= tops) & (tops < band_bottom)
        band_boxes = boxes_by_y[in_band]
        band_ids = ids_by_y[in_band]

        # Within the band, arrange by ascending x_min.
        order_by_x = band_boxes[:, 0].argsort()
        band_boxes = band_boxes[order_by_x]
        band_ids = band_ids[order_by_x]

        # Try to split the band into columns using the X-axis projection.
        col_bands = split_projection_profile(
            projection_by_bboxes(boxes=band_boxes, axis=0), 0, min_gap
        )
        if not col_bands:
            continue

        # A single column cannot be cut further: emit it as is.
        if len(col_bands[0]) == 1:
            res.extend(band_ids)
            continue

        lefts = band_boxes[:, 0]
        # Negative x_min values: walk the columns in reverse order.
        # NOTE(review): presumably callers negate x for right-to-left layouts — confirm.
        if np.min(lefts) < 0:
            col_bands = np.flip(col_bands, axis=1)

        for col_left, col_right in zip(*col_bands):
            in_col = (col_left <= abs(lefts)) & (abs(lefts) < col_right)
            recursive_yx_cut(band_boxes[in_col], band_ids[in_col], res)
196
+
197
+
198
def recursive_xy_cut(
    boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
):
    """
    Recursively order boxes by cutting along the X-axis first, then the Y-axis.

    Args:
        boxes: A (N, 4) array of bounding boxes [x_min, y_min, x_max, y_max].
        indices: Original positions of the boxes; same length as ``boxes``.
        res: Output list, extended in place with indices in reading order.
        min_gap (int): Minimum gap used when splitting the Y-axis projection
            of each vertical column. Defaults to 1. The X-axis split always
            uses a gap of 1, and nested recursive calls fall back to the
            default value.

    Returns:
        None: This function modifies the `res` list in place.
    """
    assert len(boxes) == len(
        indices
    ), "The length of boxes and indices must be the same."

    # Arrange boxes and their original indices by ascending x_min.
    order_by_x = boxes[:, 0].argsort()
    boxes_by_x = boxes[order_by_x]
    ids_by_x = np.array(indices)[order_by_x]

    # Split into vertical columns using the X-axis projection.
    col_bands = split_projection_profile(
        projection_by_bboxes(boxes=boxes_by_x, axis=0), 0, 1
    )
    if not col_bands:
        return

    lefts = boxes_by_x[:, 0]
    # Negative x_min values: walk the columns in reverse order.
    # NOTE(review): presumably callers negate x for right-to-left layouts — confirm.
    if np.min(lefts) < 0:
        col_bands = np.flip(col_bands, axis=1)

    for col_left, col_right in zip(*col_bands):
        # Boxes whose |x_min| falls inside the current column.
        in_col = (col_left <= abs(lefts)) & (abs(lefts) < col_right)
        col_boxes = boxes_by_x[in_col]
        col_ids = ids_by_x[in_col]

        # Within the column, arrange by ascending y_min.
        order_by_y = col_boxes[:, 1].argsort()
        col_boxes = col_boxes[order_by_y]
        col_ids = col_ids[order_by_y]

        # Try to split the column into rows using the Y-axis projection.
        row_bands = split_projection_profile(
            projection_by_bboxes(boxes=col_boxes, axis=1), 0, min_gap
        )
        if not row_bands:
            continue

        # A single row cannot be cut further: emit it as is.
        if len(row_bands[0]) == 1:
            res.extend(col_ids)
            continue

        tops = col_boxes[:, 1]
        for band_top, band_bottom in zip(*row_bands):
            in_band = (band_top <= tops) & (tops < band_bottom)
            recursive_xy_cut(col_boxes[in_band], col_ids[in_band], res)
268
+
269
+
270
def reference_insert(
    block: LayoutBlock,
    sorted_blocks: List[LayoutBlock],
    **kwargs,
):
    """
    Insert a reference block right after the best-matching block above it.

    Only sorted blocks whose bottom edge (``bbox[3]``) is at or above the top
    edge (``bbox[1]``) of ``block`` are candidates. Among them the one with
    the largest ``bbox[2] * 10 + bbox[3]`` wins (the first such block on
    ties). When there is no candidate, index 0 is used as the anchor.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The already ordered blocks; modified in place.
        **kwargs: Accepted but not used by this function.

    Returns:
        sorted_blocks: The same list object, with ``block`` inserted directly
        after the anchor position.
    """
    anchor_idx = 0
    best_score = float("inf")
    block_top = block.bbox[1]

    for idx, candidate in enumerate(sorted_blocks):
        # Skip anything that is not entirely above the block.
        if not candidate.bbox[3] <= block_top:
            continue
        # Lower score means further right / further down.
        score = -(candidate.bbox[2] * 10 + candidate.bbox[3])
        if score < best_score:
            best_score, anchor_idx = score, idx

    sorted_blocks.insert(anchor_idx + 1, block)
    return sorted_blocks
298
+
299
+
300
def manhattan_insert(
    block: LayoutBlock,
    sorted_blocks: List[LayoutBlock],
    **kwargs,
):
    """
    Insert a block right after the sorted block nearest to it.

    Nearness is measured by ``_manhattan_distance`` on the two bounding
    boxes; the first block with the smallest distance wins. With an empty
    list the block simply becomes the only element.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The already ordered blocks; modified in place.
        **kwargs: Accepted but not used by this function.

    Returns:
        sorted_blocks: The same list object, with ``block`` inserted directly
        after its nearest neighbour.
    """
    # Distance from the new block to every block already placed.
    distances = [
        _manhattan_distance(block.bbox, candidate.bbox)
        for candidate in sorted_blocks
    ]

    nearest_idx = 0
    smallest = float("inf")
    for idx, dist in enumerate(distances):
        if dist < smallest:
            smallest, nearest_idx = dist, idx

    sorted_blocks.insert(nearest_idx + 1, block)
    return sorted_blocks
327
+
328
+
329
def euclidean_insert(
    block: LayoutRegion,
    sorted_blocks: List[LayoutRegion],
    **kwargs,
):
    """
    Insert a block in front of the first sorted block with a larger distance.

    The comparison uses the ``euclidean_distance`` attribute of each object.
    If no sorted block has a strictly larger value, the block is appended at
    the end.

    Args:
        block: The block to insert into the sorted blocks.
        sorted_blocks: The already ordered blocks; modified in place.
        **kwargs: Accepted but not used by this function.

    Returns:
        sorted_blocks: The same list object, with ``block`` inserted.
    """
    fallback_idx = len(sorted_blocks)
    target = block.euclidean_distance

    # Index of the first element that lies strictly further away, if any.
    insert_at = next(
        (
            idx
            for idx, candidate in enumerate(sorted_blocks)
            if candidate.euclidean_distance > target
        ),
        fallback_idx,
    )

    sorted_blocks.insert(insert_at, block)
    return sorted_blocks
355
+
356
+
357
+ def weighted_distance_insert(
358
+ block: LayoutBlock,
359
+ sorted_blocks: List[LayoutBlock],
360
+ region: LayoutRegion,
361
+ ):
362
+ """
363
+ Insert a block into a sorted list of blocks based on the weighted distance between the block and the nearest sorted block.
364
+
365
+ Args:
366
+ block: The block to insert into the sorted blocks.
367
+ sorted_blocks: The sorted blocks where the new block will be inserted.
368
+ config: Configuration dictionary containing parameters related to the layout parsing.
369
+ median_width: Median width of the document. Defaults to 0.0.
370
+
371
+ Returns:
372
+ sorted_blocks: The updated sorted blocks after insertion.
373
+ """
374
+
375
+ tolerance_len = XYCUT_SETTINGS["edge_distance_compare_tolerance_len"]
376
+ x1, y1, x2, y2 = block.bbox
377
+ min_weighted_distance, min_edge_distance, min_up_edge_distance = (
378
+ float("inf"),
379
+ float("inf"),
380
+ float("inf"),
381
+ )
382
+ nearest_sorted_block_index = 0
383
+ for sorted_block_idx, sorted_block in enumerate(sorted_blocks):
384
+
385
+ x1_prime, y1_prime, x2_prime, y2_prime = sorted_block.bbox
386
+
387
+ # Calculate edge distance
388
+ weight = _get_weights(block.order_label, block.direction)
389
+ edge_distance = get_nearest_edge_distance(block.bbox, sorted_block.bbox, weight)
390
+
391
+ if block.label in BLOCK_LABEL_MAP["doc_title_labels"]:
392
+ disperse = max(1, region.text_line_width)
393
+ tolerance_len = max(tolerance_len, disperse)
394
+ if block.label == "abstract":
395
+ tolerance_len *= 2
396
+ edge_distance = max(0.1, edge_distance) * 10
397
+
398
+ # Calculate up edge distances
399
+ up_edge_distance = y1_prime if region.direction == "horizontal" else -x2_prime
400
+ left_edge_distance = x1_prime if region.direction == "horizontal" else y1_prime
401
+ is_below_sorted_block = (
402
+ y2_prime < y1 if region.direction == "horizontal" else x1_prime > x2
403
+ )
404
+
405
+ if (
406
+ block.label not in BLOCK_LABEL_MAP["unordered_labels"]
407
+ or block.label in BLOCK_LABEL_MAP["doc_title_labels"]
408
+ or block.label in BLOCK_LABEL_MAP["paragraph_title_labels"]
409
+ or block.label in BLOCK_LABEL_MAP["vision_labels"]
410
+ ) and is_below_sorted_block:
411
+ up_edge_distance = -up_edge_distance
412
+ left_edge_distance = -left_edge_distance
413
+
414
+ if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
415
+ up_edge_distance = min_up_edge_distance
416
+
417
+ # Calculate weighted distance
418
+ weighted_distance = (
419
+ +edge_distance
420
+ * XYCUT_SETTINGS["distance_weight_map"].get("edge_weight", 10**4)
421
+ + up_edge_distance
422
+ * XYCUT_SETTINGS["distance_weight_map"].get("up_edge_weight", 1)
423
+ + left_edge_distance
424
+ * XYCUT_SETTINGS["distance_weight_map"].get("left_edge_weight", 0.0001)
425
+ )
426
+
427
+ min_edge_distance = min(edge_distance, min_edge_distance)
428
+ min_up_edge_distance = min(up_edge_distance, min_up_edge_distance)
429
+
430
+ if weighted_distance < min_weighted_distance:
431
+ nearest_sorted_block_index = sorted_block_idx
432
+ min_weighted_distance = weighted_distance
433
+ if abs(y1 // 2 - y1_prime // 2) > 0:
434
+ sorted_distance = y1_prime
435
+ block_distance = y1
436
+ else:
437
+ if region.direction == "horizontal":
438
+ if abs(x1 // 2 - x2 // 2) > 0:
439
+ sorted_distance = x1_prime
440
+ block_distance = x1
441
+ else:
442
+ # distance with (0,0)
443
+ sorted_block_center_x, sorted_block_center_y = (
444
+ sorted_block.get_centroid()
445
+ )
446
+ block_center_x, block_center_y = block.get_centroid()
447
+ sorted_distance = (
448
+ sorted_block_center_x**2 + sorted_block_center_y**2
449
+ )
450
+ block_distance = block_center_x**2 + block_center_y**2
451
+ else:
452
+ if abs(x1 - x2) > 0:
453
+ sorted_distance = -x2_prime
454
+ block_distance = -x2
455
+ else:
456
+ # distance with (max,0)
457
+ sorted_block_center_x, sorted_block_center_y = (
458
+ sorted_block.get_centroid()
459
+ )
460
+ block_center_x, block_center_y = block.get_centroid()
461
+ sorted_distance = (
462
+ sorted_block_center_x**2 + sorted_block_center_y**2
463
+ )
464
+ block_distance = block_center_x**2 + block_center_y**2
465
+ if block_distance > sorted_distance:
466
+ nearest_sorted_block_index = sorted_block_idx + 1
467
+ if (
468
+ sorted_block_idx < len(sorted_blocks) - 1
469
+ and block.label
470
+ in BLOCK_LABEL_MAP["vision_labels"]
471
+ + BLOCK_LABEL_MAP["vision_title_labels"]
472
+ ):
473
+ seg_start_flag, _ = get_seg_flag(
474
+ sorted_blocks[sorted_block_idx + 1],
475
+ sorted_blocks[sorted_block_idx],
476
+ )
477
+ if not seg_start_flag:
478
+ nearest_sorted_block_index += 1
479
+ else:
480
+ if (
481
+ sorted_block_idx > 0
482
+ and block.label
483
+ in BLOCK_LABEL_MAP["vision_labels"]
484
+ + BLOCK_LABEL_MAP["vision_title_labels"]
485
+ ):
486
+ seg_start_flag, _ = get_seg_flag(
487
+ sorted_blocks[sorted_block_idx],
488
+ sorted_blocks[sorted_block_idx - 1],
489
+ )
490
+ if not seg_start_flag:
491
+ nearest_sorted_block_index = sorted_block_idx - 1
492
+
493
+ sorted_blocks.insert(nearest_sorted_block_index, block)
494
+ return sorted_blocks
495
+
496
+
497
def insert_child_blocks(
    block: LayoutBlock,
    block_idx: int,
    sorted_blocks: List[LayoutBlock],
) -> List[LayoutBlock]:
    """
    Expand a parent block inside ``sorted_blocks`` into itself plus its children.

    When ``block.child_blocks`` is empty the list is returned untouched.
    Otherwise the parent is appended to the list returned by
    ``block.get_child_blocks()``, the combined list is ordered with
    ``sort_child_blocks`` (using the direction of its first element before
    sorting), the first sorted element overwrites position ``block_idx`` and
    the remaining elements are inserted right after it.

    Args:
        block: The parent block whose children should be placed in the list.
        block_idx: Position of the parent block inside ``sorted_blocks``.
        sorted_blocks: The ordered list to update in place.

    Returns:
        sorted_blocks: The same list object, after the insertion.
    """
    if not block.child_blocks:
        return sorted_blocks

    family = block.get_child_blocks()
    family.append(block)
    family = sort_child_blocks(family, family[0].direction)

    leader, *followers = family
    # The parent's slot is reused for the first element; the rest follow it.
    sorted_blocks[block_idx] = leader
    for offset, member in enumerate(followers, start=1):
        sorted_blocks.insert(block_idx + offset, member)
    return sorted_blocks
522
+
523
+
524
def sort_child_blocks(
    blocks: List[LayoutBlock], direction="horizontal"
) -> List[LayoutBlock]:
    """
    Sort child blocks in place based on their bounding box coordinates.

    Args:
        blocks: A list of LayoutBlock objects representing the child blocks.
            An empty list is returned unchanged.
        direction: direction of the blocks ('horizontal' or 'vertical').
            Default is 'horizontal'. Any value other than 'horizontal' is
            treated as vertical.

    Returns:
        sorted_blocks: The same list object, sorted.
            - If the first block's label is "region", blocks are ordered by
              their ``euclidean_distance`` attribute.
            - Horizontal: by (y_min, x_min, cx**2 + cy**2).
            - Otherwise: by (-x_max, y_min, -(cx**2) + cy**2).
    """
    # Nothing to sort; also avoids IndexError on blocks[0] below.
    if not blocks:
        return blocks

    if blocks[0].label == "region":
        blocks.sort(key=lambda item: item.euclidean_distance)
        return blocks

    def _horizontal_key(item):
        centroid = item.get_centroid()
        # Tie-break on squared distance of the centroid from (0, 0).
        return (item.bbox[1], item.bbox[0], centroid[0] ** 2 + centroid[1] ** 2)

    def _vertical_key(item):
        centroid = item.get_centroid()
        # Unary minus applies after the power: -(cx**2) + cy**2, so a larger
        # centroid x sorts earlier among ties.
        return (-item.bbox[2], item.bbox[1], -(centroid[0] ** 2) + centroid[1] ** 2)

    blocks.sort(key=_horizontal_key if direction == "horizontal" else _vertical_key)
    return blocks
556
+
557
+
558
+ def _get_weights(label, direction="horizontal"):
559
+ """Define weights based on the label and direction."""
560
+ if label == "doc_title":
561
+ return (
562
+ [1, 0.1, 0.1, 1] if direction == "horizontal" else [0.2, 0.1, 1, 1]
563
+ ) # left-down , right-left
564
+ elif label in [
565
+ "paragraph_title",
566
+ "table_title",
567
+ "abstract",
568
+ "image",
569
+ "seal",
570
+ "chart",
571
+ "figure",
572
+ ]:
573
+ return [1, 1, 0.1, 1] # down
574
+ else:
575
+ return [1, 1, 1, 0.1] # up
576
+
577
+
578
+ def _manhattan_distance(
579
+ point1: Tuple[float, float],
580
+ point2: Tuple[float, float],
581
+ weight_x: float = 1.0,
582
+ weight_y: float = 1.0,
583
+ ) -> float:
584
+ """
585
+ Calculate the weighted Manhattan distance between two points.
586
+
587
+ Args:
588
+ point1 (Tuple[float, float]): The first point as (x, y).
589
+ point2 (Tuple[float, float]): The second point as (x, y).
590
+ weight_x (float): The weight for the x-axis distance. Default is 1.0.
591
+ weight_y (float): The weight for the y-axis distance. Default is 1.0.
592
+
593
+ Returns:
594
+ float: The weighted Manhattan distance between the two points.
595
+ """
596
+ return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
597
+
598
+
599
def sort_normal_blocks(
    blocks, text_line_height, text_line_width, region_direction
) -> List[LayoutBlock]:
    """Order blocks in place by their grid position within the page.

    Coordinates are bucketed with floor division by the text line size, so
    blocks whose edges fall in the same bucket compare equal on that key and
    are ordered by the next one.

    Args:
        blocks (List[LayoutBlock]): List of blocks to be sorted (mutated).
        text_line_height (int): Height of each line of text; must be non-zero.
        text_line_width (int): Width of each line of text; must be non-zero.
        region_direction (str): "horizontal" selects the top-to-bottom key;
            any other value selects the vertical key.

    Returns:
        List[LayoutBlock]: The same list object, sorted.
    """

    def _horizontal_key(item):
        centroid = item.get_centroid()
        return (
            item.bbox[1] // text_line_height,
            item.bbox[0] // text_line_width,
            centroid[0] ** 2 + centroid[1] ** 2,
        )

    def _vertical_key(item):
        centroid = item.get_centroid()
        # Both negations apply before // and after ** respectively:
        # (-x_max) // width and -(cx**2) + cy**2.
        return (
            -item.bbox[2] // text_line_width,
            item.bbox[1] // text_line_height,
            -centroid[0] ** 2 + centroid[1] ** 2,
        )

    chosen_key = _horizontal_key if region_direction == "horizontal" else _vertical_key
    blocks.sort(key=chosen_key)
    return blocks
630
+
631
+
632
def get_cut_blocks(blocks, cut_direction, cut_coordinates, mask_labels=None):
    """
    Group blocks into consecutive bands delimited by the cut coordinates.

    Blocks are sorted in place by their far edge (x_max for a "horizontal"
    cut direction, y_max otherwise) and then assigned, in that order, to the
    first cut coordinate that is not smaller than their far edge. A final
    implicit boundary at infinity collects everything beyond the last cut.

    Args:
        blocks (list): list of blocks to be cut. Sorted in place.
        cut_direction (str): cut direction, either "horizontal" or "vertical".
        cut_coordinates (list): list of cut coordinates. Not modified.
        mask_labels (list, optional): order labels to leave out of the
            result (such blocks are skipped but still consumed). Defaults to
            no masking.

    Returns:
        list: a list of non-empty groups, each a list of blocks, ordered by
        increasing cut coordinate.
    """
    masked = () if mask_labels is None else mask_labels

    # 0: horizontal, 1: vertical; +2 selects the max edge on that axis.
    far_edge = 2 if cut_direction == "horizontal" else 3
    blocks.sort(key=lambda item: item.bbox[far_edge])

    # Build a fresh, de-duplicated boundary list instead of appending the
    # sentinel to the caller's list.
    boundaries = sorted(set(cut_coordinates) | {float("inf")})

    groups = []
    cursor = 0
    for boundary in boundaries:
        group = []
        while cursor < len(blocks):
            candidate = blocks[cursor]
            if candidate.bbox[far_edge] > boundary:
                break
            if candidate.order_label not in masked:
                group.append(candidate)
            cursor += 1
        if group:
            groups.append(group)

    return groups
671
+
672
+
673
def get_blocks_by_direction_interval(
    blocks: List[LayoutBlock],
    start_index: int,
    end_index: int,
    direction: str = "horizontal",
) -> List[LayoutBlock]:
    """
    Select the blocks that lie entirely inside a coordinate interval.

    ``blocks`` is first sorted in place by the max edge on the chosen axis
    (x for "horizontal", y for anything else); the returned list keeps that
    order.

    Args:
        blocks (List[LayoutBlock]): A list of blocks. Sorted in place.
        start_index (int): Lower bound of the interval (inclusive).
        end_index (int): Upper bound of the interval (inclusive).
        direction (str, optional): Axis selector. Defaults to "horizontal".

    Returns:
        List[LayoutBlock]: Blocks whose min edge is >= ``start_index`` and
        whose max edge is <= ``end_index`` on the chosen axis.
    """
    low = 0 if direction == "horizontal" else 1
    high = low + 2
    blocks.sort(key=lambda item: item.bbox[high])
    return [
        item
        for item in blocks
        if item.bbox[low] >= start_index and item.bbox[high] <= end_index
    ]
700
+
701
+
702
def get_nearest_blocks(
    block: LayoutBlock,
    ref_blocks: List[LayoutBlock],
    overlap_threshold,
    direction="horizontal",
) -> List:
    """
    Split the reference blocks that overlap ``block`` into preceding and following ones.

    A reference block is considered only if its ``index`` differs from
    ``block.index`` and ``calculate_projection_overlap_ratio`` (called with
    ``mode="small"``) exceeds ``overlap_threshold``. It is "previous" when its
    bbox start coordinate (bbox[1] for "horizontal", bbox[0] otherwise) is not
    greater than the block's, and "post" otherwise.

    Args:
        block (LayoutBlock): The current block.
        ref_blocks (List[LayoutBlock]): Candidate neighbouring blocks.
        overlap_threshold (float): Minimum projection overlap ratio (exclusive).
        direction (str): Direction passed to the overlap computation; also
            selects the coordinate used for ordering.

    Returns:
        A pair ``(prev_blocks, post_blocks)``: previous blocks sorted with the
        largest start coordinate first, following blocks sorted with the
        smallest start coordinate first.
    """
    axis = 1 if direction == "horizontal" else 0

    def _start(item):
        return item.bbox[axis]

    before = []
    after = []
    for candidate in ref_blocks:
        if candidate.index == block.index:
            continue
        ratio = calculate_projection_overlap_ratio(
            block.bbox, candidate.bbox, direction, mode="small"
        )
        if ratio > overlap_threshold:
            bucket = before if _start(candidate) <= _start(block) else after
            bucket.append(candidate)

    # Sorting an empty list is a no-op, so no emptiness checks are needed.
    before.sort(key=_start, reverse=True)
    after.sort(key=_start)

    return before, after
740
+
741
+
742
def update_doc_title_child_blocks(
    block: LayoutBlock,
    region: LayoutRegion,
) -> None:
    """
    Update the child blocks of a document title block.

    Candidates are the region's normal text blocks. Two passes are made.

    Pass 1 looks only at the nearest previous and nearest following block
    (as returned by ``get_nearest_blocks`` along ``block.direction``). Such a
    block becomes a child when all of the following hold:
        1. It has the same direction as the title block.
        2. Its label is one of the text labels.
        3. Its short side length is less than 80% of the title's short side length.
        4. Its long side length is either less than the title's long side
           length or greater than 150% of it.
           NOTE(review): the previous docstring said "less than 150%"; the
           code below is documented as written — confirm which is intended.
        5. It has fewer than 3 lines.
        6. The nearest edge distance to the title is less than 2 times the
           candidate's text line height.

    Pass 2 walks every candidate not already marked and adopts those with the
    same direction whose overlap ratio with the title (mode="small") exceeds 0.9.

    Adopted blocks get ``order_label = "doc_title_text"``, are appended to the
    title's child blocks and are removed from ``region.normal_text_block_idxes``.

    Args:
        block (LayoutBlock): document title block.
        region (LayoutRegion): region holding ``block_map`` and
            ``normal_text_block_idxes``; the latter is mutated.

    Returns:
        None

    """
    # Snapshot of the candidates; the index list itself is mutated below.
    ref_blocks = [region.block_map[idx] for idx in region.normal_text_block_idxes]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    prev_blocks, post_blocks = get_nearest_blocks(
        block, ref_blocks, overlap_threshold, block.direction
    )
    prev_block = None
    post_block = None

    # Only the closest neighbour on each side is considered in pass 1.
    if prev_blocks:
        prev_block = prev_blocks[0]
    if post_blocks:
        post_block = post_blocks[0]

    for ref_block in [prev_block, post_block]:
        if ref_block is None:
            continue
        with_seem_direction = ref_block.direction == block.direction

        short_side_length_condition = (
            ref_block.short_side_length < block.short_side_length * 0.8
        )

        long_side_length_condition = (
            ref_block.long_side_length < block.long_side_length
            or ref_block.long_side_length > 1.5 * block.long_side_length
        )

        nearest_edge_distance = get_nearest_edge_distance(block.bbox, ref_block.bbox)

        if (
            with_seem_direction
            and ref_block.label in BLOCK_LABEL_MAP["text_labels"]
            and short_side_length_condition
            and long_side_length_condition
            and ref_block.num_of_lines < 3
            and nearest_edge_distance < ref_block.text_line_height * 2
        ):
            ref_block.order_label = "doc_title_text"
            block.append_child_block(ref_block)
            region.normal_text_block_idxes.remove(ref_block.index)

    # Pass 2: adopt text blocks that are almost entirely covered by the title.
    for ref_block in ref_blocks:
        # Skip blocks already marked as title text (including pass-1 adoptions),
        # which also prevents a second remove() of the same index.
        if ref_block.order_label == "doc_title_text":
            continue
        with_seem_direction = ref_block.direction == block.direction

        overlap_ratio = calculate_overlap_ratio(
            block.bbox, ref_block.bbox, mode="small"
        )

        if overlap_ratio > 0.9 and with_seem_direction:
            ref_block.order_label = "doc_title_text"
            block.append_child_block(ref_block)
            region.normal_text_block_idxes.remove(ref_block.index)
822
+
823
+
824
def update_paragraph_title_child_blocks(
    block: LayoutBlock,
    region: LayoutRegion,
) -> None:
    """
    Attach neighbouring paragraph titles to a paragraph title block as sub-titles.

    Nothing happens if ``block`` is itself already a "sub_paragraph_title".
    Otherwise the region's paragraph-title and normal-text blocks are split by
    ``get_nearest_blocks`` into previous and following neighbours. Each side is
    walked from the nearest neighbour outwards and the walk on that side stops
    at the first block whose label is not a paragraph title label. A visited
    neighbour becomes a child when:
        1. It has the same direction as ``block``.
        2. Its ``start_coordinate`` differs from the block's by less than
           2 times the smaller of the two text line heights.
        3. The nearest edge distance is at most 1.5 times that smaller
           text line height.

    Adopted blocks get ``order_label = "sub_paragraph_title"``, are appended to
    the block's children and removed from ``region.paragraph_title_block_idxes``.

    Args:
        block (LayoutBlock): paragraph title block.
        region (LayoutRegion): region holding the block map and index lists;
            ``paragraph_title_block_idxes`` is mutated.

    Returns:
        None

    """
    if block.order_label == "sub_paragraph_title":
        return

    candidate_idxes = region.paragraph_title_block_idxes + region.normal_text_block_idxes
    candidates = [region.block_map[idx] for idx in candidate_idxes]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    prev_blocks, post_blocks = get_nearest_blocks(
        block, candidates, overlap_threshold, block.direction
    )

    for neighbours in (prev_blocks, post_blocks):
        for neighbour in neighbours:
            if neighbour.label not in BLOCK_LABEL_MAP["paragraph_title_labels"]:
                # A non-title block ends the run of adjacent titles on this side.
                break

            line_height = min(block.text_line_height, neighbour.text_line_height)
            edge_gap = get_nearest_edge_distance(block.bbox, neighbour.bbox)
            same_direction = neighbour.direction == block.direction
            aligned_start = (
                abs(neighbour.start_coordinate - block.start_coordinate)
                < line_height * 2
            )

            if not (same_direction and aligned_start and edge_gap <= line_height * 1.5):
                continue

            neighbour.order_label = "sub_paragraph_title"
            block.append_child_block(neighbour)
            region.paragraph_title_block_idxes.remove(neighbour.index)
880
+
881
+
882
def update_vision_child_blocks(
    block: LayoutBlock,
    region: LayoutRegion,
) -> None:
    """
    Update the child blocks of a vision block (attach its titles and footnote).

    Candidates are the region's normal text blocks and vision title blocks.
    Neighbours are looked up with ``get_nearest_blocks`` first along
    ``block.direction`` and then along ``block.secondary_direction``; the
    second direction is skipped once a vision title has been attached.

    - For a vision_title candidate:
        1. The nearest edge distance to the vision block must be at most
           2 times the candidate's text line height.
    - For a text candidate (at most one footnote is attached by the scans):
        1. It has the same direction as the vision block.
        2. Its long side length is less than the vision block's.
        3. The nearest edge distance is at most 2 times its text line height.
        4. And one of:
            a. its short side is shorter than the vision block's, its long
               side is less than 50% of the vision block's, and the x
               centres differ by less than 10;
            b. ``block.bbox[0] - ref_block.bbox[0] < 10`` and it has one line;
            c. ``block.bbox[2] - ref_block.bbox[2] < 10`` and it has one line.

    Finally, every candidate still listed in ``region.normal_text_block_idxes``
    whose overlap ratio with the vision block (mode="small") exceeds 0.9 is
    also attached as a footnote.

    Attached titles get ``order_label = "vision_title"`` and are removed from
    ``region.vision_title_block_idxes``; attached footnotes get
    ``order_label = "vision_footnote"`` and are removed from
    ``region.normal_text_block_idxes``.
    NOTE(review): footnotes found among the following blocks and in the final
    overlap pass also have ``label`` rewritten to "vision_footnote", while
    footnotes found among the previous blocks keep their label — confirm this
    asymmetry is intended.

    Args:
        block (LayoutBlock): vision block.
        region (LayoutRegion): region holding the block map and index lists;
            the index lists are mutated.

    Returns:
        None

    """
    # Snapshot of the candidates; the index lists themselves are mutated below.
    ref_blocks = [
        region.block_map[idx]
        for idx in region.normal_text_block_idxes + region.vision_title_block_idxes
    ]
    overlap_threshold = XYCUT_SETTINGS["child_block_overlap_ratio_threshold"]
    has_vision_footnote = False
    has_vision_title = False
    for direction in [block.direction, block.secondary_direction]:
        prev_blocks, post_blocks = get_nearest_blocks(
            block, ref_blocks, overlap_threshold, direction
        )
        # Scan the preceding neighbours, nearest first.
        for ref_block in prev_blocks:
            # Anything that is neither text nor a vision title ends the scan.
            if (
                ref_block.label
                not in BLOCK_LABEL_MAP["text_labels"]
                + BLOCK_LABEL_MAP["vision_title_labels"]
            ):
                break
            nearest_edge_distance = get_nearest_edge_distance(
                block.bbox, ref_block.bbox
            )
            block_center = block.get_centroid()
            ref_block_center = ref_block.get_centroid()
            if (
                ref_block.label in BLOCK_LABEL_MAP["vision_title_labels"]
                and nearest_edge_distance <= ref_block.text_line_height * 2
            ):
                has_vision_title = True
                ref_block.order_label = "vision_title"
                block.append_child_block(ref_block)
                region.vision_title_block_idxes.remove(ref_block.index)
            if ref_block.label in BLOCK_LABEL_MAP["text_labels"]:
                if (
                    not has_vision_footnote
                    and ref_block.direction == block.direction
                    and ref_block.long_side_length < block.long_side_length
                    and nearest_edge_distance <= ref_block.text_line_height * 2
                ):
                    if (
                        (
                            ref_block.short_side_length < block.short_side_length
                            and ref_block.long_side_length
                            < 0.5 * block.long_side_length
                            and abs(block_center[0] - ref_block_center[0]) < 10
                        )
                        or (
                            block.bbox[0] - ref_block.bbox[0] < 10
                            and ref_block.num_of_lines == 1
                        )
                        or (
                            block.bbox[2] - ref_block.bbox[2] < 10
                            and ref_block.num_of_lines == 1
                        )
                    ):
                        has_vision_footnote = True
                        ref_block.order_label = "vision_footnote"
                        block.append_child_block(ref_block)
                        region.normal_text_block_idxes.remove(ref_block.index)
                # The first text block ends the scan whether or not it was adopted.
                break
        # Scan the following neighbours, nearest first.
        for ref_block in post_blocks:
            # Once a footnote exists, the next text block ends the scan.
            if (
                has_vision_footnote
                and ref_block.label in BLOCK_LABEL_MAP["text_labels"]
            ):
                break
            nearest_edge_distance = get_nearest_edge_distance(
                block.bbox, ref_block.bbox
            )
            block_center = block.get_centroid()
            ref_block_center = ref_block.get_centroid()
            if (
                ref_block.label in BLOCK_LABEL_MAP["vision_title_labels"]
                and nearest_edge_distance <= ref_block.text_line_height * 2
            ):
                has_vision_title = True
                ref_block.order_label = "vision_title"
                block.append_child_block(ref_block)
                region.vision_title_block_idxes.remove(ref_block.index)
            if ref_block.label in BLOCK_LABEL_MAP["text_labels"]:
                if (
                    not has_vision_footnote
                    and ref_block.direction == block.direction
                    and ref_block.long_side_length < block.long_side_length
                    and nearest_edge_distance <= ref_block.text_line_height * 2
                ):
                    if (
                        (
                            ref_block.short_side_length < block.short_side_length
                            and ref_block.long_side_length
                            < 0.5 * block.long_side_length
                            and abs(block_center[0] - ref_block_center[0]) < 10
                        )
                        or (
                            block.bbox[0] - ref_block.bbox[0] < 10
                            and ref_block.num_of_lines == 1
                        )
                        or (
                            block.bbox[2] - ref_block.bbox[2] < 10
                            and ref_block.num_of_lines == 1
                        )
                    ):
                        has_vision_footnote = True
                        # Unlike the preceding-block scan, the label is rewritten too.
                        ref_block.label = "vision_footnote"
                        ref_block.order_label = "vision_footnote"
                        block.append_child_block(ref_block)
                        region.normal_text_block_idxes.remove(ref_block.index)
                # The first text block ends the scan whether or not it was adopted.
                break
        # A title found along this direction makes the remaining direction unnecessary.
        if has_vision_title:
            break

    # Final pass: adopt text blocks that are almost entirely covered by the vision block.
    for ref_block in ref_blocks:
        # Only blocks still tracked as normal text qualify; this also keeps
        # the remove() below from failing on already-adopted blocks.
        if ref_block.index not in region.normal_text_block_idxes:
            continue

        overlap_ratio = calculate_overlap_ratio(
            block.bbox, ref_block.bbox, mode="small"
        )

        if overlap_ratio > 0.9:
            ref_block.label = "vision_footnote"
            ref_block.order_label = "vision_footnote"
            block.append_child_block(ref_block)
            region.normal_text_block_idxes.remove(ref_block.index)
1036
+
1037
+
1038
def update_region_child_blocks(
    block: LayoutBlock,
    region: LayoutRegion,
) -> None:
    """Attach smaller overlapping blocks of a region to ``block`` as sub-regions.

    Every other block in ``region.block_map`` that overlaps ``block``
    (``calculate_overlap_ratio`` > 0), has a smaller ``area`` and is not
    already a "sub_region" is marked with ``order_label = "sub_region"`` and
    appended to the block's children. Its index is removed from
    ``region.normal_text_block_idxes`` when it is listed there.

    Args:
        block (LayoutBlock): the enclosing block that collects children.
        region (LayoutRegion): layout region; ``normal_text_block_idxes`` is
            mutated.

    Returns:
        None
    """
    for ref_block in region.block_map.values():
        if block.index == ref_block.index:
            continue

        bbox_iou = calculate_overlap_ratio(block.bbox, ref_block.bbox)
        is_sub_region = (
            bbox_iou > 0
            and block.area > ref_block.area
            and ref_block.order_label != "sub_region"
        )
        if not is_sub_region:
            continue

        ref_block.order_label = "sub_region"
        block.append_child_block(ref_block)
        # block_map holds every block of the region, not only normal text
        # blocks, so an unconditional list.remove() could raise ValueError.
        if ref_block.index in region.normal_text_block_idxes:
            region.normal_text_block_idxes.remove(ref_block.index)
1062
+
1063
+
1064
def calculate_discontinuous_projection(
    boxes, direction="horizontal", return_num=False
) -> List:
    """
    Calculate the discontinuous projection of boxes along the specified direction.

    The boxes are projected onto one axis and intervals that touch or overlap
    (next start <= current end) are merged.

    Args:
        boxes (ndarray): Array of bounding boxes represented by [[x_min, y_min, x_max, y_max]].
            May be empty.
        direction (str): direction along which to perform the projection ('horizontal' or 'vertical').
            'horizontal' projects the x extents, 'vertical' the y extents.
        return_num (bool): When True, also return how many boxes were merged
            into each interval. Default is False.

    Returns:
        list: List of ``(start, end)`` tuples representing the merged
        intervals, ordered by start. When ``return_num`` is True, a pair
        ``(merged_intervals, num_list)`` is returned instead. Empty input
        yields an empty list (or a pair of empty lists).

    Raises:
        ValueError: If ``direction`` is neither 'horizontal' nor 'vertical'.
    """
    boxes = np.array(boxes)
    if direction == "horizontal":
        columns = [0, 2]
    elif direction == "vertical":
        columns = [1, 3]
    else:
        raise ValueError("direction must be 'horizontal' or 'vertical'")

    # An empty array is 1-D, so the 2-D column selection below would fail.
    if boxes.size == 0:
        return ([], []) if return_num else []

    intervals = boxes[:, columns]
    intervals = intervals[np.argsort(intervals[:, 0])]

    merged_intervals = []
    num_list = []
    num = 1
    current_start, current_end = intervals[0]

    for start, end in intervals[1:]:
        if start <= current_end:
            num += 1
            current_end = max(current_end, end)
        else:
            num_list.append(num)
            merged_intervals.append((current_start, current_end))
            num = 1
            current_start, current_end = start, end

    # Flush the interval that was still open when the loop ended.
    num_list.append(num)
    merged_intervals.append((current_start, current_end))
    if return_num:
        return merged_intervals, num_list
    return merged_intervals
1107
+
1108
+
1109
def shrink_overlapping_boxes(
    boxes, direction="horizontal", min_threshold=0, max_threshold=0.1
) -> List:
    """
    Shrink consecutive blocks that slightly overlap or nearly touch along a direction.

    Each block is compared with the one that follows it in ``boxes``. When the
    pair needs separating, both ``bbox`` attributes are replaced so that a
    2-unit gap is left around the midpoint of the shared extent.

    A pair is separated when either
        - the projection overlap ratio along the opposite direction is
          positive and the ratio along ``direction`` lies strictly between
          ``min_threshold`` and ``max_threshold``; or
        - the end of the first block and the start of the second block on the
          working axis are at most 3 units apart.

    Args:
        boxes (list): Sequence of block objects exposing a mutable ``bbox``
            attribute laid out as [x_min, y_min, x_max, y_max]. May be empty.
        direction (str): direction along which to perform the shrinking ('horizontal' or 'vertical').
            'vertical' works on the y coordinates; any other value works on x.
        min_threshold (float): Minimum threshold for shrinking. Default is 0.
        max_threshold (float): Maximum threshold for shrinking. Default is 0.1.

    Returns:
        list: The same ``boxes`` object, with block bboxes adjusted in place.
    """
    # Nothing to compare; also avoids IndexError on boxes[0] below.
    if len(boxes) == 0:
        return boxes

    # Index of the min edge on the working axis; the max edge is two slots later.
    low = 1 if direction == "vertical" else 0
    high = low + 2
    match_direction = "horizontal" if direction == "vertical" else "vertical"

    current_block = boxes[0]
    for block in boxes[1:]:
        current_bbox = list(current_block.bbox)
        next_bbox = list(block.bbox)
        cut_iou = calculate_projection_overlap_ratio(
            current_block.bbox, block.bbox, direction=direction
        )
        match_iou = calculate_projection_overlap_ratio(
            current_block.bbox, block.bbox, direction=match_direction
        )

        slight_overlap = match_iou > 0 and min_threshold < cut_iou < max_threshold
        # Also covers the exact-touch case (difference of 0).
        nearly_touching = abs(current_bbox[high] - next_bbox[low]) <= 3

        if slight_overlap or nearly_touching:
            shared_min = max(current_bbox[low], next_bbox[low])
            shared_max = min(current_bbox[high], next_bbox[high])
            split = int((shared_min + shared_max) / 2)
            if current_bbox[low] < next_bbox[low]:
                # Current block comes first: pull its end back, push the next start forward.
                current_bbox[high] = split - 1
                next_bbox[low] = split + 1
            else:
                current_bbox[low] = split - 1
                next_bbox[high] = split + 1
            current_block.bbox = current_bbox
            block.bbox = next_bbox

        current_block = block
    return boxes
1172
+
1173
+
1174
def find_local_minima_flat_regions(arr) -> List:
    """
    Locate flat runs of equal values that form local minima in a sequence.

    A run qualifies when the value before it is larger (or the run starts at
    index 0) and the value right after it is larger. A run that extends to the
    end of the sequence is never reported, because runs are only evaluated
    when a following, different value is seen.

    Args:
        arr (list): The input sequence.

    Returns:
        list: ``(start, end)`` index pairs (inclusive) of the qualifying runs,
        with the first qualifying run left out. Returns ``[]`` for an empty
        input and ``None`` when fewer than two runs qualify.
    """
    size = len(arr)
    if size == 0:
        return []

    minima = []
    run_start = 0

    for pos in range(1, size):
        if arr[pos] == arr[pos - 1]:
            # Still inside the current run of equal values.
            continue
        run_value = arr[run_start]
        falls_into_run = run_start == 0 or arr[run_start - 1] > run_value
        if falls_into_run and arr[pos] > run_value:
            minima.append((run_start, pos - 1))
        run_start = pos

    if len(minima) > 1:
        return minima[1:]
    return None