paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +11 -59
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  44. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  45. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  46. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  48. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  49. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  51. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  52. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  53. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  54. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  57. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  58. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  59. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  60. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  61. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  63. paddlex/inference/models/formula_recognition/predictor.py +7 -1
  64. paddlex/inference/models/formula_recognition/processors.py +92 -79
  65. paddlex/inference/models/formula_recognition/result.py +28 -27
  66. paddlex/inference/models/image_feature/processors.py +3 -4
  67. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  68. paddlex/inference/models/object_detection/predictor.py +2 -0
  69. paddlex/inference/models/object_detection/processors.py +28 -3
  70. paddlex/inference/models/object_detection/utils.py +2 -0
  71. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  72. paddlex/inference/models/text_detection/predictor.py +8 -0
  73. paddlex/inference/models/text_detection/processors.py +44 -10
  74. paddlex/inference/models/text_detection/result.py +0 -10
  75. paddlex/inference/pipelines/__init__.py +9 -5
  76. paddlex/inference/pipelines/_parallel.py +172 -0
  77. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  78. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  79. paddlex/inference/pipelines/base.py +14 -4
  80. paddlex/inference/pipelines/components/faisser.py +1 -1
  81. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  82. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  83. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  84. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  85. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  86. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  87. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  88. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  89. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
  90. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  91. paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
  92. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  93. paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
  94. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  95. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  96. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  97. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  98. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  99. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  100. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  101. paddlex/inference/pipelines/ocr/result.py +19 -16
  102. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  103. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  104. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  105. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  106. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
  107. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  108. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  109. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  110. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  112. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  113. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  114. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  115. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  116. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  117. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  118. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  119. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  120. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  121. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  122. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  123. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  127. paddlex/inference/serving/infra/utils.py +20 -22
  128. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  129. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  130. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  131. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  132. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  133. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  134. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  135. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  136. paddlex/inference/utils/hpi.py +8 -1
  137. paddlex/inference/utils/hpi_model_info_collection.json +81 -2
  138. paddlex/inference/utils/io/readers.py +12 -12
  139. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  140. paddlex/inference/utils/official_models.py +14 -0
  141. paddlex/inference/utils/pp_option.py +29 -8
  142. paddlex/model.py +2 -2
  143. paddlex/modules/__init__.py +1 -1
  144. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  145. paddlex/modules/base/__init__.py +1 -1
  146. paddlex/modules/base/evaluator.py +5 -5
  147. paddlex/modules/base/trainer.py +1 -1
  148. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  149. paddlex/modules/doc_vlm/evaluator.py +2 -2
  150. paddlex/modules/doc_vlm/exportor.py +2 -2
  151. paddlex/modules/doc_vlm/model_list.py +1 -1
  152. paddlex/modules/doc_vlm/trainer.py +2 -2
  153. paddlex/modules/face_recognition/evaluator.py +2 -2
  154. paddlex/modules/formula_recognition/evaluator.py +5 -2
  155. paddlex/modules/formula_recognition/model_list.py +3 -0
  156. paddlex/modules/formula_recognition/trainer.py +3 -0
  157. paddlex/modules/general_recognition/evaluator.py +1 -1
  158. paddlex/modules/image_classification/evaluator.py +2 -2
  159. paddlex/modules/image_classification/model_list.py +1 -0
  160. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  161. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  162. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  163. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  164. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  165. paddlex/modules/object_detection/evaluator.py +2 -2
  166. paddlex/modules/object_detection/model_list.py +2 -0
  167. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  168. paddlex/modules/table_recognition/evaluator.py +2 -2
  169. paddlex/modules/text_detection/evaluator.py +2 -2
  170. paddlex/modules/text_detection/model_list.py +2 -0
  171. paddlex/modules/text_recognition/evaluator.py +2 -2
  172. paddlex/modules/text_recognition/model_list.py +2 -0
  173. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  174. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  175. paddlex/modules/ts_classification/evaluator.py +2 -2
  176. paddlex/modules/ts_forecast/evaluator.py +2 -2
  177. paddlex/modules/video_classification/evaluator.py +2 -2
  178. paddlex/modules/video_detection/evaluator.py +2 -2
  179. paddlex/ops/__init__.py +2 -2
  180. paddlex/paddlex_cli.py +19 -13
  181. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  182. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  183. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  184. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  185. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  186. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  187. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  188. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  189. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  190. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  191. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  192. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  193. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  194. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  195. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  196. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  197. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  198. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  200. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  201. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  202. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  203. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  204. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  205. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  206. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  207. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  208. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  209. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  210. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  211. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  212. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  213. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  214. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  215. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  216. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  217. paddlex/repo_apis/base/config.py +1 -1
  218. paddlex/repo_manager/core.py +3 -3
  219. paddlex/repo_manager/meta.py +6 -2
  220. paddlex/repo_manager/repo.py +17 -16
  221. paddlex/utils/custom_device_list.py +26 -2
  222. paddlex/utils/deps.py +1 -1
  223. paddlex/utils/device.py +15 -8
  224. paddlex/utils/env.py +4 -0
  225. paddlex/utils/flags.py +2 -4
  226. paddlex/utils/fonts/__init__.py +34 -4
  227. paddlex/utils/misc.py +1 -1
  228. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
  229. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
  230. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  231. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  232. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
  233. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -14,12 +14,15 @@
14
14
  from __future__ import annotations
15
15
 
16
16
  import copy
17
+ import math
17
18
  import re
18
- from pathlib import Path
19
+ from functools import partial
20
+ from typing import List
19
21
 
20
22
  import numpy as np
21
- from PIL import Image, ImageDraw
23
+ from PIL import Image, ImageDraw, ImageFont
22
24
 
25
+ from ....utils.fonts import PINGFANG_FONT_FILE_PATH
23
26
  from ...common.result import (
24
27
  BaseCVResult,
25
28
  HtmlMixin,
@@ -27,7 +30,166 @@ from ...common.result import (
27
30
  MarkdownMixin,
28
31
  XlsxMixin,
29
32
  )
30
- from .utils import get_show_color
33
+ from .setting import BLOCK_LABEL_MAP
34
+
35
+
36
def compile_title_pattern():
    """Build the regex that splits leading numbering off a section title.

    After optional leading whitespace the pattern accepts one of these
    numbering styles:

    * dotted decimals such as ``1``, ``1.2`` or ``1.2.3``, optionally
      followed by ``.`` or ``、``;
    * a decimal or Chinese numeral wrapped in parentheses (ASCII or
      full-width), e.g. ``(1)`` or ``（一）``;
    * a run of Chinese numerals, optionally followed by ``、`` or ``.``;
    * a Roman numeral from I to X, optionally followed by ``.``.

    Roman numerals are tried longest-first and must be followed by
    whitespace, ``.``, ``、`` or the end of the string.  Without that
    guard the bare alternative ``I`` matches the first letter of ordinary
    words ("Introduction" -> "I" + "ntroduction") and shadows the longer
    numerals ("II." -> "I" + "I.").

    Returns:
        re.Pattern: compiled pattern whose groups are (1) the numbering,
        (2) the whitespace that follows it and (3) the rest of the title.
    """
    chinese_numerals = "一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾"
    numbering_pattern = (
        r"(?:"
        # Dotted decimal numbering: 1, 1.2, 1.2.3. / 1、
        r"[1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?|"
        # Parenthesised decimal or Chinese numeral, ASCII or full-width.
        r"[\(（](?:[1-9][0-9]*|[" + chinese_numerals + r"]+)[\)）]|"
        # Bare Chinese numerals, e.g. 一、
        r"[" + chinese_numerals + r"]+[、\.]?|"
        # Roman numerals I..X, longest first, delimiter required.
        r"(?:VIII|VII|III|II|IX|IV|VI|X|V|I)(?=[\s\.、]|$)\.?"
        r")"
    )
    return re.compile(r"^\s*(" + numbering_pattern + r")(\s*)(.*)$")


# Compiled once at import time and shared by every title-formatting call.
TITLE_RE_PATTERN = compile_title_pattern()
48
+
49
+
50
def format_title_func(block):
    """Render a title block as a Markdown heading line.

    When the title starts with numbering recognised by ``TITLE_RE_PATTERN``
    the numbering and the remaining text are re-joined with exactly one
    space; otherwise the text is used as is.  Trailing ``.`` characters are
    then removed, and the heading is emitted with (number of ``.`` left in
    the title) + 2 ``#`` characters.  Finally ``"-\\n"`` sequences are
    dropped and remaining newlines become spaces.

    Args:
        block: Object whose ``content`` attribute holds the raw title text.

    Returns:
        str: The single-line Markdown heading.
    """
    heading = block.content
    parsed = TITLE_RE_PATTERN.match(heading)
    if parsed is not None:
        # Numbering and text separated by one space, whatever the input had.
        heading = parsed.group(1).strip() + " " + parsed.group(3).lstrip()

    heading = heading.rstrip(".")
    # count(".") is 0 when no dot is present, which yields depth 1.
    depth = heading.count(".") + 1
    hashes = "#" * (depth + 1)
    rendered = f"{hashes} {heading}"
    return rendered.replace("-\n", "").replace("\n", " ")
81
+
82
+
83
def format_centered_by_html(string):
    """Wrap ``string`` in a centre-aligned HTML ``<div>`` on one line.

    ``"-\\n"`` sequences are removed and any other newline is replaced by a
    space before a single trailing newline is appended.

    Args:
        string: Text (or HTML fragment) to place inside the ``<div>``.

    Returns:
        str: The wrapped, single-line markup terminated by ``"\\n"``.
    """
    opening = '<div style="text-align: center;">'
    wrapped = f"{opening}{string}</div>"
    single_line = wrapped.replace("-\n", "").replace("\n", " ")
    return f"{single_line}\n"
91
+
92
+
93
def format_text_plain_func(block):
    """Return the block's ``content`` attribute without any modification.

    Args:
        block: Object exposing a ``content`` attribute.

    Returns:
        Whatever ``block.content`` holds.
    """
    plain_text = block.content
    return plain_text
95
+
96
+
97
def format_image_scaled_by_html_func(block, original_image_width):
    """Build an ``<img>`` tag whose width is a percentage of the page width.

    The percentage is the image width divided by ``original_image_width``,
    times 100, truncated to an integer.  ``"-\\n"`` sequences are removed
    from the path and remaining newlines become spaces.

    Args:
        block: Object whose ``image`` mapping provides ``"path"`` (the image
            location) and ``"img"`` (an object with a ``width`` attribute).
        original_image_width: Width the image width is compared against.

    Returns:
        str: A single ``<img ... />`` tag.
    """
    raw_path = block.image["path"]
    crop_width = block.image["img"].width
    percent = int(crop_width / original_image_width * 100)
    src = raw_path.replace("-\n", "").replace("\n", " ")
    return f'<img src="{src}" alt="Image" width="{percent}%" />'
108
+
109
+
110
def format_image_plain_func(block):
    """Build a plain Markdown image reference with empty alt text.

    ``"-\\n"`` sequences are removed from the path and remaining newlines
    become spaces.

    Args:
        block: Object whose ``image`` mapping provides ``"path"``.

    Returns:
        str: ``![](<path>)``.
    """
    cleaned_path = block.image["path"].replace("-\n", "").replace("\n", " ")
    return f"![]({cleaned_path})"
115
+
116
+
117
def format_chart2table_func(block):
    """Turn pipe-separated chart text into a Markdown table.

    The first line of ``block.content`` is treated as the header.  A
    ``---`` separator row with one cell per header column is inserted after
    it, and every row is wrapped in leading and trailing ``|``.

    Args:
        block: Object whose ``content`` is newline-separated rows with cells
            separated by ``|``.

    Returns:
        str: The Markdown table text.
    """
    header, *body_rows = block.content.split("\n")
    separator = "|".join("---" for _ in header.split("|"))
    table_rows = [header, separator, *body_rows]
    return "\n".join(f"|{row}|" for row in table_rows)
123
+
124
+
125
def simplify_table_func(table_code):
    """Strip the ``<html>``/``<body>`` wrapper tags from table markup.

    Args:
        table_code: HTML string for a table.

    Returns:
        str: A newline followed by ``table_code`` with every ``<html>``,
        ``</html>``, ``<body>`` and ``</body>`` occurrence removed.
    """
    stripped = table_code
    # Same removal order as a chained .replace() call would apply.
    for wrapper_tag in ("<html>", "</html>", "<body>", "</body>"):
        stripped = stripped.replace(wrapper_tag, "")
    return "\n" + stripped
129
+
130
+
131
def format_first_line_func(block, templates, format_func, spliter):
    """Reformat the first non-blank piece of a block if it is a known heading.

    ``block.content`` is split on ``spliter``.  Only the first piece that is
    not blank after stripping is examined: if its lower-cased text is in
    ``templates`` it is replaced by ``format_func(piece)``.  Later pieces
    are never inspected.

    Args:
        block: Object whose ``content`` attribute holds the text.
        templates: Container of lower-case strings to compare against.
        format_func: Callable applied to the matching piece.
        spliter: Separator used both to split and to re-join the text.

    Returns:
        str: The re-joined text.
    """
    pieces = block.content.split(spliter)
    first_filled = next(
        (pos for pos, piece in enumerate(pieces) if piece.strip() != ""),
        None,
    )
    if first_filled is not None and pieces[first_filled].lower() in templates:
        pieces[first_filled] = format_func(pieces[first_filled])
    return spliter.join(pieces)
141
+
142
+
143
def get_seg_flag(block: LayoutParsingBlock, prev_block: LayoutParsingBlock):
    """Compute the ``(seg_start_flag, seg_end_flag)`` pair for a block.

    Both flags are True unless a gap test clears them.  The tests compare
    the block's ``seg_start_coordinate`` / ``seg_end_coordinate`` with the
    left/right edges of a horizontal span, using a fixed tolerance of 10
    (same units as ``bbox``; presumably pixels -- confirm with the caller).
    The span is the block's own ``bbox`` x-range, widened to include the
    previous block's x-range when the two overlap horizontally.

    Args:
        block: Current block; reads ``bbox``, ``seg_start_coordinate``,
            ``seg_end_coordinate`` and ``width``.
        prev_block: Preceding block, or None.  When given, reads ``bbox``,
            ``num_of_lines``, ``seg_end_coordinate`` and ``width``.

    Returns:
        tuple[bool, bool]: ``(seg_start_flag, seg_end_flag)``.
    """
    gap_limit = 10
    own_box = block.bbox
    span_left, span_right = own_box[0], own_box[2]
    seg_start = block.seg_start_coordinate
    seg_end = block.seg_end_coordinate

    if prev_block is None:
        # No predecessor: only the block's own left gap decides the flag.
        seg_start_flag = not (seg_start - span_left < gap_limit)
    else:
        prev_box = prev_block.bbox
        prev_line_count = prev_block.num_of_lines
        prev_seg_end = prev_block.seg_end_coordinate
        prev_tail_is_tight = abs(prev_box[2] - prev_seg_end) < gap_limit
        prev_is_multiline = prev_line_count > 1

        if span_left < prev_box[2]:
            # Overlapping x-ranges: widen the span to cover both blocks and
            # re-measure the previous block's tail gap against it.
            span_left = min(prev_box[0], span_left)
            span_right = max(prev_box[2], span_right)
            prev_tail_is_tight = abs(span_right - prev_seg_end) < gap_limit
            edge_distance = 0
        else:
            edge_distance = abs(own_box[0] - prev_box[2])

        head_is_tight = seg_start - span_left < gap_limit

        # The flag is cleared only when all four conditions hold.
        seg_start_flag = not (
            prev_tail_is_tight
            and head_is_tight
            and prev_is_multiline
            and edge_distance < max(prev_block.width, block.width)
        )

    # The right-hand test uses the (possibly widened) span edge.
    seg_end_flag = not (span_right - seg_end < gap_limit)

    return seg_start_flag, seg_end_flag
31
193
 
32
194
 
33
195
  class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
@@ -40,30 +202,10 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
40
202
  XlsxMixin.__init__(self)
41
203
  MarkdownMixin.__init__(self)
42
204
  JsonMixin.__init__(self)
43
- self.title_pattern = self._build_title_pattern()
44
-
45
- def _build_title_pattern(self):
46
- # Precompiled regex pattern for matching numbering at the beginning of the title
47
- numbering_pattern = (
48
- r"(?:"
49
- + r"[1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?|"
50
- + r"[\(\(](?:[1-9][0-9]*|["
51
- r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+)[\)\)]|" + r"["
52
- r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+"
53
- r"[、\.]?|" + r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)\.?" + r")"
54
- )
55
- return re.compile(r"^\s*(" + numbering_pattern + r")(\s*)(.*)$")
56
-
57
- def _get_input_fn(self):
58
- fn = super()._get_input_fn()
59
- if (page_idx := self["page_index"]) is not None:
60
- fp = Path(fn)
61
- stem, suffix = fp.stem, fp.suffix
62
- return f"{stem}_{page_idx}{suffix}"
63
- else:
64
- return fn
65
205
 
66
206
  def _to_img(self) -> dict[str, np.ndarray]:
207
+ from .utils import get_show_color
208
+
67
209
  res_img_dict = {}
68
210
  model_settings = self["model_settings"]
69
211
  if model_settings["use_doc_preprocessor"]:
@@ -71,12 +213,14 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
71
213
  res_img_dict[key] = value
72
214
  res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"]
73
215
 
74
- if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
75
- res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
216
+ if model_settings["use_region_detection"]:
217
+ res_img_dict["region_det_res"] = self["region_det_res"].img["res"]
218
+
219
+ res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
76
220
 
77
221
  if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
78
222
  table_cell_img = Image.fromarray(
79
- copy.deepcopy(self["doc_preprocessor_res"]["output_img"])
223
+ copy.deepcopy(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
80
224
  )
81
225
  table_draw = ImageDraw.Draw(table_cell_img)
82
226
  rectangle_color = (255, 0, 0)
@@ -101,16 +245,23 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
101
245
  # for layout ordering image
102
246
  image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
103
247
  draw = ImageDraw.Draw(image, "RGBA")
104
- parsing_result = self["parsing_res_list"]
248
+ font_size = int(0.018 * int(image.width)) + 2
249
+ font = ImageFont.truetype(PINGFANG_FONT_FILE_PATH, font_size, encoding="utf-8")
250
+ parsing_result: List[LayoutParsingBlock] = self["parsing_res_list"]
105
251
  for block in parsing_result:
106
- bbox = block["block_bbox"]
107
- index = block.get("index", None)
108
- label = block["sub_label"]
109
- fill_color = get_show_color(label)
252
+ bbox = block.bbox
253
+ index = block.order_index
254
+ label = block.label
255
+ fill_color = get_show_color(label, False)
110
256
  draw.rectangle(bbox, fill=fill_color)
111
257
  if index is not None:
112
- text_position = (bbox[2] + 2, bbox[1] - 10)
113
- draw.text(text_position, str(index), fill="red")
258
+ text_position = (bbox[2] + 2, bbox[1] - font_size // 2)
259
+ if int(image.width) - bbox[2] < font_size:
260
+ text_position = (
261
+ int(bbox[2] - font_size * 1.1),
262
+ bbox[1] - font_size // 2,
263
+ )
264
+ draw.text(text_position, str(index), font=font, fill="red")
114
265
 
115
266
  res_img_dict["layout_order_res"] = image
116
267
 
@@ -134,8 +285,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
134
285
  if self["model_settings"]["use_doc_preprocessor"]:
135
286
  data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
136
287
  data["layout_det_res"] = self["layout_det_res"].str["res"]
137
- if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
138
- data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
288
+ data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
139
289
  if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
140
290
  data["table_res_list"] = []
141
291
  for sno in range(len(self["table_res_list"])):
@@ -176,9 +326,9 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
176
326
  parsing_res_list = self["parsing_res_list"]
177
327
  parsing_res_list = [
178
328
  {
179
- "block_label": parsing_res["block_label"],
180
- "block_content": parsing_res["block_content"],
181
- "block_bbox": parsing_res["block_bbox"],
329
+ "block_label": parsing_res.label,
330
+ "block_content": parsing_res.content,
331
+ "block_bbox": parsing_res.bbox,
182
332
  }
183
333
  for parsing_res in parsing_res_list
184
334
  ]
@@ -186,8 +336,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
186
336
  if self["model_settings"]["use_doc_preprocessor"]:
187
337
  data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
188
338
  data["layout_det_res"] = self["layout_det_res"].json["res"]
189
- if model_settings["use_general_ocr"] or model_settings["use_table_recognition"]:
190
- data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
339
+ data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
191
340
  if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
192
341
  data["table_res_list"] = []
193
342
  for sno in range(len(self["table_res_list"])):
@@ -240,228 +389,357 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
240
389
  res_xlsx_dict[key] = table_res.xlsx["pred"]
241
390
  return res_xlsx_dict
242
391
 
243
- def _to_markdown(self) -> dict:
392
+ def _to_markdown(self, pretty=True) -> dict:
244
393
  """
245
394
  Save the parsing result to a Markdown file.
246
395
 
396
+ Args:
397
+ pretty (Optional[bool]): whether to pretty markdown by HTML, default by True.
398
+
247
399
  Returns:
248
400
  Dict
249
401
  """
402
+ original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
250
403
 
251
- def _format_data(obj):
252
-
253
- def format_title(title):
254
- """
255
- Normalize chapter title.
256
- Add the '#' to indicate the level of the title.
257
- If numbering exists, ensure there's exactly one space between it and the title content.
258
- If numbering does not exist, return the original title unchanged.
259
-
260
- :param title: Original chapter title string.
261
- :return: Normalized chapter title string.
262
- """
263
- match = self.title_pattern.match(title)
264
- if match:
265
- numbering = match.group(1).strip()
266
- title_content = match.group(3).lstrip()
267
- # Return numbering and title content separated by one space
268
- title = numbering + " " + title_content
269
-
270
- title = title.rstrip(".")
271
- level = (
272
- title.count(
273
- ".",
274
- )
275
- + 1
276
- if "." in title
277
- else 1
278
- )
279
- return f"#{'#' * level} {title}".replace("-\n", "").replace(
280
- "\n",
281
- " ",
282
- )
283
-
284
- def format_centered_text(key):
285
- return (
286
- f'<div style="text-align: center;">{block[key]}</div>'.replace(
287
- "-\n",
288
- "",
289
- ).replace("\n", " ")
290
- + "\n"
404
+ if pretty:
405
+ format_text_func = lambda block: format_centered_by_html(
406
+ format_text_plain_func(block)
407
+ )
408
+ format_image_func = lambda block: format_centered_by_html(
409
+ format_image_scaled_by_html_func(
410
+ block,
411
+ original_image_width=original_image_width,
291
412
  )
413
+ )
414
+ else:
415
+ format_text_func = lambda block: block.content
416
+ format_image_func = format_image_plain_func
292
417
 
293
- def format_image(label):
294
- img_tags = []
295
- image_path = "".join(block[label].keys())
296
- img_tags.append(
297
- '<div style="text-align: center;"><img src="{}" alt="Image" /></div>'.format(
298
- image_path.replace("-\n", "").replace("\n", " "),
299
- ),
300
- )
301
- return "\n".join(img_tags)
302
-
303
- def format_first_line(templates, format_func, spliter):
304
- lines = block["block_content"].split(spliter)
305
- for idx in range(len(lines)):
306
- line = lines[idx]
307
- if line.strip() == "":
308
- continue
309
- if line.lower() in templates:
310
- lines[idx] = format_func(line)
311
- break
312
- return spliter.join(lines)
313
-
314
- def format_table():
315
- return "\n" + block["block_content"]
316
-
317
- def get_seg_flag(block, prev_block):
318
-
319
- seg_start_flag = True
320
- seg_end_flag = True
321
-
322
- block_box = block["block_bbox"]
323
- context_left_coordinate = block_box[0]
324
- context_right_coordinate = block_box[2]
325
- seg_start_coordinate = block.get("seg_start_coordinate")
326
- seg_end_coordinate = block.get("seg_end_coordinate")
327
-
328
- if prev_block is not None:
329
- prev_block_bbox = prev_block["block_bbox"]
330
- num_of_prev_lines = prev_block.get("num_of_lines")
331
- pre_block_seg_end_coordinate = prev_block.get("seg_end_coordinate")
332
- prev_end_space_small = (
333
- context_right_coordinate - pre_block_seg_end_coordinate < 10
334
- )
335
- prev_lines_more_than_one = num_of_prev_lines > 1
336
-
337
- overlap_blocks = context_left_coordinate < prev_block_bbox[2]
338
-
339
- # update context_left_coordinate and context_right_coordinate
340
- if overlap_blocks:
341
- context_left_coordinate = min(
342
- prev_block_bbox[0], context_left_coordinate
343
- )
344
- context_right_coordinate = max(
345
- prev_block_bbox[2], context_right_coordinate
346
- )
347
- prev_end_space_small = (
348
- prev_block_bbox[2] - pre_block_seg_end_coordinate < 10
349
- )
350
-
351
- current_start_space_small = (
352
- seg_start_coordinate - context_left_coordinate < 10
353
- )
418
+ if self["model_settings"].get("use_chart_recognition", False):
419
+ format_chart_func = format_chart2table_func
420
+ else:
421
+ format_chart_func = format_image_func
354
422
 
355
- if (
356
- prev_end_space_small
357
- and current_start_space_small
358
- and prev_lines_more_than_one
359
- ):
360
- seg_start_flag = False
361
- else:
362
- if seg_start_coordinate - context_left_coordinate < 10:
363
- seg_start_flag = False
364
-
365
- if context_right_coordinate - seg_end_coordinate < 10:
366
- seg_end_flag = False
367
-
368
- return seg_start_flag, seg_end_flag
369
-
370
- handlers = {
371
- "paragraph_title": lambda: format_title(block["block_content"]),
372
- "doc_title": lambda: f"# {block['block_content']}".replace(
373
- "-\n",
374
- "",
375
- ).replace("\n", " "),
376
- "table_title": lambda: format_centered_text("block_content"),
377
- "figure_title": lambda: format_centered_text("block_content"),
378
- "chart_title": lambda: format_centered_text("block_content"),
379
- "text": lambda: block["block_content"]
380
- .replace("-\n", " ")
381
- .replace("\n", " "),
382
- "abstract": lambda: format_first_line(
383
- ["摘要", "abstract"], lambda l: f"## {l}\n", " "
384
- ),
385
- "content": lambda: block["block_content"]
386
- .replace("-\n", " \n")
387
- .replace("\n", " \n"),
388
- "image": lambda: format_image("block_image"),
389
- "chart": lambda: format_image("block_image"),
390
- "formula": lambda: f"$${block['block_content']}$$",
391
- "table": format_table,
392
- "reference": lambda: format_first_line(
393
- ["参考文献", "references"], lambda l: f"## {l}", "\n"
394
- ),
395
- "algorithm": lambda: block["block_content"].strip("\n"),
396
- "seal": lambda: f"Words of Seals:\n{block['block_content']}",
397
- }
398
- parsing_res_list = obj["parsing_res_list"]
399
- markdown_content = ""
400
- last_label = None
401
- seg_start_flag = None
402
- seg_end_flag = None
403
- prev_block = None
404
- page_first_element_seg_start_flag = None
405
- page_last_element_seg_end_flag = None
406
- parsing_res_list = sorted(
407
- parsing_res_list,
408
- key=lambda x: x.get("sub_index", 999),
423
+ if self["model_settings"].get("use_seal_recognition", False):
424
+ format_seal_func = lambda block: "\n".join(
425
+ [format_image_func(block), format_text_func(block)]
409
426
  )
410
- for block in parsing_res_list:
411
- seg_start_flag, seg_end_flag = get_seg_flag(block, prev_block)
412
-
413
- label = block.get("block_label")
414
- page_first_element_seg_start_flag = (
415
- seg_start_flag
416
- if (page_first_element_seg_start_flag is None)
417
- else page_first_element_seg_start_flag
427
+ else:
428
+ format_seal_func = format_image_func
429
+
430
+ if self["model_settings"].get("use_table_recognition", False):
431
+ if pretty:
432
+ format_table_func = lambda block: "\n" + format_text_func(
433
+ block
434
+ ).replace("<table>", '<table border="1">')
435
+ else:
436
+ format_table_func = lambda block: simplify_table_func(
437
+ "\n" + block.content
418
438
  )
419
- handler = handlers.get(label)
420
- if handler:
421
- prev_block = block
422
- if label == last_label == "text" and seg_start_flag == False:
423
- last_char_of_markdown = (
424
- markdown_content[-1] if markdown_content else ""
425
- )
426
- first_char_of_handler = handler()[0] if handler() else ""
427
- last_is_chinese_char = (
428
- re.match(r"[\u4e00-\u9fff]", last_char_of_markdown)
429
- if last_char_of_markdown
430
- else False
431
- )
432
- first_is_chinese_char = (
433
- re.match(r"[\u4e00-\u9fff]", first_char_of_handler)
434
- if first_char_of_handler
435
- else False
436
- )
437
- if not (last_is_chinese_char or first_is_chinese_char):
438
- markdown_content += " " + handler()
439
- else:
440
- markdown_content += handler()
441
- else:
442
- markdown_content += (
443
- "\n\n" + handler() if markdown_content else handler()
444
- )
445
- last_label = label
446
- page_last_element_seg_end_flag = seg_end_flag
447
-
448
- return markdown_content, (
449
- page_first_element_seg_start_flag,
450
- page_last_element_seg_end_flag,
439
+ else:
440
+ format_table_func = format_image_func
441
+
442
+ if self["model_settings"].get("use_formula_recognition", False):
443
+ format_formula_func = lambda block: f"$${block.content}$$"
444
+ else:
445
+ format_formula_func = format_image_func
446
+
447
+ handle_funcs_dict = {
448
+ "paragraph_title": format_title_func,
449
+ "abstract_title": format_title_func,
450
+ "reference_title": format_title_func,
451
+ "content_title": format_title_func,
452
+ "doc_title": lambda block: f"# {block.content}".replace(
453
+ "-\n",
454
+ "",
455
+ ).replace("\n", " "),
456
+ "table_title": format_text_func,
457
+ "figure_title": format_text_func,
458
+ "chart_title": format_text_func,
459
+ "text": lambda block: block.content.replace("\n\n", "\n").replace(
460
+ "\n", "\n\n"
461
+ ),
462
+ "abstract": partial(
463
+ format_first_line_func,
464
+ templates=["摘要", "abstract"],
465
+ format_func=lambda l: f"## {l}\n",
466
+ spliter=" ",
467
+ ),
468
+ "content": lambda block: block.content.replace("-\n", " \n").replace(
469
+ "\n", " \n"
470
+ ),
471
+ "image": format_image_func,
472
+ "chart": format_chart_func,
473
+ "formula": format_formula_func,
474
+ "table": format_table_func,
475
+ "reference": partial(
476
+ format_first_line_func,
477
+ templates=["参考文献", "references"],
478
+ format_func=lambda l: f"## {l}",
479
+ spliter="\n",
480
+ ),
481
+ "algorithm": lambda block: block.content.strip("\n"),
482
+ "seal": format_seal_func,
483
+ }
484
+
485
+ markdown_content = ""
486
+ last_label = None
487
+ seg_start_flag = None
488
+ seg_end_flag = None
489
+ prev_block = None
490
+ page_first_element_seg_start_flag = None
491
+ page_last_element_seg_end_flag = None
492
+ markdown_info = {}
493
+ markdown_info["markdown_images"] = {}
494
+ for block in self["parsing_res_list"]:
495
+ seg_start_flag, seg_end_flag = get_seg_flag(block, prev_block)
496
+
497
+ label = block.label
498
+ if block.image is not None:
499
+ markdown_info["markdown_images"][block.image["path"]] = block.image[
500
+ "img"
501
+ ]
502
+ page_first_element_seg_start_flag = (
503
+ seg_start_flag
504
+ if (page_first_element_seg_start_flag is None)
505
+ else page_first_element_seg_start_flag
451
506
  )
452
507
 
453
- markdown_info = dict()
454
- markdown_info["markdown_texts"], (
455
- page_first_element_seg_start_flag,
456
- page_last_element_seg_end_flag,
457
- ) = _format_data(self)
508
+ handle_func = handle_funcs_dict.get(label, None)
509
+ if handle_func:
510
+ prev_block = block
511
+ if label == last_label == "text" and seg_start_flag == False:
512
+ markdown_content += handle_func(block)
513
+ else:
514
+ markdown_content += (
515
+ "\n\n" + handle_func(block)
516
+ if markdown_content
517
+ else handle_func(block)
518
+ )
519
+ last_label = label
520
+ page_last_element_seg_end_flag = seg_end_flag
521
+
522
+ markdown_info["markdown_texts"] = markdown_content
458
523
  markdown_info["page_continuation_flags"] = (
459
524
  page_first_element_seg_start_flag,
460
525
  page_last_element_seg_end_flag,
461
526
  )
462
-
463
- markdown_info["markdown_images"] = {}
464
527
  for img in self["imgs_in_doc"]:
465
528
  markdown_info["markdown_images"][img["path"]] = img["img"]
466
529
 
467
530
  return markdown_info
531
+
532
+
533
class LayoutParsingBlock:
    """One layout element of a parsed page: a label, a bounding box and content.

    Besides the raw fields, the block caches geometry derived from its bounding
    box (width, height, area, dominant direction and the start/end coordinates
    along that direction) and can temporarily absorb child blocks, growing its
    bounding box to the union of itself and its children.
    """

    def __init__(self, label, bbox, content="") -> None:
        """Create a block.

        Args:
            label: Layout label of the block (e.g. ``"text"``).
            bbox: ``[x_min, y_min, x_max, y_max]``; stored truncated to ints.
            content: Recognised content of the block, empty by default.
        """
        self.label = label
        self.order_label = None
        self.bbox = list(map(int, bbox))
        self.content = content
        # Sentinels: any real coordinate compares below +inf / above -inf.
        # Presumably overwritten by text-line analysis elsewhere - confirm.
        self.seg_start_coordinate = float("inf")
        self.seg_end_coordinate = float("-inf")
        # Width/height come from the raw ``bbox`` argument, not from the
        # int-truncated ``self.bbox``, so they keep any fractional part.
        self.width = bbox[2] - bbox[0]
        self.height = bbox[3] - bbox[1]
        self.area = self.width * self.height
        self.num_of_lines = 1
        self.image = None
        self.index = None
        self.order_index = None
        self.text_line_width = 1
        self.text_line_height = 1
        self.direction = self.get_bbox_direction()
        self.child_blocks = []
        self.update_direction_info()

    def __str__(self) -> str:
        return f"{self.__dict__}"

    def __repr__(self) -> str:
        _str = f"\n\n#################\nindex:\t{self.index}\nlabel:\t{self.label}\nregion_label:\t{self.order_label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
        return _str

    def to_dict(self) -> dict:
        """Return the instance attribute dictionary.

        This is the live ``__dict__``, not a copy: mutating the returned
        mapping mutates the block.
        """
        return self.__dict__

    def update_direction_info(self) -> None:
        """Refresh the direction-dependent attributes from ``direction`` and ``bbox``.

        Side lengths are read from the cached ``width``/``height``; start/end
        coordinates are read from the current ``bbox``.
        """
        if self.direction == "horizontal":
            self.secondary_direction = "vertical"
            self.short_side_length = self.height
            self.long_side_length = self.width
            self.start_coordinate = self.bbox[0]
            self.end_coordinate = self.bbox[2]
            self.secondary_direction_start_coordinate = self.bbox[1]
            self.secondary_direction_end_coordinate = self.bbox[3]
        else:
            self.secondary_direction = "horizontal"
            self.short_side_length = self.width
            self.long_side_length = self.height
            self.start_coordinate = self.bbox[1]
            self.end_coordinate = self.bbox[3]
            self.secondary_direction_start_coordinate = self.bbox[0]
            self.secondary_direction_end_coordinate = self.bbox[2]

    def append_child_block(self, child_block: "LayoutParsingBlock") -> None:
        """Attach ``child_block`` and grow ``bbox`` to cover it.

        The first time a child is attached the current box is remembered in
        ``ori_bbox`` so that ``get_child_blocks`` can restore it. If the child
        itself has children they are detached from it and attached here, so
        ``child_blocks`` always stays a flat list.
        """
        if not self.child_blocks:
            # list(...) rather than .copy() so a non-list bbox is tolerated.
            self.ori_bbox = list(self.bbox)
        x1, y1, x2, y2 = self.bbox
        x1_child, y1_child, x2_child, y2_child = child_block.bbox
        # Keep bbox a list, matching what __init__ stores.
        self.bbox = [
            min(x1, x1_child),
            min(y1, y1_child),
            max(x2, x2_child),
            max(y2, y2_child),
        ]
        # NOTE(review): width/height/direction are not recomputed here, so only
        # the start/end coordinates follow the enlarged box - confirm intended.
        self.update_direction_info()
        child_blocks = [child_block]
        if child_block.child_blocks:
            child_blocks.extend(child_block.get_child_blocks())
        self.child_blocks.extend(child_blocks)

    def get_child_blocks(self) -> list:
        """Detach and return all children, restoring the pre-attachment bbox.

        Safe to call on a block that never had children: the box is left
        untouched and an empty list is returned.
        """
        # ``ori_bbox`` only exists once a child has been appended.
        self.bbox = getattr(self, "ori_bbox", self.bbox)
        child_blocks = self.child_blocks.copy()
        self.child_blocks = []
        return child_blocks

    def get_centroid(self) -> tuple:
        """Return the ``(x, y)`` centre of the current bounding box."""
        x1, y1, x2, y2 = self.bbox
        centroid = ((x1 + x2) / 2, (y1 + y2) / 2)
        return centroid

    def get_bbox_direction(self, direction_ratio: float = 1.0) -> str:
        """Classify the block as horizontal or vertical from its width and height.

        Args:
            direction_ratio (float): Factor applied to the width before it is
                compared with the height. Default is 1.0.

        Returns:
            str: "horizontal" when ``width * direction_ratio >= height``,
            otherwise "vertical".
        """
        return (
            "horizontal" if self.width * direction_ratio >= self.height else "vertical"
        )
626
+
627
+
628
class LayoutParsingRegion:
    """A rectangular page region together with the layout blocks inside it.

    On construction the region indexes its blocks by position, groups the
    indices by label category (via ``BLOCK_LABEL_MAP``), estimates the
    dominant text direction and typical text-line size, and precomputes a few
    position metrics of its own bounding box.
    """

    def __init__(
        self,
        bbox,
        blocks: "List[LayoutParsingBlock] | None" = None,
        image_shape=None,
    ) -> None:
        """Create a region.

        Args:
            bbox: ``[x_min, y_min, x_max, y_max]`` of the region.
            blocks: Blocks belonging to the region; ``None`` means no blocks.
            image_shape: ``(height, width)`` of the page image. Required in
                practice: ``calculate_bbox_metrics`` raises ``TypeError`` when
                it is ``None``.
        """
        self.bbox = bbox
        self.block_map = {}
        self.direction = "horizontal"
        self.calculate_bbox_metrics(image_shape)
        self.doc_title_block_idxes = []
        self.paragraph_title_block_idxes = []
        self.vision_block_idxes = []
        self.unordered_block_idxes = []
        self.vision_title_block_idxes = []
        self.normal_text_block_idxes = []
        self.header_block_idxes = []
        self.footer_block_idxes = []
        self.text_line_width = 20
        self.text_line_height = 10
        # A fresh list per call avoids sharing one default list across instances.
        self.init_region_info_from_layout([] if blocks is None else blocks)
        self.init_direction_info()

    def init_region_info_from_layout(self, blocks: "List[LayoutParsingBlock]"):
        """Index ``blocks`` and derive the region's direction and line size.

        Every block gets its position in ``blocks`` written to ``block.index``
        and is filed under the first label category it matches; blocks matching
        none are treated as normal text and contribute to the direction vote
        and to the mean text-line width/height.
        """
        # Checked in this order; the first matching category wins.
        category_buckets = (
            ("header_labels", self.header_block_idxes),
            ("doc_title_labels", self.doc_title_block_idxes),
            ("paragraph_title_labels", self.paragraph_title_block_idxes),
            ("vision_labels", self.vision_block_idxes),
            ("vision_title_labels", self.vision_title_block_idxes),
            ("footer_labels", self.footer_block_idxes),
            ("unordered_labels", self.unordered_block_idxes),
        )
        horizontal_normal_text_block_num = 0
        text_line_height_list = []
        text_line_width_list = []
        for idx, block in enumerate(blocks):
            self.block_map[idx] = block
            block.index = idx
            for category, bucket in category_buckets:
                if block.label in BLOCK_LABEL_MAP[category]:
                    bucket.append(idx)
                    break
            else:
                self.normal_text_block_idxes.append(idx)
                text_line_height_list.append(block.text_line_height)
                text_line_width_list.append(block.text_line_width)
                if block.direction == "horizontal":
                    horizontal_normal_text_block_num += 1
        # Horizontal wins ties, and also when there are no normal text blocks.
        self.direction = (
            "horizontal"
            if horizontal_normal_text_block_num
            >= len(self.normal_text_block_idxes) * 0.5
            else "vertical"
        )
        self.text_line_width = (
            np.mean(text_line_width_list) if text_line_width_list else 20
        )
        self.text_line_height = (
            np.mean(text_line_height_list) if text_line_height_list else 10
        )

    def init_direction_info(self):
        """Set the bbox indices for the main/secondary axes and the region centre."""
        if self.direction == "horizontal":
            self.direction_start_index = 0
            self.direction_end_index = 2
            self.secondary_direction_start_index = 1
            self.secondary_direction_end_index = 3
            self.secondary_direction = "vertical"
        else:
            self.direction_start_index = 1
            self.direction_end_index = 3
            self.secondary_direction_start_index = 0
            self.secondary_direction_end_index = 2
            self.secondary_direction = "horizontal"

        self.direction_center_coordinate = (
            self.bbox[self.direction_start_index] + self.bbox[self.direction_end_index]
        ) / 2
        self.secondary_direction_center_coordinate = (
            self.bbox[self.secondary_direction_start_index]
            + self.bbox[self.secondary_direction_end_index]
        ) / 2

    def calculate_bbox_metrics(self, image_shape):
        """Compute position metrics of the region's bounding box.

        Sets ``euclidean_distance`` (top-left corner to the origin),
        ``center_euclidean_distance`` (centre to the origin), ``angle_rad``
        (``atan2`` of the centre) and ``weighted_distance`` (bottom edge +
        width + 1.5 x the left edge snapped down to a tenth-of-page-width
        column). How these are consumed is not visible here; presumably as
        ordering keys - confirm against the sorter.

        Args:
            image_shape: ``(height, width)`` of the page image.

        Raises:
            TypeError: If ``image_shape`` is ``None``.
        """
        if image_shape is None:
            raise TypeError("image_shape must be a (height, width) pair, got None")
        x1, y1, x2, y2 = self.bbox
        _, image_width = image_shape
        width = x2 - x1
        x_center, y_center = (x1 + x2) / 2, (y1 + y2) / 2
        self.euclidean_distance = math.sqrt(((x1) ** 2 + (y1) ** 2))
        self.center_euclidean_distance = math.sqrt(((x_center) ** 2 + (y_center) ** 2))
        self.angle_rad = math.atan2(y_center, x_center)
        # Clamp to 1 so images narrower than 10 px do not divide by zero;
        # for any width >= 10 this equals ``image_width // 10``.
        column_width = max(image_width // 10, 1)
        self.weighted_distance = (
            y2 + width + (x1 // column_width) * column_width * 1.5
        )

    def sort_normal_blocks(self, blocks):
        """Sort ``blocks`` in place by a coarse grid position.

        Horizontal regions order by row bucket, then column bucket, then
        squared distance of the top-left corner from the origin. Vertical
        regions order by negated column bucket, then row bucket, then the
        negated ``x_max**2 + y_min**2``.
        """
        if self.direction == "horizontal":
            blocks.sort(
                key=lambda x: (
                    x.bbox[1] // self.text_line_height,
                    x.bbox[0] // self.text_line_width,
                    x.bbox[1] ** 2 + x.bbox[0] ** 2,
                ),
            )
        else:
            blocks.sort(
                key=lambda x: (
                    # Unary minus binds tighter than //: this is (-x_min) // w.
                    -x.bbox[0] // self.text_line_width,
                    x.bbox[1] // self.text_line_height,
                    -(x.bbox[2] ** 2 + x.bbox[1] ** 2),
                ),
            )

    def sort(self):
        """Delegate ordering of this region to ``xycut_enhanced``."""
        # Imported lazily inside the method rather than at module import time.
        from .xycut_enhanced import xycut_enhanced

        return xycut_enhanced(self)