paddlex-3.0.0rc0-py3-none-any.whl → paddlex-3.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (824)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +17 -34
  3. paddlex/__main__.py +1 -1
  4. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  5. paddlex/configs/modules/doc_vlm/PP-DocBee-2B.yaml +14 -0
  6. paddlex/configs/modules/doc_vlm/PP-DocBee-7B.yaml +14 -0
  7. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  8. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  9. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  10. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  11. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  12. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  13. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  14. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  15. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  16. paddlex/configs/modules/open_vocabulary_detection/YOLO-Worldv2-L.yaml +13 -0
  17. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  18. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  19. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  20. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  21. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  22. paddlex/configs/pipelines/OCR.yaml +7 -6
  23. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  24. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  25. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  26. paddlex/configs/pipelines/anomaly_detection.yaml +1 -1
  27. paddlex/configs/pipelines/doc_understanding.yaml +9 -0
  28. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  29. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  30. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  31. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  32. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  33. paddlex/configs/pipelines/ts_anomaly_detection.yaml +1 -1
  34. paddlex/configs/pipelines/ts_classification.yaml +1 -1
  35. paddlex/configs/pipelines/ts_forecast.yaml +1 -1
  36. paddlex/constants.py +17 -0
  37. paddlex/engine.py +7 -5
  38. paddlex/hpip_links.html +23 -11
  39. paddlex/inference/__init__.py +3 -3
  40. paddlex/inference/common/__init__.py +1 -1
  41. paddlex/inference/common/batch_sampler/__init__.py +5 -4
  42. paddlex/inference/common/batch_sampler/audio_batch_sampler.py +5 -6
  43. paddlex/inference/common/batch_sampler/base_batch_sampler.py +20 -16
  44. paddlex/inference/common/batch_sampler/det_3d_batch_sampler.py +4 -7
  45. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +87 -0
  46. paddlex/inference/common/batch_sampler/image_batch_sampler.py +45 -60
  47. paddlex/inference/common/batch_sampler/ts_batch_sampler.py +9 -10
  48. paddlex/inference/common/batch_sampler/video_batch_sampler.py +2 -22
  49. paddlex/inference/common/reader/__init__.py +4 -4
  50. paddlex/inference/common/reader/audio_reader.py +3 -3
  51. paddlex/inference/common/reader/det_3d_reader.py +7 -5
  52. paddlex/inference/common/reader/image_reader.py +16 -12
  53. paddlex/inference/common/reader/ts_reader.py +3 -2
  54. paddlex/inference/common/reader/video_reader.py +3 -3
  55. paddlex/inference/common/result/__init__.py +7 -7
  56. paddlex/inference/common/result/base_cv_result.py +12 -2
  57. paddlex/inference/common/result/base_result.py +7 -5
  58. paddlex/inference/common/result/base_ts_result.py +1 -2
  59. paddlex/inference/common/result/base_video_result.py +2 -2
  60. paddlex/inference/common/result/mixin.py +31 -25
  61. paddlex/inference/models/__init__.py +41 -85
  62. paddlex/inference/models/anomaly_detection/__init__.py +1 -1
  63. paddlex/inference/models/anomaly_detection/predictor.py +9 -19
  64. paddlex/inference/models/anomaly_detection/processors.py +9 -2
  65. paddlex/inference/models/anomaly_detection/result.py +3 -2
  66. paddlex/inference/models/base/__init__.py +2 -2
  67. paddlex/inference/models/base/predictor/__init__.py +1 -2
  68. paddlex/inference/models/base/predictor/base_predictor.py +278 -39
  69. paddlex/inference/models/common/__init__.py +6 -15
  70. paddlex/inference/models/common/static_infer.py +724 -251
  71. paddlex/inference/models/common/tokenizer/__init__.py +7 -3
  72. paddlex/inference/models/common/tokenizer/bert_tokenizer.py +1 -1
  73. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +609 -0
  74. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +9 -7
  75. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  76. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +438 -0
  77. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  78. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +85 -77
  79. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +339 -123
  80. paddlex/inference/models/common/tokenizer/utils.py +1 -1
  81. paddlex/inference/models/common/tokenizer/vocab.py +8 -8
  82. paddlex/inference/models/common/ts/__init__.py +1 -1
  83. paddlex/inference/models/common/ts/funcs.py +13 -6
  84. paddlex/inference/models/common/ts/processors.py +14 -5
  85. paddlex/inference/models/common/vision/__init__.py +3 -3
  86. paddlex/inference/models/common/vision/funcs.py +17 -12
  87. paddlex/inference/models/common/vision/processors.py +61 -46
  88. paddlex/inference/models/common/vlm/__init__.py +13 -0
  89. paddlex/inference/models/common/vlm/activations.py +189 -0
  90. paddlex/inference/models/common/vlm/bert_padding.py +127 -0
  91. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  92. paddlex/inference/models/common/vlm/distributed.py +229 -0
  93. paddlex/inference/models/common/vlm/flash_attn_utils.py +119 -0
  94. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  95. paddlex/inference/models/common/vlm/generation/__init__.py +34 -0
  96. paddlex/inference/models/common/vlm/generation/configuration_utils.py +533 -0
  97. paddlex/inference/models/common/vlm/generation/logits_process.py +730 -0
  98. paddlex/inference/models/common/vlm/generation/stopping_criteria.py +106 -0
  99. paddlex/inference/models/common/vlm/generation/utils.py +2162 -0
  100. paddlex/inference/models/common/vlm/transformers/__init__.py +16 -0
  101. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +1037 -0
  102. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +408 -0
  103. paddlex/inference/models/common/vlm/transformers/model_outputs.py +1612 -0
  104. paddlex/inference/models/common/vlm/transformers/model_utils.py +2014 -0
  105. paddlex/inference/models/common/vlm/transformers/utils.py +178 -0
  106. paddlex/inference/models/common/vlm/utils.py +109 -0
  107. paddlex/inference/models/doc_vlm/__init__.py +15 -0
  108. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  109. paddlex/inference/models/doc_vlm/modeling/__init__.py +17 -0
  110. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  111. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  112. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +2495 -0
  113. paddlex/inference/models/doc_vlm/predictor.py +253 -0
  114. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  115. paddlex/inference/models/doc_vlm/processors/__init__.py +17 -0
  116. paddlex/inference/models/doc_vlm/processors/common.py +561 -0
  117. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  118. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +543 -0
  119. paddlex/inference/models/doc_vlm/result.py +21 -0
  120. paddlex/inference/models/face_feature/__init__.py +1 -1
  121. paddlex/inference/models/face_feature/predictor.py +2 -1
  122. paddlex/inference/models/formula_recognition/__init__.py +1 -1
  123. paddlex/inference/models/formula_recognition/predictor.py +18 -28
  124. paddlex/inference/models/formula_recognition/processors.py +126 -97
  125. paddlex/inference/models/formula_recognition/result.py +43 -35
  126. paddlex/inference/models/image_classification/__init__.py +1 -1
  127. paddlex/inference/models/image_classification/predictor.py +9 -19
  128. paddlex/inference/models/image_classification/processors.py +4 -2
  129. paddlex/inference/models/image_classification/result.py +4 -3
  130. paddlex/inference/models/image_feature/__init__.py +1 -1
  131. paddlex/inference/models/image_feature/predictor.py +9 -19
  132. paddlex/inference/models/image_feature/processors.py +7 -5
  133. paddlex/inference/models/image_feature/result.py +2 -3
  134. paddlex/inference/models/image_multilabel_classification/__init__.py +1 -1
  135. paddlex/inference/models/image_multilabel_classification/predictor.py +7 -6
  136. paddlex/inference/models/image_multilabel_classification/processors.py +6 -2
  137. paddlex/inference/models/image_multilabel_classification/result.py +4 -3
  138. paddlex/inference/models/image_unwarping/__init__.py +1 -1
  139. paddlex/inference/models/image_unwarping/predictor.py +8 -16
  140. paddlex/inference/models/image_unwarping/processors.py +6 -2
  141. paddlex/inference/models/image_unwarping/result.py +4 -2
  142. paddlex/inference/models/instance_segmentation/__init__.py +1 -1
  143. paddlex/inference/models/instance_segmentation/predictor.py +7 -15
  144. paddlex/inference/models/instance_segmentation/processors.py +4 -7
  145. paddlex/inference/models/instance_segmentation/result.py +11 -10
  146. paddlex/inference/models/keypoint_detection/__init__.py +1 -1
  147. paddlex/inference/models/keypoint_detection/predictor.py +5 -3
  148. paddlex/inference/models/keypoint_detection/processors.py +11 -3
  149. paddlex/inference/models/keypoint_detection/result.py +9 -4
  150. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1
  151. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/predictor.py +15 -26
  152. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/processors.py +26 -14
  153. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/result.py +15 -12
  154. paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/visualizer_3d.py +77 -39
  155. paddlex/inference/models/multilingual_speech_recognition/__init__.py +1 -1
  156. paddlex/inference/models/multilingual_speech_recognition/predictor.py +11 -15
  157. paddlex/inference/models/multilingual_speech_recognition/processors.py +45 -53
  158. paddlex/inference/models/multilingual_speech_recognition/result.py +1 -1
  159. paddlex/inference/models/object_detection/__init__.py +1 -1
  160. paddlex/inference/models/object_detection/predictor.py +8 -12
  161. paddlex/inference/models/object_detection/processors.py +63 -33
  162. paddlex/inference/models/object_detection/result.py +5 -4
  163. paddlex/inference/models/object_detection/utils.py +3 -1
  164. paddlex/inference/models/open_vocabulary_detection/__init__.py +1 -1
  165. paddlex/inference/models/open_vocabulary_detection/predictor.py +31 -14
  166. paddlex/inference/models/open_vocabulary_detection/processors/__init__.py +3 -2
  167. paddlex/inference/models/open_vocabulary_detection/processors/common.py +114 -0
  168. paddlex/inference/models/open_vocabulary_detection/processors/groundingdino_processors.py +19 -8
  169. paddlex/inference/models/open_vocabulary_detection/processors/yoloworld_processors.py +209 -0
  170. paddlex/inference/models/open_vocabulary_segmentation/__init__.py +1 -1
  171. paddlex/inference/models/open_vocabulary_segmentation/predictor.py +6 -13
  172. paddlex/inference/models/open_vocabulary_segmentation/processors/__init__.py +1 -1
  173. paddlex/inference/models/open_vocabulary_segmentation/processors/sam_processer.py +12 -12
  174. paddlex/inference/models/open_vocabulary_segmentation/results/__init__.py +1 -1
  175. paddlex/inference/models/open_vocabulary_segmentation/results/sam_result.py +11 -9
  176. paddlex/inference/models/semantic_segmentation/__init__.py +1 -1
  177. paddlex/inference/models/semantic_segmentation/predictor.py +9 -18
  178. paddlex/inference/models/semantic_segmentation/processors.py +11 -8
  179. paddlex/inference/models/semantic_segmentation/result.py +4 -3
  180. paddlex/inference/models/table_structure_recognition/__init__.py +1 -1
  181. paddlex/inference/models/table_structure_recognition/predictor.py +8 -18
  182. paddlex/inference/models/table_structure_recognition/processors.py +23 -29
  183. paddlex/inference/models/table_structure_recognition/result.py +8 -15
  184. paddlex/inference/models/text_detection/__init__.py +1 -1
  185. paddlex/inference/models/text_detection/predictor.py +24 -24
  186. paddlex/inference/models/text_detection/processors.py +116 -44
  187. paddlex/inference/models/text_detection/result.py +8 -13
  188. paddlex/inference/models/text_recognition/__init__.py +1 -1
  189. paddlex/inference/models/text_recognition/predictor.py +11 -19
  190. paddlex/inference/models/text_recognition/processors.py +27 -13
  191. paddlex/inference/models/text_recognition/result.py +3 -2
  192. paddlex/inference/models/ts_anomaly_detection/__init__.py +1 -1
  193. paddlex/inference/models/ts_anomaly_detection/predictor.py +12 -17
  194. paddlex/inference/models/ts_anomaly_detection/processors.py +6 -2
  195. paddlex/inference/models/ts_anomaly_detection/result.py +21 -10
  196. paddlex/inference/models/ts_classification/__init__.py +1 -1
  197. paddlex/inference/models/ts_classification/predictor.py +14 -27
  198. paddlex/inference/models/ts_classification/processors.py +7 -2
  199. paddlex/inference/models/ts_classification/result.py +21 -12
  200. paddlex/inference/models/ts_forecasting/__init__.py +1 -1
  201. paddlex/inference/models/ts_forecasting/predictor.py +13 -18
  202. paddlex/inference/models/ts_forecasting/processors.py +12 -3
  203. paddlex/inference/models/ts_forecasting/result.py +24 -11
  204. paddlex/inference/models/video_classification/__init__.py +1 -1
  205. paddlex/inference/models/video_classification/predictor.py +9 -15
  206. paddlex/inference/models/video_classification/processors.py +24 -24
  207. paddlex/inference/models/video_classification/result.py +7 -3
  208. paddlex/inference/models/video_detection/__init__.py +1 -1
  209. paddlex/inference/models/video_detection/predictor.py +8 -15
  210. paddlex/inference/models/video_detection/processors.py +24 -11
  211. paddlex/inference/models/video_detection/result.py +10 -5
  212. paddlex/inference/pipelines/__init__.py +48 -37
  213. paddlex/inference/pipelines/_parallel.py +172 -0
  214. paddlex/inference/pipelines/anomaly_detection/__init__.py +1 -1
  215. paddlex/inference/pipelines/anomaly_detection/pipeline.py +29 -9
  216. paddlex/inference/pipelines/attribute_recognition/__init__.py +1 -1
  217. paddlex/inference/pipelines/attribute_recognition/pipeline.py +24 -9
  218. paddlex/inference/pipelines/attribute_recognition/result.py +10 -8
  219. paddlex/inference/pipelines/base.py +43 -13
  220. paddlex/inference/pipelines/components/__init__.py +14 -8
  221. paddlex/inference/pipelines/components/chat_server/__init__.py +1 -1
  222. paddlex/inference/pipelines/components/chat_server/base.py +2 -2
  223. paddlex/inference/pipelines/components/chat_server/openai_bot_chat.py +8 -8
  224. paddlex/inference/pipelines/components/common/__init__.py +5 -4
  225. paddlex/inference/pipelines/components/common/base_operator.py +2 -1
  226. paddlex/inference/pipelines/components/common/base_result.py +3 -2
  227. paddlex/inference/pipelines/components/common/convert_points_and_boxes.py +1 -2
  228. paddlex/inference/pipelines/components/common/crop_image_regions.py +11 -5
  229. paddlex/inference/pipelines/components/common/seal_det_warp.py +44 -13
  230. paddlex/inference/pipelines/components/common/sort_boxes.py +4 -2
  231. paddlex/inference/pipelines/components/common/warp_image.py +50 -0
  232. paddlex/inference/pipelines/components/faisser.py +10 -5
  233. paddlex/inference/pipelines/components/prompt_engineering/__init__.py +2 -2
  234. paddlex/inference/pipelines/components/prompt_engineering/base.py +2 -2
  235. paddlex/inference/pipelines/components/prompt_engineering/generate_ensemble_prompt.py +2 -1
  236. paddlex/inference/pipelines/components/prompt_engineering/generate_kie_prompt.py +2 -2
  237. paddlex/inference/pipelines/components/retriever/__init__.py +2 -2
  238. paddlex/inference/pipelines/components/retriever/base.py +18 -16
  239. paddlex/inference/pipelines/components/retriever/openai_bot_retriever.py +2 -2
  240. paddlex/inference/pipelines/components/retriever/qianfan_bot_retriever.py +87 -84
  241. paddlex/inference/pipelines/components/utils/__init__.py +1 -1
  242. paddlex/inference/pipelines/components/utils/mixin.py +7 -7
  243. paddlex/inference/pipelines/doc_preprocessor/__init__.py +1 -1
  244. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +70 -51
  245. paddlex/inference/pipelines/doc_preprocessor/result.py +5 -10
  246. paddlex/inference/pipelines/doc_understanding/__init__.py +15 -0
  247. paddlex/inference/pipelines/doc_understanding/pipeline.py +71 -0
  248. paddlex/inference/pipelines/face_recognition/__init__.py +1 -1
  249. paddlex/inference/pipelines/face_recognition/pipeline.py +3 -1
  250. paddlex/inference/pipelines/face_recognition/result.py +3 -2
  251. paddlex/inference/pipelines/formula_recognition/__init__.py +1 -1
  252. paddlex/inference/pipelines/formula_recognition/pipeline.py +137 -93
  253. paddlex/inference/pipelines/formula_recognition/result.py +20 -29
  254. paddlex/inference/pipelines/image_classification/__init__.py +1 -1
  255. paddlex/inference/pipelines/image_classification/pipeline.py +30 -11
  256. paddlex/inference/pipelines/image_multilabel_classification/__init__.py +1 -1
  257. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +31 -12
  258. paddlex/inference/pipelines/instance_segmentation/__init__.py +1 -1
  259. paddlex/inference/pipelines/instance_segmentation/pipeline.py +30 -9
  260. paddlex/inference/pipelines/keypoint_detection/__init__.py +1 -1
  261. paddlex/inference/pipelines/keypoint_detection/pipeline.py +30 -9
  262. paddlex/inference/pipelines/layout_parsing/__init__.py +1 -1
  263. paddlex/inference/pipelines/layout_parsing/pipeline.py +54 -56
  264. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +904 -261
  265. paddlex/inference/pipelines/layout_parsing/result.py +9 -21
  266. paddlex/inference/pipelines/layout_parsing/result_v2.py +525 -250
  267. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  268. paddlex/inference/pipelines/layout_parsing/utils.py +570 -2004
  269. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  270. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  271. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  272. paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/__init__.py +1 -1
  273. paddlex/inference/pipelines/{3d_bev_detection → m_3d_bev_detection}/pipeline.py +17 -10
  274. paddlex/inference/pipelines/multilingual_speech_recognition/__init__.py +1 -1
  275. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +17 -6
  276. paddlex/inference/pipelines/object_detection/__init__.py +1 -1
  277. paddlex/inference/pipelines/object_detection/pipeline.py +29 -9
  278. paddlex/inference/pipelines/ocr/__init__.py +1 -1
  279. paddlex/inference/pipelines/ocr/pipeline.py +151 -77
  280. paddlex/inference/pipelines/ocr/result.py +31 -24
  281. paddlex/inference/pipelines/open_vocabulary_detection/__init__.py +1 -1
  282. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +17 -6
  283. paddlex/inference/pipelines/open_vocabulary_segmentation/__init__.py +1 -1
  284. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +17 -6
  285. paddlex/inference/pipelines/pp_chatocr/__init__.py +1 -1
  286. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +14 -5
  287. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +22 -14
  288. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +34 -16
  289. paddlex/inference/pipelines/pp_shitu_v2/__init__.py +1 -1
  290. paddlex/inference/pipelines/pp_shitu_v2/pipeline.py +12 -8
  291. paddlex/inference/pipelines/pp_shitu_v2/result.py +4 -4
  292. paddlex/inference/pipelines/rotated_object_detection/__init__.py +1 -1
  293. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +30 -9
  294. paddlex/inference/pipelines/seal_recognition/__init__.py +1 -1
  295. paddlex/inference/pipelines/seal_recognition/pipeline.py +127 -63
  296. paddlex/inference/pipelines/seal_recognition/result.py +4 -2
  297. paddlex/inference/pipelines/semantic_segmentation/__init__.py +1 -1
  298. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +30 -9
  299. paddlex/inference/pipelines/small_object_detection/__init__.py +1 -1
  300. paddlex/inference/pipelines/small_object_detection/pipeline.py +30 -9
  301. paddlex/inference/pipelines/table_recognition/__init__.py +1 -1
  302. paddlex/inference/pipelines/table_recognition/pipeline.py +61 -37
  303. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +668 -65
  304. paddlex/inference/pipelines/table_recognition/result.py +12 -10
  305. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing.py +12 -8
  306. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +55 -37
  307. paddlex/inference/pipelines/table_recognition/utils.py +1 -1
  308. paddlex/inference/pipelines/ts_anomaly_detection/__init__.py +1 -1
  309. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +16 -6
  310. paddlex/inference/pipelines/ts_classification/__init__.py +1 -1
  311. paddlex/inference/pipelines/ts_classification/pipeline.py +16 -6
  312. paddlex/inference/pipelines/ts_forecasting/__init__.py +1 -1
  313. paddlex/inference/pipelines/ts_forecasting/pipeline.py +16 -6
  314. paddlex/inference/pipelines/video_classification/__init__.py +1 -1
  315. paddlex/inference/pipelines/video_classification/pipeline.py +17 -6
  316. paddlex/inference/pipelines/video_detection/__init__.py +1 -1
  317. paddlex/inference/pipelines/video_detection/pipeline.py +20 -7
  318. paddlex/inference/serving/__init__.py +5 -1
  319. paddlex/inference/serving/basic_serving/__init__.py +1 -1
  320. paddlex/inference/serving/basic_serving/_app.py +31 -19
  321. paddlex/inference/serving/basic_serving/_pipeline_apps/__init__.py +7 -4
  322. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/__init__.py +1 -1
  323. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +12 -4
  324. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/image_recognition.py +1 -1
  325. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/ocr.py +7 -2
  326. paddlex/inference/serving/basic_serving/_pipeline_apps/anomaly_detection.py +10 -7
  327. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_preprocessor.py +10 -7
  328. paddlex/inference/serving/basic_serving/_pipeline_apps/doc_understanding.py +153 -0
  329. paddlex/inference/serving/basic_serving/_pipeline_apps/face_recognition.py +16 -13
  330. paddlex/inference/serving/basic_serving/_pipeline_apps/formula_recognition.py +10 -7
  331. paddlex/inference/serving/basic_serving/_pipeline_apps/human_keypoint_detection.py +10 -7
  332. paddlex/inference/serving/basic_serving/_pipeline_apps/image_classification.py +10 -7
  333. paddlex/inference/serving/basic_serving/_pipeline_apps/image_multilabel_classification.py +10 -7
  334. paddlex/inference/serving/basic_serving/_pipeline_apps/instance_segmentation.py +13 -7
  335. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +10 -8
  336. paddlex/inference/serving/basic_serving/_pipeline_apps/m_3d_bev_detection.py +10 -7
  337. paddlex/inference/serving/basic_serving/_pipeline_apps/multilingual_speech_recognition.py +10 -7
  338. paddlex/inference/serving/basic_serving/_pipeline_apps/object_detection.py +10 -7
  339. paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py +10 -7
  340. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_detection.py +10 -7
  341. paddlex/inference/serving/basic_serving/_pipeline_apps/open_vocabulary_segmentation.py +13 -7
  342. paddlex/inference/serving/basic_serving/_pipeline_apps/pedestrian_attribute_recognition.py +10 -7
  343. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +14 -12
  344. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +17 -14
  345. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_shituv2.py +16 -13
  346. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +16 -9
  347. paddlex/inference/serving/basic_serving/_pipeline_apps/rotated_object_detection.py +10 -7
  348. paddlex/inference/serving/basic_serving/_pipeline_apps/seal_recognition.py +10 -7
  349. paddlex/inference/serving/basic_serving/_pipeline_apps/semantic_segmentation.py +10 -7
  350. paddlex/inference/serving/basic_serving/_pipeline_apps/small_object_detection.py +10 -7
  351. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +11 -12
  352. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +14 -12
  353. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_anomaly_detection.py +10 -7
  354. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_classification.py +10 -7
  355. paddlex/inference/serving/basic_serving/_pipeline_apps/ts_forecast.py +10 -7
  356. paddlex/inference/serving/basic_serving/_pipeline_apps/vehicle_attribute_recognition.py +10 -7
  357. paddlex/inference/serving/basic_serving/_pipeline_apps/video_classification.py +10 -7
  358. paddlex/inference/serving/basic_serving/_pipeline_apps/video_detection.py +10 -7
  359. paddlex/inference/serving/basic_serving/_server.py +9 -4
  360. paddlex/inference/serving/infra/__init__.py +1 -1
  361. paddlex/inference/serving/infra/config.py +1 -1
  362. paddlex/inference/serving/infra/models.py +13 -6
  363. paddlex/inference/serving/infra/storage.py +9 -4
  364. paddlex/inference/serving/infra/utils.py +54 -28
  365. paddlex/inference/serving/schemas/__init__.py +1 -1
  366. paddlex/inference/serving/schemas/anomaly_detection.py +1 -1
  367. paddlex/inference/serving/schemas/doc_preprocessor.py +1 -1
  368. paddlex/inference/serving/schemas/doc_understanding.py +78 -0
  369. paddlex/inference/serving/schemas/face_recognition.py +1 -1
  370. paddlex/inference/serving/schemas/formula_recognition.py +2 -2
  371. paddlex/inference/serving/schemas/human_keypoint_detection.py +1 -1
  372. paddlex/inference/serving/schemas/image_classification.py +1 -1
  373. paddlex/inference/serving/schemas/image_multilabel_classification.py +1 -1
  374. paddlex/inference/serving/schemas/instance_segmentation.py +1 -1
  375. paddlex/inference/serving/schemas/layout_parsing.py +2 -3
  376. paddlex/inference/serving/schemas/m_3d_bev_detection.py +1 -1
  377. paddlex/inference/serving/schemas/multilingual_speech_recognition.py +1 -1
  378. paddlex/inference/serving/schemas/object_detection.py +1 -1
  379. paddlex/inference/serving/schemas/ocr.py +1 -1
  380. paddlex/inference/serving/schemas/open_vocabulary_detection.py +1 -1
  381. paddlex/inference/serving/schemas/open_vocabulary_segmentation.py +1 -1
  382. paddlex/inference/serving/schemas/pedestrian_attribute_recognition.py +1 -1
  383. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +2 -3
  384. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +3 -3
  385. paddlex/inference/serving/schemas/pp_shituv2.py +1 -1
  386. paddlex/inference/serving/schemas/pp_structurev3.py +11 -7
  387. paddlex/inference/serving/schemas/rotated_object_detection.py +1 -1
  388. paddlex/inference/serving/schemas/seal_recognition.py +2 -2
  389. paddlex/inference/serving/schemas/semantic_segmentation.py +1 -1
  390. paddlex/inference/serving/schemas/shared/__init__.py +1 -1
  391. paddlex/inference/serving/schemas/shared/classification.py +1 -1
  392. paddlex/inference/serving/schemas/shared/image_segmentation.py +1 -1
  393. paddlex/inference/serving/schemas/shared/object_detection.py +1 -1
  394. paddlex/inference/serving/schemas/shared/ocr.py +1 -1
  395. paddlex/inference/serving/schemas/small_object_detection.py +1 -1
  396. paddlex/inference/serving/schemas/table_recognition.py +3 -7
  397. paddlex/inference/serving/schemas/table_recognition_v2.py +6 -7
  398. paddlex/inference/serving/schemas/ts_anomaly_detection.py +1 -1
  399. paddlex/inference/serving/schemas/ts_classification.py +1 -1
  400. paddlex/inference/serving/schemas/ts_forecast.py +1 -1
  401. paddlex/inference/serving/schemas/vehicle_attribute_recognition.py +1 -1
  402. paddlex/inference/serving/schemas/video_classification.py +1 -1
  403. paddlex/inference/serving/schemas/video_detection.py +1 -1
  404. paddlex/inference/utils/__init__.py +1 -1
  405. paddlex/inference/utils/benchmark.py +332 -179
  406. paddlex/inference/utils/color_map.py +1 -1
  407. paddlex/inference/utils/get_pipeline_path.py +1 -1
  408. paddlex/inference/utils/hpi.py +258 -0
  409. paddlex/inference/utils/hpi_model_info_collection.json +2331 -0
  410. paddlex/inference/utils/io/__init__.py +11 -11
  411. paddlex/inference/utils/io/readers.py +31 -27
  412. paddlex/inference/utils/io/style.py +21 -14
  413. paddlex/inference/utils/io/tablepyxl.py +13 -5
  414. paddlex/inference/utils/io/writers.py +9 -10
  415. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  416. paddlex/inference/utils/model_paths.py +48 -0
  417. paddlex/inference/utils/{new_ir_blacklist.py → new_ir_blocklist.py} +1 -2
  418. paddlex/inference/utils/official_models.py +278 -262
  419. paddlex/inference/utils/pp_option.py +184 -92
  420. paddlex/inference/utils/trt_blocklist.py +43 -0
  421. paddlex/inference/utils/trt_config.py +420 -0
  422. paddlex/model.py +30 -12
  423. paddlex/modules/__init__.py +57 -80
  424. paddlex/modules/anomaly_detection/__init__.py +2 -2
  425. paddlex/modules/anomaly_detection/dataset_checker/__init__.py +2 -3
  426. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  427. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  428. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/check_dataset.py +8 -4
  429. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +7 -4
  430. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/split_dataset.py +2 -2
  431. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  432. paddlex/modules/anomaly_detection/dataset_checker/dataset_src/utils/visualizer.py +7 -2
  433. paddlex/modules/anomaly_detection/evaluator.py +3 -3
  434. paddlex/modules/anomaly_detection/exportor.py +1 -1
  435. paddlex/modules/anomaly_detection/model_list.py +1 -1
  436. paddlex/modules/anomaly_detection/trainer.py +3 -4
  437. paddlex/modules/base/__init__.py +5 -5
  438. paddlex/modules/base/build_model.py +1 -2
  439. paddlex/modules/base/dataset_checker/__init__.py +2 -2
  440. paddlex/modules/base/dataset_checker/dataset_checker.py +4 -4
  441. paddlex/modules/base/dataset_checker/utils.py +1 -3
  442. paddlex/modules/base/evaluator.py +13 -13
  443. paddlex/modules/base/exportor.py +12 -13
  444. paddlex/modules/base/trainer.py +21 -11
  445. paddlex/modules/base/utils/__init__.py +13 -0
  446. paddlex/modules/base/utils/cinn_setting.py +89 -0
  447. paddlex/modules/base/utils/coco_eval.py +94 -0
  448. paddlex/modules/base/utils/topk_eval.py +118 -0
  449. paddlex/modules/doc_vlm/__init__.py +18 -0
  450. paddlex/modules/doc_vlm/dataset_checker.py +29 -0
  451. paddlex/modules/doc_vlm/evaluator.py +29 -0
  452. paddlex/modules/doc_vlm/exportor.py +29 -0
  453. paddlex/modules/doc_vlm/model_list.py +16 -0
  454. paddlex/modules/doc_vlm/trainer.py +41 -0
  455. paddlex/modules/face_recognition/__init__.py +2 -2
  456. paddlex/modules/face_recognition/dataset_checker/__init__.py +2 -2
  457. paddlex/modules/face_recognition/dataset_checker/dataset_src/__init__.py +1 -1
  458. paddlex/modules/face_recognition/dataset_checker/dataset_src/check_dataset.py +3 -5
  459. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  460. paddlex/modules/face_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  461. paddlex/modules/face_recognition/evaluator.py +3 -3
  462. paddlex/modules/face_recognition/exportor.py +1 -1
  463. paddlex/modules/face_recognition/model_list.py +1 -1
  464. paddlex/modules/face_recognition/trainer.py +1 -1
  465. paddlex/modules/formula_recognition/__init__.py +2 -2
  466. paddlex/modules/formula_recognition/dataset_checker/__init__.py +3 -3
  467. paddlex/modules/formula_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  468. paddlex/modules/formula_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
  469. paddlex/modules/formula_recognition/dataset_checker/dataset_src/check_dataset.py +2 -6
  470. paddlex/modules/formula_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
  471. paddlex/modules/formula_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
  472. paddlex/modules/formula_recognition/evaluator.py +6 -3
  473. paddlex/modules/formula_recognition/exportor.py +1 -1
  474. paddlex/modules/formula_recognition/model_list.py +4 -1
  475. paddlex/modules/formula_recognition/trainer.py +5 -3
  476. paddlex/modules/general_recognition/__init__.py +2 -2
  477. paddlex/modules/general_recognition/dataset_checker/__init__.py +2 -2
  478. paddlex/modules/general_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  479. paddlex/modules/general_recognition/dataset_checker/dataset_src/analyse_dataset.py +7 -9
  480. paddlex/modules/general_recognition/dataset_checker/dataset_src/check_dataset.py +4 -5
  481. paddlex/modules/general_recognition/dataset_checker/dataset_src/convert_dataset.py +6 -5
  482. paddlex/modules/general_recognition/dataset_checker/dataset_src/split_dataset.py +1 -1
  483. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/__init__.py +1 -1
  484. paddlex/modules/general_recognition/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  485. paddlex/modules/general_recognition/evaluator.py +2 -2
  486. paddlex/modules/general_recognition/exportor.py +1 -1
  487. paddlex/modules/general_recognition/model_list.py +1 -1
  488. paddlex/modules/general_recognition/trainer.py +1 -1
  489. paddlex/modules/image_classification/__init__.py +2 -2
  490. paddlex/modules/image_classification/dataset_checker/__init__.py +2 -2
  491. paddlex/modules/image_classification/dataset_checker/dataset_src/__init__.py +2 -2
  492. paddlex/modules/image_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  493. paddlex/modules/image_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  494. paddlex/modules/image_classification/dataset_checker/dataset_src/convert_dataset.py +4 -4
  495. paddlex/modules/image_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  496. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  497. paddlex/modules/image_classification/dataset_checker/dataset_src/utils/visualizer.py +2 -5
  498. paddlex/modules/image_classification/evaluator.py +3 -3
  499. paddlex/modules/image_classification/exportor.py +1 -1
  500. paddlex/modules/image_classification/model_list.py +2 -1
  501. paddlex/modules/image_classification/trainer.py +3 -3
  502. paddlex/modules/image_unwarping/__init__.py +1 -1
  503. paddlex/modules/image_unwarping/model_list.py +1 -1
  504. paddlex/modules/instance_segmentation/__init__.py +2 -2
  505. paddlex/modules/instance_segmentation/dataset_checker/__init__.py +2 -3
  506. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  507. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/analyse_dataset.py +9 -5
  508. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/check_dataset.py +8 -5
  509. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/convert_dataset.py +8 -8
  510. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/split_dataset.py +7 -4
  511. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  512. paddlex/modules/instance_segmentation/dataset_checker/dataset_src/utils/visualizer.py +10 -8
  513. paddlex/modules/instance_segmentation/evaluator.py +2 -2
  514. paddlex/modules/instance_segmentation/exportor.py +1 -1
  515. paddlex/modules/instance_segmentation/model_list.py +1 -1
  516. paddlex/modules/instance_segmentation/trainer.py +1 -1
  517. paddlex/modules/keypoint_detection/__init__.py +2 -2
  518. paddlex/modules/keypoint_detection/dataset_checker/__init__.py +2 -2
  519. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/__init__.py +1 -1
  520. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
  521. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  522. paddlex/modules/keypoint_detection/dataset_checker/dataset_src/utils/visualizer.py +8 -3
  523. paddlex/modules/keypoint_detection/evaluator.py +2 -2
  524. paddlex/modules/keypoint_detection/exportor.py +1 -1
  525. paddlex/modules/keypoint_detection/model_list.py +1 -1
  526. paddlex/modules/keypoint_detection/trainer.py +2 -2
  527. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/__init__.py +2 -2
  528. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/__init__.py +3 -3
  529. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/__init__.py +2 -2
  530. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/analyse_dataset.py +8 -8
  531. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/dataset_checker/dataset_src/check_dataset.py +1 -2
  532. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/evaluator.py +3 -3
  533. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/exportor.py +1 -1
  534. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/model_list.py +1 -1
  535. paddlex/modules/{3d_bev_detection → m_3d_bev_detection}/trainer.py +5 -7
  536. paddlex/modules/multilabel_classification/__init__.py +2 -2
  537. paddlex/modules/multilabel_classification/dataset_checker/__init__.py +2 -2
  538. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/__init__.py +2 -2
  539. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  540. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/check_dataset.py +4 -3
  541. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/convert_dataset.py +10 -7
  542. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  543. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/__init__.py +1 -1
  544. paddlex/modules/multilabel_classification/dataset_checker/dataset_src/utils/visualizer.py +1 -5
  545. paddlex/modules/multilabel_classification/evaluator.py +3 -3
  546. paddlex/modules/multilabel_classification/exportor.py +1 -1
  547. paddlex/modules/multilabel_classification/model_list.py +1 -1
  548. paddlex/modules/multilabel_classification/trainer.py +3 -3
  549. paddlex/modules/multilingual_speech_recognition/__init__.py +2 -2
  550. paddlex/modules/multilingual_speech_recognition/dataset_checker.py +3 -3
  551. paddlex/modules/multilingual_speech_recognition/evaluator.py +3 -3
  552. paddlex/modules/multilingual_speech_recognition/exportor.py +3 -3
  553. paddlex/modules/multilingual_speech_recognition/model_list.py +1 -1
  554. paddlex/modules/multilingual_speech_recognition/trainer.py +7 -5
  555. paddlex/modules/object_detection/__init__.py +2 -2
  556. paddlex/modules/object_detection/dataset_checker/__init__.py +2 -11
  557. paddlex/modules/object_detection/dataset_checker/dataset_src/__init__.py +2 -2
  558. paddlex/modules/object_detection/dataset_checker/dataset_src/analyse_dataset.py +10 -8
  559. paddlex/modules/object_detection/dataset_checker/dataset_src/check_dataset.py +10 -5
  560. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +17 -12
  561. paddlex/modules/object_detection/dataset_checker/dataset_src/split_dataset.py +8 -4
  562. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/__init__.py +1 -1
  563. paddlex/modules/object_detection/dataset_checker/dataset_src/utils/visualizer.py +9 -8
  564. paddlex/modules/object_detection/evaluator.py +11 -6
  565. paddlex/modules/object_detection/exportor.py +1 -1
  566. paddlex/modules/object_detection/model_list.py +3 -1
  567. paddlex/modules/object_detection/trainer.py +4 -5
  568. paddlex/modules/open_vocabulary_detection/__init__.py +2 -2
  569. paddlex/modules/open_vocabulary_detection/dataset_checker.py +3 -3
  570. paddlex/modules/open_vocabulary_detection/evaluator.py +3 -3
  571. paddlex/modules/open_vocabulary_detection/exportor.py +3 -3
  572. paddlex/modules/open_vocabulary_detection/model_list.py +2 -4
  573. paddlex/modules/open_vocabulary_detection/trainer.py +7 -5
  574. paddlex/modules/open_vocabulary_segmentation/__init__.py +2 -2
  575. paddlex/modules/open_vocabulary_segmentation/dataset_checker.py +3 -3
  576. paddlex/modules/open_vocabulary_segmentation/evaluator.py +3 -3
  577. paddlex/modules/open_vocabulary_segmentation/exportor.py +3 -3
  578. paddlex/modules/open_vocabulary_segmentation/model_list.py +1 -1
  579. paddlex/modules/open_vocabulary_segmentation/trainer.py +7 -5
  580. paddlex/modules/semantic_segmentation/__init__.py +2 -2
  581. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +2 -3
  582. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/__init__.py +2 -2
  583. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/analyse_dataset.py +6 -3
  584. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/check_dataset.py +2 -2
  585. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/convert_dataset.py +7 -4
  586. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/split_dataset.py +2 -2
  587. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/__init__.py +1 -1
  588. paddlex/modules/semantic_segmentation/dataset_checker/dataset_src/utils/visualizer.py +6 -2
  589. paddlex/modules/semantic_segmentation/evaluator.py +3 -3
  590. paddlex/modules/semantic_segmentation/exportor.py +1 -1
  591. paddlex/modules/semantic_segmentation/model_list.py +1 -1
  592. paddlex/modules/semantic_segmentation/trainer.py +3 -4
  593. paddlex/modules/table_recognition/__init__.py +2 -2
  594. paddlex/modules/table_recognition/dataset_checker/__init__.py +5 -5
  595. paddlex/modules/table_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  596. paddlex/modules/table_recognition/dataset_checker/dataset_src/analyse_dataset.py +3 -2
  597. paddlex/modules/table_recognition/dataset_checker/dataset_src/check_dataset.py +8 -7
  598. paddlex/modules/table_recognition/dataset_checker/dataset_src/split_dataset.py +2 -1
  599. paddlex/modules/table_recognition/evaluator.py +3 -3
  600. paddlex/modules/table_recognition/exportor.py +1 -1
  601. paddlex/modules/table_recognition/model_list.py +1 -1
  602. paddlex/modules/table_recognition/trainer.py +2 -5
  603. paddlex/modules/text_detection/__init__.py +2 -2
  604. paddlex/modules/text_detection/dataset_checker/__init__.py +4 -6
  605. paddlex/modules/text_detection/dataset_checker/dataset_src/__init__.py +2 -2
  606. paddlex/modules/text_detection/dataset_checker/dataset_src/analyse_dataset.py +12 -9
  607. paddlex/modules/text_detection/dataset_checker/dataset_src/check_dataset.py +3 -3
  608. paddlex/modules/text_detection/dataset_checker/dataset_src/split_dataset.py +3 -3
  609. paddlex/modules/text_detection/evaluator.py +3 -3
  610. paddlex/modules/text_detection/exportor.py +1 -1
  611. paddlex/modules/text_detection/model_list.py +3 -1
  612. paddlex/modules/text_detection/trainer.py +2 -5
  613. paddlex/modules/text_recognition/__init__.py +2 -2
  614. paddlex/modules/text_recognition/dataset_checker/__init__.py +4 -5
  615. paddlex/modules/text_recognition/dataset_checker/dataset_src/__init__.py +2 -2
  616. paddlex/modules/text_recognition/dataset_checker/dataset_src/analyse_dataset.py +13 -12
  617. paddlex/modules/text_recognition/dataset_checker/dataset_src/check_dataset.py +2 -5
  618. paddlex/modules/text_recognition/dataset_checker/dataset_src/convert_dataset.py +11 -10
  619. paddlex/modules/text_recognition/dataset_checker/dataset_src/split_dataset.py +1 -2
  620. paddlex/modules/text_recognition/evaluator.py +3 -3
  621. paddlex/modules/text_recognition/exportor.py +1 -1
  622. paddlex/modules/text_recognition/model_list.py +3 -1
  623. paddlex/modules/text_recognition/trainer.py +2 -3
  624. paddlex/modules/ts_anomaly_detection/__init__.py +2 -2
  625. paddlex/modules/ts_anomaly_detection/dataset_checker/__init__.py +4 -5
  626. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/__init__.py +2 -2
  627. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  628. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/check_dataset.py +2 -2
  629. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/convert_dataset.py +2 -6
  630. paddlex/modules/ts_anomaly_detection/dataset_checker/dataset_src/split_dataset.py +4 -4
  631. paddlex/modules/ts_anomaly_detection/evaluator.py +3 -3
  632. paddlex/modules/ts_anomaly_detection/exportor.py +2 -3
  633. paddlex/modules/ts_anomaly_detection/model_list.py +1 -1
  634. paddlex/modules/ts_anomaly_detection/trainer.py +8 -8
  635. paddlex/modules/ts_classification/__init__.py +2 -2
  636. paddlex/modules/ts_classification/dataset_checker/__init__.py +4 -5
  637. paddlex/modules/ts_classification/dataset_checker/dataset_src/__init__.py +2 -2
  638. paddlex/modules/ts_classification/dataset_checker/dataset_src/analyse_dataset.py +8 -5
  639. paddlex/modules/ts_classification/dataset_checker/dataset_src/check_dataset.py +2 -2
  640. paddlex/modules/ts_classification/dataset_checker/dataset_src/convert_dataset.py +2 -6
  641. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +5 -5
  642. paddlex/modules/ts_classification/evaluator.py +3 -3
  643. paddlex/modules/ts_classification/exportor.py +2 -3
  644. paddlex/modules/ts_classification/model_list.py +1 -1
  645. paddlex/modules/ts_classification/trainer.py +7 -7
  646. paddlex/modules/ts_forecast/__init__.py +2 -2
  647. paddlex/modules/ts_forecast/dataset_checker/__init__.py +4 -5
  648. paddlex/modules/ts_forecast/dataset_checker/dataset_src/__init__.py +2 -2
  649. paddlex/modules/ts_forecast/dataset_checker/dataset_src/analyse_dataset.py +1 -9
  650. paddlex/modules/ts_forecast/dataset_checker/dataset_src/check_dataset.py +2 -2
  651. paddlex/modules/ts_forecast/dataset_checker/dataset_src/convert_dataset.py +2 -6
  652. paddlex/modules/ts_forecast/dataset_checker/dataset_src/split_dataset.py +4 -4
  653. paddlex/modules/ts_forecast/evaluator.py +3 -3
  654. paddlex/modules/ts_forecast/exportor.py +2 -3
  655. paddlex/modules/ts_forecast/model_list.py +1 -1
  656. paddlex/modules/ts_forecast/trainer.py +7 -7
  657. paddlex/modules/video_classification/__init__.py +2 -2
  658. paddlex/modules/video_classification/dataset_checker/__init__.py +2 -2
  659. paddlex/modules/video_classification/dataset_checker/dataset_src/__init__.py +2 -2
  660. paddlex/modules/video_classification/dataset_checker/dataset_src/analyse_dataset.py +9 -9
  661. paddlex/modules/video_classification/dataset_checker/dataset_src/check_dataset.py +2 -3
  662. paddlex/modules/video_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  663. paddlex/modules/video_classification/evaluator.py +3 -3
  664. paddlex/modules/video_classification/exportor.py +1 -1
  665. paddlex/modules/video_classification/model_list.py +1 -1
  666. paddlex/modules/video_classification/trainer.py +3 -3
  667. paddlex/modules/video_detection/__init__.py +2 -2
  668. paddlex/modules/video_detection/dataset_checker/__init__.py +2 -2
  669. paddlex/modules/video_detection/dataset_checker/dataset_src/__init__.py +2 -2
  670. paddlex/modules/video_detection/dataset_checker/dataset_src/analyse_dataset.py +8 -9
  671. paddlex/modules/video_detection/dataset_checker/dataset_src/check_dataset.py +3 -5
  672. paddlex/modules/video_detection/evaluator.py +3 -3
  673. paddlex/modules/video_detection/exportor.py +1 -1
  674. paddlex/modules/video_detection/model_list.py +1 -1
  675. paddlex/modules/video_detection/trainer.py +3 -3
  676. paddlex/ops/__init__.py +7 -4
  677. paddlex/ops/iou3d_nms/iou3d_cpu.cpp +8 -6
  678. paddlex/ops/iou3d_nms/iou3d_cpu.h +3 -2
  679. paddlex/ops/iou3d_nms/iou3d_nms.cpp +8 -6
  680. paddlex/ops/iou3d_nms/iou3d_nms.h +6 -4
  681. paddlex/ops/iou3d_nms/iou3d_nms_api.cpp +24 -18
  682. paddlex/ops/iou3d_nms/iou3d_nms_kernel.cu +9 -7
  683. paddlex/ops/setup.py +3 -3
  684. paddlex/ops/voxel/voxelize_op.cc +22 -19
  685. paddlex/ops/voxel/voxelize_op.cu +25 -25
  686. paddlex/paddlex_cli.py +104 -87
  687. paddlex/repo_apis/Paddle3D_api/__init__.py +1 -1
  688. paddlex/repo_apis/Paddle3D_api/bev_fusion/__init__.py +1 -1
  689. paddlex/repo_apis/Paddle3D_api/bev_fusion/config.py +1 -1
  690. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +6 -6
  691. paddlex/repo_apis/Paddle3D_api/bev_fusion/register.py +2 -2
  692. paddlex/repo_apis/Paddle3D_api/bev_fusion/runner.py +1 -1
  693. paddlex/repo_apis/Paddle3D_api/pp3d_config.py +3 -2
  694. paddlex/repo_apis/PaddleClas_api/__init__.py +1 -1
  695. paddlex/repo_apis/PaddleClas_api/cls/__init__.py +3 -3
  696. paddlex/repo_apis/PaddleClas_api/cls/config.py +5 -4
  697. paddlex/repo_apis/PaddleClas_api/cls/model.py +4 -4
  698. paddlex/repo_apis/PaddleClas_api/cls/register.py +12 -3
  699. paddlex/repo_apis/PaddleClas_api/cls/runner.py +2 -3
  700. paddlex/repo_apis/PaddleClas_api/shitu_rec/__init__.py +2 -2
  701. paddlex/repo_apis/PaddleClas_api/shitu_rec/config.py +2 -2
  702. paddlex/repo_apis/PaddleClas_api/shitu_rec/model.py +1 -4
  703. paddlex/repo_apis/PaddleClas_api/shitu_rec/register.py +2 -2
  704. paddlex/repo_apis/PaddleClas_api/shitu_rec/runner.py +1 -6
  705. paddlex/repo_apis/PaddleDetection_api/__init__.py +2 -2
  706. paddlex/repo_apis/PaddleDetection_api/config_helper.py +3 -3
  707. paddlex/repo_apis/PaddleDetection_api/instance_seg/__init__.py +2 -2
  708. paddlex/repo_apis/PaddleDetection_api/instance_seg/config.py +2 -3
  709. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +4 -4
  710. paddlex/repo_apis/PaddleDetection_api/instance_seg/register.py +2 -3
  711. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +2 -3
  712. paddlex/repo_apis/PaddleDetection_api/object_det/__init__.py +3 -3
  713. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +5 -4
  714. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +6 -7
  715. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +26 -1
  716. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +32 -3
  717. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +2 -3
  718. paddlex/repo_apis/PaddleNLP_api/__init__.py +1 -1
  719. paddlex/repo_apis/PaddleOCR_api/__init__.py +4 -3
  720. paddlex/repo_apis/PaddleOCR_api/config_utils.py +1 -1
  721. paddlex/repo_apis/PaddleOCR_api/formula_rec/__init__.py +1 -1
  722. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +7 -6
  723. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +9 -13
  724. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +29 -3
  725. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +2 -3
  726. paddlex/repo_apis/PaddleOCR_api/table_rec/__init__.py +1 -1
  727. paddlex/repo_apis/PaddleOCR_api/table_rec/config.py +1 -1
  728. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +4 -4
  729. paddlex/repo_apis/PaddleOCR_api/table_rec/register.py +2 -3
  730. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +3 -3
  731. paddlex/repo_apis/PaddleOCR_api/text_det/__init__.py +1 -1
  732. paddlex/repo_apis/PaddleOCR_api/text_det/config.py +1 -1
  733. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +4 -4
  734. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +20 -3
  735. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +3 -3
  736. paddlex/repo_apis/PaddleOCR_api/text_rec/__init__.py +1 -1
  737. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +7 -6
  738. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +9 -13
  739. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +20 -3
  740. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +2 -3
  741. paddlex/repo_apis/PaddleSeg_api/__init__.py +1 -1
  742. paddlex/repo_apis/PaddleSeg_api/base_seg_config.py +2 -2
  743. paddlex/repo_apis/PaddleSeg_api/seg/__init__.py +1 -1
  744. paddlex/repo_apis/PaddleSeg_api/seg/config.py +3 -6
  745. paddlex/repo_apis/PaddleSeg_api/seg/model.py +6 -6
  746. paddlex/repo_apis/PaddleSeg_api/seg/register.py +2 -3
  747. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +2 -3
  748. paddlex/repo_apis/PaddleTS_api/__init__.py +4 -3
  749. paddlex/repo_apis/PaddleTS_api/ts_ad/__init__.py +1 -1
  750. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +5 -6
  751. paddlex/repo_apis/PaddleTS_api/ts_ad/register.py +2 -2
  752. paddlex/repo_apis/PaddleTS_api/ts_ad/runner.py +2 -2
  753. paddlex/repo_apis/PaddleTS_api/ts_base/__init__.py +1 -1
  754. paddlex/repo_apis/PaddleTS_api/ts_base/config.py +2 -4
  755. paddlex/repo_apis/PaddleTS_api/ts_base/model.py +4 -4
  756. paddlex/repo_apis/PaddleTS_api/ts_base/runner.py +2 -2
  757. paddlex/repo_apis/PaddleTS_api/ts_cls/__init__.py +1 -1
  758. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +4 -5
  759. paddlex/repo_apis/PaddleTS_api/ts_cls/register.py +2 -2
  760. paddlex/repo_apis/PaddleTS_api/ts_cls/runner.py +2 -2
  761. paddlex/repo_apis/PaddleTS_api/ts_fc/__init__.py +1 -1
  762. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +6 -7
  763. paddlex/repo_apis/PaddleTS_api/ts_fc/register.py +1 -1
  764. paddlex/repo_apis/PaddleVideo_api/__init__.py +1 -1
  765. paddlex/repo_apis/PaddleVideo_api/config_utils.py +1 -1
  766. paddlex/repo_apis/PaddleVideo_api/video_cls/__init__.py +3 -3
  767. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +5 -4
  768. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +4 -4
  769. paddlex/repo_apis/PaddleVideo_api/video_cls/register.py +2 -3
  770. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +2 -3
  771. paddlex/repo_apis/PaddleVideo_api/video_det/__init__.py +3 -3
  772. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +5 -4
  773. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +5 -5
  774. paddlex/repo_apis/PaddleVideo_api/video_det/register.py +2 -3
  775. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +2 -3
  776. paddlex/repo_apis/__init__.py +1 -1
  777. paddlex/repo_apis/base/__init__.py +4 -5
  778. paddlex/repo_apis/base/config.py +3 -4
  779. paddlex/repo_apis/base/model.py +11 -19
  780. paddlex/repo_apis/base/register.py +1 -1
  781. paddlex/repo_apis/base/runner.py +11 -12
  782. paddlex/repo_apis/base/utils/__init__.py +1 -1
  783. paddlex/repo_apis/base/utils/arg.py +1 -1
  784. paddlex/repo_apis/base/utils/subprocess.py +1 -1
  785. paddlex/repo_manager/__init__.py +2 -9
  786. paddlex/repo_manager/core.py +12 -30
  787. paddlex/repo_manager/meta.py +41 -31
  788. paddlex/repo_manager/repo.py +171 -161
  789. paddlex/repo_manager/utils.py +13 -224
  790. paddlex/utils/__init__.py +1 -1
  791. paddlex/utils/cache.py +8 -10
  792. paddlex/utils/config.py +6 -5
  793. paddlex/utils/{custom_device_whitelist.py → custom_device_list.py} +53 -199
  794. paddlex/utils/deps.py +249 -0
  795. paddlex/utils/device.py +87 -36
  796. paddlex/utils/download.py +4 -4
  797. paddlex/utils/env.py +37 -7
  798. paddlex/utils/errors/__init__.py +1 -1
  799. paddlex/utils/errors/dataset_checker.py +1 -1
  800. paddlex/utils/errors/others.py +2 -16
  801. paddlex/utils/file_interface.py +4 -5
  802. paddlex/utils/flags.py +17 -12
  803. paddlex/utils/fonts/__init__.py +36 -5
  804. paddlex/utils/func_register.py +1 -1
  805. paddlex/utils/install.py +87 -0
  806. paddlex/utils/interactive_get_pipeline.py +3 -3
  807. paddlex/utils/lazy_loader.py +3 -3
  808. paddlex/utils/logging.py +10 -1
  809. paddlex/utils/misc.py +6 -6
  810. paddlex/utils/pipeline_arguments.py +15 -7
  811. paddlex/utils/result_saver.py +4 -5
  812. paddlex/utils/subclass_register.py +2 -4
  813. paddlex/version.py +2 -1
  814. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/METADATA +237 -102
  815. paddlex-3.0.1.dist-info/RECORD +1095 -0
  816. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  817. paddlex/inference/models/base/predictor/basic_predictor.py +0 -139
  818. paddlex/paddle2onnx_requirements.txt +0 -1
  819. paddlex/repo_manager/requirements.txt +0 -21
  820. paddlex/serving_requirements.txt +0 -9
  821. paddlex-3.0.0rc0.dist-info/RECORD +0 -1015
  822. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  823. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info/licenses}/LICENSE +0 -0
  824. {paddlex-3.0.0rc0.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,4 @@
1
- # copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -14,22 +14,20 @@
14
14
 
15
15
  __all__ = [
16
16
  "get_sub_regions_ocr_res",
17
- "get_layout_ordering",
18
- "get_single_block_parsing_res",
19
17
  "get_show_color",
20
18
  "sorted_layout_boxes",
21
19
  ]
22
20
 
23
- import numpy as np
24
- from PIL import Image
25
- import uuid
26
21
  import re
27
- from pathlib import Path
28
22
  from copy import deepcopy
29
- from typing import Optional, Union, List, Tuple, Dict, Any
30
- from ..ocr.result import OCRResult
31
- from ...models.object_detection.result import DetResult
23
+ from typing import Dict, List, Optional, Tuple, Union
24
+
25
+ import numpy as np
26
+ from PIL import Image
27
+
32
28
  from ..components import convert_points_to_boxes
29
+ from ..ocr.result import OCRResult
30
+ from .setting import BLOCK_LABEL_MAP, REGION_SETTINGS
33
31
 
34
32
 
35
33
  def get_overlap_boxes_idx(src_boxes: np.ndarray, ref_boxes: np.ndarray) -> List:
@@ -173,808 +171,453 @@ def sorted_layout_boxes(res, w):
173
171
  return new_res
174
172
 
175
173
 
176
- def _calculate_overlap_area_div_minbox_area_ratio(
177
- bbox1: Union[list, tuple],
178
- bbox2: Union[list, tuple],
174
+ def calculate_projection_overlap_ratio(
175
+ bbox1: List[float],
176
+ bbox2: List[float],
177
+ direction: str = "horizontal",
178
+ mode="union",
179
179
  ) -> float:
180
180
  """
181
- Calculate the ratio of the overlap area between bbox1 and bbox2
182
- to the area of the smaller bounding box.
181
+ Calculate the IoU of lines between two bounding boxes.
183
182
 
184
183
  Args:
185
- bbox1 (list or tuple): Coordinates of the first bounding box [x_min, y_min, x_max, y_max].
186
- bbox2 (list or tuple): Coordinates of the second bounding box [x_min, y_min, x_max, y_max].
184
+ bbox1 (List[float]): First bounding box [x_min, y_min, x_max, y_max].
185
+ bbox2 (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
186
+ direction (str): direction of the projection, "horizontal" or "vertical".
187
187
 
188
188
  Returns:
189
- float: The ratio of the overlap area to the area of the smaller bounding box.
190
- """
191
- bbox1 = list(map(int, bbox1))
192
- bbox2 = list(map(int, bbox2))
193
-
194
- x_left = max(bbox1[0], bbox2[0])
195
- y_top = max(bbox1[1], bbox2[1])
196
- x_right = min(bbox1[2], bbox2[2])
197
- y_bottom = min(bbox1[3], bbox2[3])
198
-
199
- if x_right <= x_left or y_bottom <= y_top:
200
- return 0.0
201
-
202
- intersection_area = (x_right - x_left) * (y_bottom - y_top)
203
- area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
204
- area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
205
- min_box_area = min(area_bbox1, area_bbox2)
206
-
207
- if min_box_area <= 0:
208
- return 0.0
209
-
210
- return intersection_area / min_box_area
211
-
212
-
213
- def _whether_y_overlap_exceeds_threshold(
214
- bbox1: Union[list, tuple],
215
- bbox2: Union[list, tuple],
216
- overlap_ratio_threshold: float = 0.6,
217
- ) -> bool:
189
+ float: Line overlap ratio. Returns 0 if there is no overlap.
218
190
  """
219
- Determines whether the vertical overlap between two bounding boxes exceeds a given threshold.
191
+ start_index, end_index = 1, 3
192
+ if direction == "horizontal":
193
+ start_index, end_index = 0, 2
220
194
 
221
- Args:
222
- bbox1 (list or tuple): The first bounding box defined as (left, top, right, bottom).
223
- bbox2 (list or tuple): The second bounding box defined as (left, top, right, bottom).
224
- overlap_ratio_threshold (float): The threshold ratio to determine if the overlap is significant.
225
- Defaults to 0.6.
226
-
227
- Returns:
228
- bool: True if the vertical overlap divided by the minimum height of the two bounding boxes
229
- exceeds the overlap_ratio_threshold, otherwise False.
230
- """
231
- _, y1_0, _, y1_1 = bbox1
232
- _, y2_0, _, y2_1 = bbox2
195
+ intersection_start = max(bbox1[start_index], bbox2[start_index])
196
+ intersection_end = min(bbox1[end_index], bbox2[end_index])
197
+ overlap = intersection_end - intersection_start
198
+ if overlap <= 0:
199
+ return 0
233
200
 
234
- overlap = max(0, min(y1_1, y2_1) - max(y1_0, y2_0))
235
- min_height = min(y1_1 - y1_0, y2_1 - y2_0)
201
+ if mode == "union":
202
+ ref_width = max(bbox1[end_index], bbox2[end_index]) - min(
203
+ bbox1[start_index], bbox2[start_index]
204
+ )
205
+ elif mode == "small":
206
+ ref_width = min(
207
+ bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
208
+ )
209
+ elif mode == "large":
210
+ ref_width = max(
211
+ bbox1[end_index] - bbox1[start_index], bbox2[end_index] - bbox2[start_index]
212
+ )
213
+ else:
214
+ raise ValueError(
215
+ f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
216
+ )
236
217
 
237
- return (overlap / min_height) > overlap_ratio_threshold
218
+ return overlap / ref_width if ref_width > 0 else 0.0
238
219
 
239
220
 
240
- def _adjust_span_text(span: List[str], prepend: bool = False, append: bool = False):
221
+ def calculate_overlap_ratio(
222
+ bbox1: Union[list, tuple], bbox2: Union[list, tuple], mode="union"
223
+ ) -> float:
241
224
  """
242
- Adjust the text of a span by prepending or appending a newline.
225
+ Calculate the overlap ratio between two bounding boxes.
243
226
 
244
227
  Args:
245
- span (list): A list where the second element is the text of the span.
246
- prepend (bool): If True, prepend a newline to the text.
247
- append (bool): If True, append a newline to the text.
228
+ bbox1 (list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
229
+ bbox2 (list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
230
+ mode (str): The mode of calculation, either 'union', 'small', or 'large'.
248
231
 
249
232
  Returns:
250
- None: The function modifies the span in place.
233
+ float: The overlap ratio value between the two bounding boxes
251
234
  """
252
- if prepend:
253
- span[1] = "\n" + span[1]
254
- if append:
255
- span[1] = span[1] + "\n"
256
- return span
235
+ x_min_inter = max(bbox1[0], bbox2[0])
236
+ y_min_inter = max(bbox1[1], bbox2[1])
237
+ x_max_inter = min(bbox1[2], bbox2[2])
238
+ y_max_inter = min(bbox1[3], bbox2[3])
257
239
 
240
+ inter_width = max(0, x_max_inter - x_min_inter)
241
+ inter_height = max(0, y_max_inter - y_min_inter)
258
242
 
259
- def _format_line(
260
- line: List[List[Union[List[int], str]]],
261
- layout_min: int,
262
- layout_max: int,
263
- is_reference: bool = False,
264
- ) -> None:
265
- """
266
- Format a line of text spans based on layout constraints.
243
+ inter_area = inter_width * inter_height
267
244
 
268
- Args:
269
- line (list): A list of spans, where each span is a list containing a bounding box and text.
270
- layout_min (int): The minimum x-coordinate of the layout bounding box.
271
- layout_max (int): The maximum x-coordinate of the layout bounding box.
272
- is_reference (bool): A flag indicating whether the line is a reference line, which affects formatting rules.
245
+ bbox1_area = caculate_bbox_area(bbox1)
246
+ bbox2_area = caculate_bbox_area(bbox2)
273
247
 
274
- Returns:
275
- None: The function modifies the line in place.
276
- """
277
- first_span = line[0]
278
- end_span = line[-1]
279
-
280
- if not is_reference:
281
- if first_span[0][0] - layout_min > 10:
282
- first_span = _adjust_span_text(first_span, prepend=True)
283
- if layout_max - end_span[0][2] > 10:
284
- end_span = _adjust_span_text(end_span, append=True)
248
+ if mode == "union":
249
+ ref_area = bbox1_area + bbox2_area - inter_area
250
+ elif mode == "small":
251
+ ref_area = min(bbox1_area, bbox2_area)
252
+ elif mode == "large":
253
+ ref_area = max(bbox1_area, bbox2_area)
285
254
  else:
286
- if first_span[0][0] - layout_min < 5:
287
- first_span = _adjust_span_text(first_span, prepend=True)
288
- if layout_max - end_span[0][2] > 20:
289
- end_span = _adjust_span_text(end_span, append=True)
255
+ raise ValueError(
256
+ f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
257
+ )
290
258
 
291
- line[0] = first_span
292
- line[-1] = end_span
259
+ if ref_area == 0:
260
+ return 0.0
293
261
 
294
- return line
262
+ return inter_area / ref_area
295
263
 
296
264
 
297
- def split_boxes_if_x_contained(boxes, offset=1e-5):
298
- """
299
- Check if there is any complete containment in the x-direction
300
- between the bounding boxes and split the containing box accordingly.
265
+ def group_boxes_into_lines(ocr_rec_res, line_height_iou_threshold):
266
+ rec_boxes = ocr_rec_res["boxes"]
267
+ rec_texts = ocr_rec_res["rec_texts"]
268
+ rec_labels = ocr_rec_res["rec_labels"]
301
269
 
302
- Args:
303
- boxes (list of lists): Each element is a list containing an ndarray of length 4, a description, and a label.
304
- offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
305
- Returns:
306
- A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
307
- """
270
+ text_boxes = [
271
+ rec_boxes[i] for i in range(len(rec_boxes)) if rec_labels[i] == "text"
272
+ ]
273
+ text_orientation = calculate_text_orientation(text_boxes)
308
274
 
309
- def is_x_contained(box_a, box_b):
310
- """Check if box_a completely contains box_b in the x-direction."""
311
- return box_a[0][0] <= box_b[0][0] and box_a[0][2] >= box_b[0][2]
275
+ match_direction = "vertical" if text_orientation == "horizontal" else "horizontal"
312
276
 
313
- new_boxes = []
277
+ line_start_index = 1 if text_orientation == "horizontal" else 0
278
+ line_end_index = 3 if text_orientation == "horizontal" else 2
314
279
 
315
- for i in range(len(boxes)):
316
- box_a = boxes[i]
317
- is_split = False
318
- for j in range(len(boxes)):
319
- if i == j:
320
- continue
321
- box_b = boxes[j]
322
- if is_x_contained(box_a, box_b):
323
- is_split = True
324
- # Split box_a based on the x-coordinates of box_b
325
- if box_a[0][0] < box_b[0][0]:
326
- w = box_b[0][0] - offset - box_a[0][0]
327
- if w > 1:
328
- new_boxes.append(
329
- [
330
- np.array(
331
- [
332
- box_a[0][0],
333
- box_a[0][1],
334
- box_b[0][0] - offset,
335
- box_a[0][3],
336
- ]
337
- ),
338
- box_a[1],
339
- box_a[2],
340
- ]
341
- )
342
- if box_a[0][2] > box_b[0][2]:
343
- w = box_a[0][2] - box_b[0][2] + offset
344
- if w > 1:
345
- box_a = [
346
- np.array(
347
- [
348
- box_b[0][2] + offset,
349
- box_a[0][1],
350
- box_a[0][2],
351
- box_a[0][3],
352
- ]
353
- ),
354
- box_a[1],
355
- box_a[2],
356
- ]
357
- if j == len(boxes) - 1 and is_split:
358
- new_boxes.append(box_a)
359
- if not is_split:
360
- new_boxes.append(box_a)
361
-
362
- return new_boxes
280
+ spans = list(zip(rec_boxes, rec_texts, rec_labels))
281
+ sort_index = 1
282
+ reverse = False
283
+ if text_orientation == "vertical":
284
+ sort_index = 0
285
+ reverse = True
286
+ spans.sort(key=lambda span: span[0][sort_index], reverse=reverse)
287
+ spans = [list(span) for span in spans]
363
288
 
289
+ lines = []
290
+ line = [spans[0]]
291
+ line_region_box = spans[0][0].copy()
292
+ line_heights = []
293
+ # merge line
294
+ for span in spans[1:]:
295
+ rec_bbox = span[0]
296
+ if (
297
+ calculate_projection_overlap_ratio(
298
+ line_region_box, rec_bbox, match_direction, mode="small"
299
+ )
300
+ >= line_height_iou_threshold
301
+ ):
302
+ line.append(span)
303
+ line_region_box[line_start_index] = min(
304
+ line_region_box[line_start_index], rec_bbox[line_start_index]
305
+ )
306
+ line_region_box[line_end_index] = max(
307
+ line_region_box[line_end_index], rec_bbox[line_end_index]
308
+ )
309
+ else:
310
+ line_heights.append(
311
+ line_region_box[line_end_index] - line_region_box[line_start_index]
312
+ )
313
+ lines.append(line)
314
+ line = [span]
315
+ line_region_box = rec_bbox.copy()
364
316
 
365
- def _sort_line_by_x_projection(
366
- input_img: np.ndarray,
367
- general_ocr_pipeline: Any,
368
- line: List[List[Union[List[int], str]]],
369
- ) -> None:
370
- """
371
- Sort a line of text spans based on their vertical position within the layout bounding box.
317
+ lines.append(line)
318
+ line_heights.append(
319
+ line_region_box[line_end_index] - line_region_box[line_start_index]
320
+ )
372
321
 
373
- Args:
374
- input_img (ndarray): The input image used for OCR.
375
- general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
376
- line (list): A list of spans, where each span is a list containing a bounding box and text.
322
+ min_height = min(line_heights) if line_heights else 0
323
+ max_height = max(line_heights) if line_heights else 0
377
324
 
378
- Returns:
379
- list: The sorted line of text spans.
380
- """
381
- splited_boxes = split_boxes_if_x_contained(line)
382
- splited_lines = []
383
- if len(line) != len(splited_boxes):
384
- splited_boxes.sort(key=lambda span: span[0][0])
385
- text_rec_model = general_ocr_pipeline.text_rec_model
386
- for span in splited_boxes:
387
- if span[2] == "text":
388
- crop_img = input_img[
389
- int(span[0][1]) : int(span[0][3]),
390
- int(span[0][0]) : int(span[0][2]),
391
- ]
392
- span[1] = next(text_rec_model([crop_img]))["rec_text"]
393
- splited_lines.append(span)
394
- else:
395
- splited_lines = line
325
+ if max_height > min_height * 2 and text_orientation == "vertical":
326
+ line_heights = np.array(line_heights)
327
+ min_height_num = np.sum(line_heights < min_height * 1.1)
328
+ if min_height_num < len(lines) * 0.4:
329
+ condition = line_heights > min_height * 1.1
330
+ lines = [value for value, keep in zip(lines, condition) if keep]
396
331
 
397
- return splited_lines
332
+ return lines, text_orientation, np.mean(line_heights)
398
333
 
399
334
 
400
- def _sort_ocr_res_by_y_projection(
401
- input_img: np.ndarray,
402
- general_ocr_pipeline: Any,
403
- label: Any,
404
- block_bbox: Tuple[int, int, int, int],
405
- ocr_res: Dict[str, List[Any]],
406
- line_height_iou_threshold: float = 0.7,
407
- ) -> Dict[str, List[Any]]:
335
+ def calculate_minimum_enclosing_bbox(bboxes):
408
336
  """
409
- Sorts OCR results based on their spatial arrangement, grouping them into lines and blocks.
337
+ Calculate the minimum enclosing bounding box for a list of bounding boxes.
410
338
 
411
339
  Args:
412
- input_img (ndarray): The input image used for OCR.
413
- general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
414
- label (Any): The label associated with the OCR results. It's not used in the function but might be
415
- relevant for other parts of the calling context.
416
- block_bbox (Tuple[int, int, int, int]): A tuple representing the layout bounding box, defined as
417
- (left, top, right, bottom).
418
- ocr_res (Dict[str, List[Any]]): A dictionary containing OCR results with the following keys:
419
- - "boxes": A list of bounding boxes, each defined as [left, top, right, bottom].
420
- - "rec_texts": A corresponding list of recognized text strings for each box.
421
- line_height_iou_threshold (float): The threshold for determining whether two boxes belong to
422
- the same line based on their vertical overlap. Defaults to 0.7.
340
+ bboxes (list): A list of bounding boxes represented as lists of four integers [x1, y1, x2, y2].
423
341
 
424
342
  Returns:
425
- Dict[str, List[Any]]: A dictionary with the same structure as `ocr_res`, but with boxes and texts sorted
426
- and grouped into lines and blocks.
343
+ list: The minimum enclosing bounding box represented as a list of four integers [x1, y1, x2, y2].
427
344
  """
428
- assert (
429
- ocr_res["boxes"] and ocr_res["rec_texts"]
430
- ), "OCR results must contain 'boxes' and 'rec_texts'"
345
+ if not bboxes:
346
+ raise ValueError("The list of bounding boxes is empty.")
431
347
 
432
- boxes = ocr_res["boxes"]
433
- rec_texts = ocr_res["rec_texts"]
434
- rec_labels = ocr_res["rec_labels"]
348
+ # Convert the list of bounding boxes to a NumPy array
349
+ bboxes_array = np.array(bboxes)
435
350
 
436
- x_min, _, x_max, _ = block_bbox
437
- inline_x_min = min([box[0] for box in boxes])
438
- inline_x_max = max([box[2] for box in boxes])
351
+ # Compute the minimum and maximum values along the respective axes
352
+ min_x = np.min(bboxes_array[:, 0])
353
+ min_y = np.min(bboxes_array[:, 1])
354
+ max_x = np.max(bboxes_array[:, 2])
355
+ max_y = np.max(bboxes_array[:, 3])
439
356
 
440
- spans = list(zip(boxes, rec_texts, rec_labels))
357
+ # Return the minimum enclosing bounding box
358
+ return [min_x, min_y, max_x, max_y]
441
359
 
442
- spans.sort(key=lambda span: span[0][1])
443
- spans = [list(span) for span in spans]
444
-
445
- lines = []
446
- current_line = [spans[0]]
447
- current_y0, current_y1 = spans[0][0][1], spans[0][0][3]
448
360
 
449
- for span in spans[1:]:
450
- y0, y1 = span[0][1], span[0][3]
451
- if _whether_y_overlap_exceeds_threshold(
452
- (0, current_y0, 0, current_y1),
453
- (0, y0, 0, y1),
454
- line_height_iou_threshold,
455
- ):
456
- current_line.append(span)
457
- current_y0 = min(current_y0, y0)
458
- current_y1 = max(current_y1, y1)
459
- else:
460
- lines.append(current_line)
461
- current_line = [span]
462
- current_y0, current_y1 = y0, y1
463
-
464
- if current_line:
465
- lines.append(current_line)
466
-
467
- new_lines = []
468
- for line in lines:
469
- line.sort(key=lambda span: span[0][0])
470
-
471
- ocr_labels = [span[2] for span in line]
472
- if "formula" in ocr_labels:
473
- line = _sort_line_by_x_projection(input_img, general_ocr_pipeline, line)
474
- if label == "reference":
475
- line = _format_line(line, inline_x_min, inline_x_max, is_reference=True)
476
- elif label != "content":
477
- line = _format_line(line, x_min, x_max)
478
- new_lines.append(line)
479
-
480
- ocr_res["boxes"] = [span[0] for line in new_lines for span in line]
481
- if label == "content":
482
- ocr_res["rec_texts"] = [
483
- "".join(f"{span[1]} " for span in line).rstrip() for line in new_lines
484
- ]
485
- else:
486
- ocr_res["rec_texts"] = [span[1] + " " for line in new_lines for span in line]
487
- return ocr_res, len(new_lines)
488
-
489
-
490
- def _process_text(input_text: str) -> str:
361
+ def calculate_text_orientation(
362
+ bboxes: List[List[int]], orientation_ratio: float = 1.5
363
+ ) -> bool:
491
364
  """
492
- Process the input text to handle spaces.
493
-
494
- The function removes multiple consecutive spaces between Chinese characters and ensures that
495
- only a single space is retained between Chinese and non-Chinese characters.
365
+ Calculate the orientation of the text based on the bounding boxes.
496
366
 
497
367
  Args:
498
- input_text (str): The text to be processed.
368
+ bboxes (list): A list of bounding boxes.
369
+ orientation_ratio (float): Ratio for determining orientation. Default is 1.5.
499
370
 
500
371
  Returns:
501
- str: The processed text with properly formatted spaces.
372
+ str: "horizontal" or "vertical".
502
373
  """
503
374
 
504
- def handle_spaces_(text: str) -> str:
505
- """
506
- Handle spaces in the text by removing multiple consecutive spaces and inserting a single space
507
- between Chinese and non-Chinese characters.
508
-
509
- Args:
510
- text (str): The text to handle spaces for.
511
-
512
- Returns:
513
- str: The text with properly formatted spaces.
514
- """
515
- spaces = re.finditer(r"\s+", text)
516
- processed_text = list(text)
517
-
518
- for space in reversed(list(spaces)):
519
- start, end = space.span()
520
- prev_char = processed_text[start - 1] if start > 0 else ""
521
- next_char = processed_text[end] if end < len(processed_text) else ""
522
-
523
- is_prev_chinese = (
524
- re.match(r"[\u4e00-\u9fff]", prev_char) if prev_char else False
525
- )
526
- is_next_chinese = (
527
- re.match(r"[\u4e00-\u9fff]", next_char) if next_char else False
375
+ horizontal_box_num = 0
376
+ for bbox in bboxes:
377
+ if len(bbox) != 4:
378
+ raise ValueError(
379
+ "Invalid bounding box format. Expected a list of length 4."
528
380
  )
381
+ x1, y1, x2, y2 = bbox
382
+ width = x2 - x1
383
+ height = y2 - y1
384
+ horizontal_box_num += 1 if width * orientation_ratio >= height else 0
529
385
 
530
- if is_prev_chinese and is_next_chinese:
531
- processed_text[start:end] = []
532
- else:
533
- processed_text[start:end] = [" "]
386
+ return "horizontal" if horizontal_box_num >= len(bboxes) * 0.5 else "vertical"
534
387
 
535
- return "".join(processed_text)
536
388
 
537
- text_without_spaces = handle_spaces_(input_text)
389
+ def is_english_letter(char):
390
+ return bool(re.match(r"^[A-Za-z]$", char))
538
391
 
539
- final_text = re.sub(r"\s+", " ", text_without_spaces).strip()
540
- return final_text
541
392
 
393
+ def is_numeric(char):
394
+ return bool(re.match(r"^[\d.]+$", char))
542
395
 
543
- def get_single_block_parsing_res(
544
- general_ocr_pipeline: Any,
545
- overall_ocr_res: OCRResult,
546
- layout_det_res: DetResult,
547
- table_res_list: list,
548
- seal_res_list: list,
549
- ) -> OCRResult:
396
+
397
+ def is_non_breaking_punctuation(char):
550
398
  """
551
- Extract structured information from OCR and layout detection results.
399
+ 判断一个字符是否是不需要换行的标点符号,包括全角和半角的符号。
552
400
 
553
- Args:
554
- overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
555
- - "input_img": The image on which OCR was performed.
556
- - "dt_boxes": A list of detected text box coordinates.
557
- - "rec_texts": A list of recognized text corresponding to the detected boxes.
401
+ :param char: str, 单个字符
402
+ :return: bool, 如果字符是不需要换行的标点符号,返回True,否则返回False
403
+ """
404
+ non_breaking_punctuations = {
405
+ ",", # 半角逗号
406
+ ",", # 全角逗号
407
+ "、", # 顿号
408
+ ";", # 半角分号
409
+ ";", # 全角分号
410
+ ":", # 半角冒号
411
+ ":", # 全角冒号
412
+ "-", # 连字符
413
+ }
558
414
 
559
- layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
560
- - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
415
+ return char in non_breaking_punctuations
561
416
 
562
- table_res_list (list): A list of table detection results, where each item is a dictionary containing:
563
- - "block_bbox": The bounding box of the table layout.
564
- - "pred_html": The predicted HTML representation of the table.
565
417
 
566
- seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
418
+ def format_line(
419
+ line: List[List[Union[List[int], str]]],
420
+ text_direction: int,
421
+ block_width: int,
422
+ block_start_coordinate: int,
423
+ block_stop_coordinate: int,
424
+ line_gap_limit: int = 10,
425
+ block_label: str = "text",
426
+ ) -> None:
427
+ """
428
+ Format a line of text spans based on layout constraints.
567
429
 
430
+ Args:
431
+ line (list): A list of spans, where each span is a list containing a bounding box and text.
432
+ block_left_coordinate (int): The text line directional minimum coordinate of the layout bounding box.
433
+ block_stop_coordinate (int): The text line directional maximum x-coordinate of the layout bounding box.
434
+ first_line_span_limit (int): The limit for the number of pixels before the first span that should be considered part of the first line. Default is 10.
435
+ line_gap_limit (int): The limit for the number of pixels after the last span that should be considered part of the last line. Default is 10.
436
+ block_label (str): The label associated with the entire block. Default is 'text'.
568
437
  Returns:
569
- list: A list of structured boxes where each item is a dictionary containing:
570
- - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
571
- - The label as a key with either table HTML or image data and text.
572
- - "block_bbox": The coordinates of the layout box.
438
+ None: The function modifies the line in place.
573
439
  """
440
+ first_span_box = line[0][0]
441
+ last_span_box = line[-1][0]
574
442
 
575
- single_block_layout_parsing_res = []
576
- input_img = overall_ocr_res["doc_preprocessor_res"]["output_img"]
577
- seal_index = 0
578
- with_doc_title = False
579
- max_block_area = 0.0
580
- paragraph_title_indexs = []
581
-
582
- layout_det_res_list, _ = _remove_overlap_blocks(
583
- deepcopy(layout_det_res["boxes"]),
584
- threshold=0.5,
585
- smaller=True,
586
- )
587
-
588
- for box_idx, box_info in enumerate(layout_det_res_list):
589
- block_bbox = box_info["coordinate"]
590
- label = box_info["label"]
591
- rec_res = {"boxes": [], "rec_texts": [], "rec_labels": [], "flag": False}
592
- seg_start_coordinate = float("inf")
593
- seg_end_coordinate = float("-inf")
594
- num_of_lines = 1
595
-
596
- if label == "doc_title":
597
- with_doc_title = True
598
- elif label == "paragraph_title":
599
- paragraph_title_indexs.append(box_idx)
600
-
601
- block_area = (block_bbox[2] - block_bbox[0]) * (block_bbox[3] - block_bbox[1])
602
- max_block_area = max(max_block_area, block_area)
603
-
604
- if label == "table":
605
- for table_res in table_res_list:
606
- if len(table_res["cell_box_list"]) == 0:
607
- continue
608
- if (
609
- _calculate_overlap_area_div_minbox_area_ratio(
610
- block_bbox, table_res["cell_box_list"][0]
611
- )
612
- > 0.5
613
- ):
614
- single_block_layout_parsing_res.append(
615
- {
616
- "block_label": label,
617
- "block_content": table_res["pred_html"],
618
- "block_bbox": block_bbox,
619
- },
620
- )
621
- break
622
- elif label == "seal":
623
- if len(seal_res_list) > 0:
624
- single_block_layout_parsing_res.append(
625
- {
626
- "block_label": label,
627
- "block_content": _process_text(
628
- ", ".join(seal_res_list[seal_index]["rec_texts"])
629
- ),
630
- "block_bbox": block_bbox,
631
- },
632
- )
633
- seal_index += 1
634
- else:
635
- overall_text_boxes = overall_ocr_res["rec_boxes"]
636
- for box_no in range(len(overall_text_boxes)):
637
- if (
638
- _calculate_overlap_area_div_minbox_area_ratio(
639
- block_bbox, overall_text_boxes[box_no]
640
- )
641
- > 0.5
642
- ):
643
- rec_res["boxes"].append(overall_text_boxes[box_no])
644
- rec_res["rec_texts"].append(
645
- overall_ocr_res["rec_texts"][box_no],
646
- )
647
- rec_res["rec_labels"].append(
648
- overall_ocr_res["rec_labels"][box_no],
649
- )
650
- rec_res["flag"] = True
651
-
652
- if rec_res["flag"]:
653
- rec_res, num_of_lines = _sort_ocr_res_by_y_projection(
654
- input_img, general_ocr_pipeline, label, block_bbox, rec_res, 0.7
655
- )
656
- seg_start_coordinate = rec_res["boxes"][0][0]
657
- seg_end_coordinate = rec_res["boxes"][-1][2]
658
- if label == "formula":
659
- rec_res["rec_texts"] = [
660
- rec_res_text.replace("$", "")
661
- for rec_res_text in rec_res["rec_texts"]
662
- ]
663
-
664
- if label in ["chart", "image"]:
665
- x_min, y_min, x_max, y_max = list(map(int, block_bbox))
666
- img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
667
- img = Image.fromarray(input_img[y_min:y_max, x_min:x_max, ::-1])
668
- single_block_layout_parsing_res.append(
669
- {
670
- "block_label": label,
671
- "block_content": _process_text("".join(rec_res["rec_texts"])),
672
- "block_image": {img_path: img},
673
- "block_bbox": block_bbox,
674
- },
675
- )
676
- else:
677
- if label in ["doc_title"]:
678
- content = " ".join(rec_res["rec_texts"])
679
- elif label in ["content"]:
680
- content = "\n".join(rec_res["rec_texts"])
443
+ for span in line:
444
+ if span[2] == "formula" and block_label != "formula":
445
+ formula_rec = span[1]
446
+ if not formula_rec.startswith("$") and not formula_rec.endswith("$"):
447
+ if len(line) > 1:
448
+ span[1] = f"${span[1]}$"
681
449
  else:
682
- content = "".join(rec_res["rec_texts"])
683
- if label != "reference":
684
- content = _process_text(content)
685
- single_block_layout_parsing_res.append(
686
- {
687
- "block_label": label,
688
- "block_content": content,
689
- "block_bbox": block_bbox,
690
- "seg_start_coordinate": seg_start_coordinate,
691
- "seg_end_coordinate": seg_end_coordinate,
692
- "num_of_lines": num_of_lines,
693
- "block_area": block_area,
694
- },
695
- )
450
+ span[1] = f"\n${span[1]}$"
451
+
452
+ line_text = ""
453
+ for span in line:
454
+ _, text, label = span
455
+ line_text += text
456
+ if len(text) > 0 and is_english_letter(line_text[-1]) or label == "formula":
457
+ line_text += " "
458
+
459
+ if text_direction == "horizontal":
460
+ text_start_index = 0
461
+ text_stop_index = 2
462
+ else:
463
+ text_start_index = 1
464
+ text_stop_index = 3
696
465
 
466
+ need_new_line = False
697
467
  if (
698
- not with_doc_title
699
- and len(paragraph_title_indexs) == 1
700
- and single_block_layout_parsing_res[paragraph_title_indexs[0]].get(
701
- "block_area", 0
702
- )
703
- > max_block_area * 0.3
468
+ len(line_text) > 0
469
+ and not is_english_letter(line_text[-1])
470
+ and not is_non_breaking_punctuation(line_text[-1])
704
471
  ):
705
- single_block_layout_parsing_res[paragraph_title_indexs[0]][
706
- "block_label"
707
- ] = "doc_title"
708
-
709
- if len(layout_det_res_list) == 0:
710
- for ocr_rec_box, ocr_rec_text in zip(
711
- overall_ocr_res["rec_boxes"], overall_ocr_res["rec_texts"]
712
- ):
713
- single_block_layout_parsing_res.append(
714
- {
715
- "block_label": "text",
716
- "block_content": ocr_rec_text,
717
- "block_bbox": ocr_rec_box,
718
- "seg_start_coordinate": ocr_rec_box[0],
719
- "seg_end_coordinate": ocr_rec_box[2],
720
- },
472
+ if (
473
+ text_direction == "horizontal"
474
+ and block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
475
+ ) or (
476
+ text_direction == "vertical"
477
+ and (
478
+ block_stop_coordinate - last_span_box[text_stop_index] > line_gap_limit
479
+ or first_span_box[1] - block_start_coordinate > line_gap_limit
721
480
  )
481
+ ):
482
+ need_new_line = True
483
+
484
+ if line_text.endswith("-"):
485
+ line_text = line_text[:-1]
486
+ elif (
487
+ len(line_text) > 0 and is_english_letter(line_text[-1])
488
+ ) or line_text.endswith("$"):
489
+ line_text += " "
490
+ elif (
491
+ len(line_text) > 0
492
+ and not is_english_letter(line_text[-1])
493
+ and not is_non_breaking_punctuation(line_text[-1])
494
+ and not is_numeric(line_text[-1])
495
+ ) or text_direction == "vertical":
496
+ if block_stop_coordinate - last_span_box[text_stop_index] > block_width * 0.4:
497
+ line_text += "\n"
498
+ if (
499
+ first_span_box[text_start_index] - block_start_coordinate
500
+ > block_width * 0.4
501
+ ):
502
+ line_text = "\n" + line_text
722
503
 
723
- single_block_layout_parsing_res = get_layout_ordering(
724
- single_block_layout_parsing_res,
725
- no_mask_labels=[
726
- "text",
727
- "formula",
728
- "algorithm",
729
- "reference",
730
- "content",
731
- "abstract",
732
- ],
733
- )
734
-
735
- return single_block_layout_parsing_res
504
+ return line_text, need_new_line
736
505
 
737
506
 
738
- def _projection_by_bboxes(boxes: np.ndarray, axis: int) -> np.ndarray:
507
+ def split_boxes_by_projection(spans: List[List[int]], direction, offset=1e-5):
739
508
  """
740
- Generate a 1D projection histogram from bounding boxes along a specified axis.
509
+ Check if there is any complete containment in the x-direction
510
+ between the bounding boxes and split the containing box accordingly.
741
511
 
742
512
  Args:
743
- boxes: A (N, 4) array of bounding boxes defined by [x_min, y_min, x_max, y_max].
744
- axis: Axis for projection; 0 for horizontal (x-axis), 1 for vertical (y-axis).
745
-
513
+ spans (list of lists): Each element is a list containing an ndarray of length 4, a text string, and a label.
514
+ direction: 'horizontal' or 'vertical', indicating whether the spans are arranged horizontally or vertically.
515
+ offset (float): A small offset value to ensure that the split boxes are not too close to the original boxes.
746
516
  Returns:
747
- A 1D numpy array representing the projection histogram based on bounding box intervals.
748
- """
749
- assert axis in [0, 1]
750
- max_length = np.max(boxes[:, axis::2])
751
- projection = np.zeros(max_length, dtype=int)
752
-
753
- # Increment projection histogram over the interval defined by each bounding box
754
- for start, end in boxes[:, axis::2]:
755
- projection[start:end] += 1
756
-
757
- return projection
758
-
759
-
760
- def _split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
517
+ A new list of boxes, including split boxes, with the same `rec_text` and `label` attributes.
761
518
  """
762
- Split the projection profile into segments based on specified thresholds.
763
-
764
- Args:
765
- arr_values: 1D array representing the projection profile.
766
- min_value: Minimum value threshold to consider a profile segment significant.
767
- min_gap: Minimum gap width to consider a separation between segments.
768
519
 
769
- Returns:
770
- A tuple of start and end indices for each segment that meets the criteria.
771
- """
772
- # Identify indices where the projection exceeds the minimum value
773
- significant_indices = np.where(arr_values > min_value)[0]
774
- if not len(significant_indices):
775
- return
776
-
777
- # Calculate gaps between significant indices
778
- index_diffs = significant_indices[1:] - significant_indices[:-1]
779
- gap_indices = np.where(index_diffs > min_gap)[0]
780
-
781
- # Determine start and end indices of segments
782
- segment_starts = np.insert(
783
- significant_indices[gap_indices + 1],
784
- 0,
785
- significant_indices[0],
786
- )
787
- segment_ends = np.append(
788
- significant_indices[gap_indices],
789
- significant_indices[-1] + 1,
790
- )
520
+ def is_projection_contained(box_a, box_b, start_idx, end_idx):
521
+ """Check if box_a completely contains box_b in the x-direction."""
522
+ return box_a[start_idx] <= box_b[start_idx] and box_a[end_idx] >= box_b[end_idx]
791
523
 
792
- return segment_starts, segment_ends
524
+ new_boxes = []
525
+ if direction == "horizontal":
526
+ projection_start_index, projection_end_index = 0, 2
527
+ else:
528
+ projection_start_index, projection_end_index = 1, 3
793
529
 
530
+ for i in range(len(spans)):
531
+ span = spans[i]
532
+ is_split = False
533
+ for j in range(i, len(spans)):
534
+ box_b = spans[j][0]
535
+ box_a, text, label = span
536
+ if is_projection_contained(
537
+ box_a, box_b, projection_start_index, projection_end_index
538
+ ):
539
+ is_split = True
540
+ # Split box_a based on the x-coordinates of box_b
541
+ if box_a[projection_start_index] < box_b[projection_start_index]:
542
+ w = (
543
+ box_b[projection_start_index]
544
+ - offset
545
+ - box_a[projection_start_index]
546
+ )
547
+ if w > 1:
548
+ new_bbox = box_a.copy()
549
+ new_bbox[projection_end_index] = (
550
+ box_b[projection_start_index] - offset
551
+ )
552
+ new_boxes.append(
553
+ [
554
+ np.array(new_bbox),
555
+ text,
556
+ label,
557
+ ]
558
+ )
559
+ if box_a[projection_end_index] > box_b[projection_end_index]:
560
+ w = (
561
+ box_a[projection_end_index]
562
+ - box_b[projection_end_index]
563
+ + offset
564
+ )
565
+ if w > 1:
566
+ box_a[projection_start_index] = (
567
+ box_b[projection_end_index] + offset
568
+ )
569
+ span = [
570
+ np.array(box_a),
571
+ text,
572
+ label,
573
+ ]
574
+ if j == len(spans) - 1 and is_split:
575
+ new_boxes.append(span)
576
+ if not is_split:
577
+ new_boxes.append(span)
794
578
 
795
- def _recursive_yx_cut(
796
- boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
797
- ):
798
- """
799
- Recursively project and segment bounding boxes, starting with Y-axis and followed by X-axis.
579
+ return new_boxes
800
580
 
801
- Args:
802
- boxes: A (N, 4) array representing bounding boxes.
803
- indices: List of indices indicating the original position of boxes.
804
- res: List to store indices of the final segmented bounding boxes.
805
- min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
806
581
 
807
- Returns:
808
- None: This function modifies the `res` list in place.
582
+ def remove_extra_space(input_text: str) -> str:
809
583
  """
810
- assert len(boxes) == len(
811
- indices
812
- ), "The length of boxes and indices must be the same."
813
-
814
- # Sort by y_min for Y-axis projection
815
- y_sorted_indices = boxes[:, 1].argsort()
816
- y_sorted_boxes = boxes[y_sorted_indices]
817
- y_sorted_indices = np.array(indices)[y_sorted_indices]
818
-
819
- # Perform Y-axis projection
820
- y_projection = _projection_by_bboxes(boxes=y_sorted_boxes, axis=1)
821
- y_intervals = _split_projection_profile(y_projection, 0, 1)
822
-
823
- if not y_intervals:
824
- return
825
-
826
- # Process each segment defined by Y-axis projection
827
- for y_start, y_end in zip(*y_intervals):
828
- # Select boxes within the current y interval
829
- y_interval_indices = (y_start <= y_sorted_boxes[:, 1]) & (
830
- y_sorted_boxes[:, 1] < y_end
831
- )
832
- y_boxes_chunk = y_sorted_boxes[y_interval_indices]
833
- y_indices_chunk = y_sorted_indices[y_interval_indices]
834
-
835
- # Sort by x_min for X-axis projection
836
- x_sorted_indices = y_boxes_chunk[:, 0].argsort()
837
- x_sorted_boxes_chunk = y_boxes_chunk[x_sorted_indices]
838
- x_sorted_indices_chunk = y_indices_chunk[x_sorted_indices]
839
-
840
- # Perform X-axis projection
841
- x_projection = _projection_by_bboxes(boxes=x_sorted_boxes_chunk, axis=0)
842
- x_intervals = _split_projection_profile(x_projection, 0, min_gap)
843
-
844
- if not x_intervals:
845
- continue
846
-
847
- # If X-axis cannot be further segmented, add current indices to results
848
- if len(x_intervals[0]) == 1:
849
- res.extend(x_sorted_indices_chunk)
850
- continue
851
-
852
- # Recursively process each segment defined by X-axis projection
853
- for x_start, x_end in zip(*x_intervals):
854
- x_interval_indices = (x_start <= x_sorted_boxes_chunk[:, 0]) & (
855
- x_sorted_boxes_chunk[:, 0] < x_end
856
- )
857
- _recursive_yx_cut(
858
- x_sorted_boxes_chunk[x_interval_indices],
859
- x_sorted_indices_chunk[x_interval_indices],
860
- res,
861
- )
862
-
584
+ Process the input text to handle spaces.
863
585
 
864
- def _recursive_xy_cut(
865
- boxes: np.ndarray, indices: List[int], res: List[int], min_gap: int = 1
866
- ):
867
- """
868
- Recursively performs X-axis projection followed by Y-axis projection to segment bounding boxes.
586
+ The function removes multiple consecutive spaces between Chinese characters and ensures that
587
+ only a single space is retained between Chinese and non-Chinese characters.
869
588
 
870
589
  Args:
871
- boxes: A (N, 4) array representing bounding boxes with [x_min, y_min, x_max, y_max].
872
- indices: A list of indices representing the position of boxes in the original data.
873
- res: A list to store indices of bounding boxes that meet the criteria.
874
- min_gap (int): Minimum gap width to consider a separation between segments on the X-axis. Defaults to 1.
590
+ input_text (str): The text to be processed.
875
591
 
876
592
  Returns:
877
- None: This function modifies the `res` list in place.
593
+ str: The processed text with properly formatted spaces.
878
594
  """
879
- # Ensure boxes and indices have the same length
880
- assert len(boxes) == len(
881
- indices
882
- ), "The length of boxes and indices must be the same."
883
-
884
- # Sort by x_min to prepare for X-axis projection
885
- x_sorted_indices = boxes[:, 0].argsort()
886
- x_sorted_boxes = boxes[x_sorted_indices]
887
- x_sorted_indices = np.array(indices)[x_sorted_indices]
888
-
889
- # Perform X-axis projection
890
- x_projection = _projection_by_bboxes(boxes=x_sorted_boxes, axis=0)
891
- x_intervals = _split_projection_profile(x_projection, 0, 1)
892
-
893
- if not x_intervals:
894
- return
895
-
896
- # Process each segment defined by X-axis projection
897
- for x_start, x_end in zip(*x_intervals):
898
- # Select boxes within the current x interval
899
- x_interval_indices = (x_start <= x_sorted_boxes[:, 0]) & (
900
- x_sorted_boxes[:, 0] < x_end
901
- )
902
- x_boxes_chunk = x_sorted_boxes[x_interval_indices]
903
- x_indices_chunk = x_sorted_indices[x_interval_indices]
904
-
905
- # Sort selected boxes by y_min to prepare for Y-axis projection
906
- y_sorted_indices = x_boxes_chunk[:, 1].argsort()
907
- y_sorted_boxes_chunk = x_boxes_chunk[y_sorted_indices]
908
- y_sorted_indices_chunk = x_indices_chunk[y_sorted_indices]
909
-
910
- # Perform Y-axis projection
911
- y_projection = _projection_by_bboxes(boxes=y_sorted_boxes_chunk, axis=1)
912
- y_intervals = _split_projection_profile(y_projection, 0, min_gap)
913
-
914
- if not y_intervals:
915
- continue
916
-
917
- # If Y-axis cannot be further segmented, add current indices to results
918
- if len(y_intervals[0]) == 1:
919
- res.extend(y_sorted_indices_chunk)
920
- continue
921
-
922
- # Recursively process each segment defined by Y-axis projection
923
- for y_start, y_end in zip(*y_intervals):
924
- y_interval_indices = (y_start <= y_sorted_boxes_chunk[:, 1]) & (
925
- y_sorted_boxes_chunk[:, 1] < y_end
926
- )
927
- _recursive_xy_cut(
928
- y_sorted_boxes_chunk[y_interval_indices],
929
- y_sorted_indices_chunk[y_interval_indices],
930
- res,
931
- )
932
595
 
596
+ # Remove spaces between Chinese characters
597
+ text_without_spaces = re.sub(
598
+ r"(?<=[\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])", "", input_text
599
+ )
933
600
 
934
- def sort_by_xycut(
935
- block_bboxes: Union[np.ndarray, List[List[int]]],
936
- direction: int = 0,
937
- min_gap: int = 1,
938
- ) -> List[int]:
939
- """
940
- Sort bounding boxes using recursive XY cut method based on the specified direction.
601
+ # Ensure single space between Chinese and non-Chinese characters
602
+ text_with_single_spaces = re.sub(
603
+ r"(?<=[\u4e00-\u9fff])\s+(?=[^\u4e00-\u9fff])|(?<=[^\u4e00-\u9fff])\s+(?=[\u4e00-\u9fff])",
604
+ " ",
605
+ text_without_spaces,
606
+ )
941
607
 
942
- Args:
943
- block_bboxes (Union[np.ndarray, List[List[int]]]): An array or list of bounding boxes,
944
- where each box is represented as
945
- [x_min, y_min, x_max, y_max].
946
- direction (int): Direction for the initial cut. Use 1 for Y-axis first and 0 for X-axis first.
947
- Defaults to 0.
948
- min_gap (int): Minimum gap width to consider a separation between segments. Defaults to 1.
608
+ # Reduce any remaining consecutive spaces to a single space
609
+ final_text = re.sub(r"\s+", " ", text_with_single_spaces).strip()
949
610
 
950
- Returns:
951
- List[int]: A list of indices representing the order of sorted bounding boxes.
952
- """
953
- block_bboxes = np.asarray(block_bboxes).astype(int)
954
- res = []
955
- if direction == 1:
956
- _recursive_yx_cut(
957
- block_bboxes,
958
- np.arange(len(block_bboxes)).tolist(),
959
- res,
960
- min_gap,
961
- )
962
- else:
963
- _recursive_xy_cut(
964
- block_bboxes,
965
- np.arange(len(block_bboxes)).tolist(),
966
- res,
967
- min_gap,
968
- )
969
- return res
611
+ return final_text
970
612
 
971
613
 
972
614
  def gather_imgs(original_img, layout_det_objs):
973
615
  imgs_in_doc = []
974
616
  for det_obj in layout_det_objs:
975
- if det_obj["label"] in ("image", "chart"):
617
+ if det_obj["label"] in BLOCK_LABEL_MAP["image_labels"]:
618
+ label = det_obj["label"]
976
619
  x_min, y_min, x_max, y_max = list(map(int, det_obj["coordinate"]))
977
- img_path = f"imgs/img_in_table_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
620
+ img_path = f"imgs/img_in_{label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
978
621
  img = Image.fromarray(original_img[y_min:y_max, x_min:x_max, ::-1])
979
622
  imgs_in_doc.append(
980
623
  {
@@ -1008,10 +651,10 @@ def _get_minbox_if_overlap_by_ratio(
1008
651
  The selected bounding box or None if the overlap ratio is not exceeded.
1009
652
  """
1010
653
  # Calculate the areas of both bounding boxes
1011
- area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
1012
- area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
654
+ area1 = caculate_bbox_area(bbox1)
655
+ area2 = caculate_bbox_area(bbox2)
1013
656
  # Calculate the overlap ratio using a helper function
1014
- overlap_ratio = _calculate_overlap_area_div_minbox_area_ratio(bbox1, bbox2)
657
+ overlap_ratio = calculate_overlap_ratio(bbox1, bbox2, mode="small")
1015
658
  # Check if the overlap ratio exceeds the threshold
1016
659
  if overlap_ratio > ratio:
1017
660
  if (area1 <= area2 and smaller) or (area1 >= area2 and not smaller):
@@ -1021,7 +664,7 @@ def _get_minbox_if_overlap_by_ratio(
1021
664
  return None
1022
665
 
1023
666
 
1024
- def _remove_overlap_blocks(
667
+ def remove_overlap_blocks(
1025
668
  blocks: List[Dict[str, List[int]]], threshold: float = 0.65, smaller: bool = True
1026
669
  ) -> Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
1027
670
  """
@@ -1036,13 +679,13 @@ def _remove_overlap_blocks(
1036
679
  Tuple[List[Dict[str, List[int]]], List[Dict[str, List[int]]]]:
1037
680
  A tuple containing the updated list of blocks and a list of dropped blocks.
1038
681
  """
1039
- dropped_blocks = []
1040
682
  dropped_indexes = set()
1041
-
683
+ blocks = deepcopy(blocks)
684
+ overlap_image_blocks = []
1042
685
  # Iterate over each pair of blocks to find overlaps
1043
- for i, block1 in enumerate(blocks):
1044
- for j in range(i + 1, len(blocks)):
1045
- block2 = blocks[j]
686
+ for i, block1 in enumerate(blocks["boxes"]):
687
+ for j in range(i + 1, len(blocks["boxes"])):
688
+ block2 = blocks["boxes"][j]
1046
689
  # Skip blocks that are already marked for removal
1047
690
  if i in dropped_indexes or j in dropped_indexes:
1048
691
  continue
@@ -1054,1332 +697,255 @@ def _remove_overlap_blocks(
1054
697
  smaller=smaller,
1055
698
  )
1056
699
  if overlap_box_index is not None:
1057
- # Determine which block to remove based on overlap_box_index
1058
- if overlap_box_index == 1:
1059
- drop_index = i
700
+ is_block1_image = block1["label"] == "image"
701
+ is_block2_image = block2["label"] == "image"
702
+
703
+ if is_block1_image != is_block2_image:
704
+ # 如果只有一个块在视觉标签中,删除在视觉标签中的那个块
705
+ drop_index = i if is_block1_image else j
706
+ overlap_image_blocks.append(blocks["boxes"][drop_index])
1060
707
  else:
1061
- drop_index = j
708
+ # 如果两个块都在或都不在视觉标签中,根据 overlap_box_index 决定删除哪个块
709
+ drop_index = i if overlap_box_index == 1 else j
710
+
1062
711
  dropped_indexes.add(drop_index)
1063
712
 
1064
713
  # Remove marked blocks from the original list
1065
714
  for index in sorted(dropped_indexes, reverse=True):
1066
- dropped_blocks.append(blocks[index])
1067
- del blocks[index]
1068
-
1069
- return blocks, dropped_blocks
1070
-
715
+ del blocks["boxes"][index]
1071
716
 
1072
- def _get_text_median_width(blocks: List[Dict[str, any]]) -> float:
1073
- """
1074
- Calculate the median width of blocks labeled as "text".
1075
-
1076
- Args:
1077
- blocks (List[Dict[str, any]]): List of block dictionaries, each containing a 'block_bbox' and 'label'.
1078
-
1079
- Returns:
1080
- float: The median width of text blocks, or infinity if no text blocks are found.
1081
- """
1082
- widths = [
1083
- block["block_bbox"][2] - block["block_bbox"][0]
1084
- for block in blocks
1085
- if block.get("block_label") == "text"
1086
- ]
1087
- return np.median(widths) if widths else float("inf")
717
+ return blocks
1088
718
 
1089
719
 
1090
- def _get_layout_property(
1091
- blocks: List[Dict[str, any]],
1092
- median_width: float,
1093
- no_mask_labels: List[str],
1094
- threshold: float = 0.8,
1095
- ) -> Tuple[List[Dict[str, any]], bool]:
720
+ def get_bbox_intersection(bbox1, bbox2, return_format="bbox"):
1096
721
  """
1097
- Determine the layout (single or double column) of text blocks.
722
+ Compute the intersection of two bounding boxes, supporting both 4-coordinate and 8-coordinate formats.
1098
723
 
1099
724
  Args:
1100
- blocks (List[Dict[str, any]]): List of block dictionaries containing 'label' and 'block_bbox'.
1101
- median_width (float): Median width of text blocks.
1102
- no_mask_labels (List[str]): Labels of blocks to be considered for layout analysis.
1103
- threshold (float): Threshold for determining layout overlap.
725
+ bbox1 (tuple): The first bounding box, either in 4-coordinate format (x_min, y_min, x_max, y_max)
726
+ or 8-coordinate format (x1, y1, x2, y2, x3, y3, x4, y4).
727
+ bbox2 (tuple): The second bounding box in the same format as bbox1.
728
+ return_format (str): The format of the output intersection, either 'bbox' or 'poly'.
1104
729
 
1105
730
  Returns:
1106
- Tuple[List[Dict[str, any]], bool]: Updated list of blocks with layout information and a boolean
1107
- indicating if the double layout area is greater than the single layout area.
1108
- """
1109
- blocks.sort(
1110
- key=lambda x: (
1111
- x["block_bbox"][0],
1112
- (x["block_bbox"][2] - x["block_bbox"][0]),
1113
- ),
1114
- )
1115
- check_single_layout = {}
1116
- page_min_x, page_max_x = float("inf"), 0
1117
- double_label_area = 0
1118
- single_label_area = 0
1119
-
1120
- for i, block in enumerate(blocks):
1121
- page_min_x = min(page_min_x, block["block_bbox"][0])
1122
- page_max_x = max(page_max_x, block["block_bbox"][2])
1123
- page_width = page_max_x - page_min_x
1124
-
1125
- for i, block in enumerate(blocks):
1126
- if block["block_label"] not in no_mask_labels:
1127
- continue
1128
-
1129
- x_min_i, _, x_max_i, _ = block["block_bbox"]
1130
- layout_length = x_max_i - x_min_i
1131
- cover_count, cover_with_threshold_count = 0, 0
1132
- match_block_with_threshold_indexes = []
1133
-
1134
- for j, other_block in enumerate(blocks):
1135
- if i == j or other_block["block_label"] not in no_mask_labels:
1136
- continue
1137
-
1138
- x_min_j, _, x_max_j, _ = other_block["block_bbox"]
1139
- x_match_min, x_match_max = max(
1140
- x_min_i,
1141
- x_min_j,
1142
- ), min(x_max_i, x_max_j)
1143
- match_block_iou = (x_match_max - x_match_min) / (x_max_j - x_min_j)
1144
-
1145
- if match_block_iou > 0:
1146
- cover_count += 1
1147
- if match_block_iou > threshold:
1148
- cover_with_threshold_count += 1
1149
- match_block_with_threshold_indexes.append(
1150
- (j, match_block_iou),
1151
- )
1152
- x_min_i = x_match_max
1153
- if x_min_i >= x_max_i:
1154
- break
1155
-
1156
- if (
1157
- layout_length > median_width * 1.3
1158
- and (cover_with_threshold_count >= 2 or cover_count >= 2)
1159
- ) or layout_length > 0.6 * page_width:
1160
- # if layout_length > median_width * 1.3 and (cover_with_threshold_count >= 2):
1161
- block["layout"] = "double"
1162
- double_label_area += (block["block_bbox"][2] - block["block_bbox"][0]) * (
1163
- block["block_bbox"][3] - block["block_bbox"][1]
1164
- )
1165
- else:
1166
- block["layout"] = "single"
1167
- check_single_layout[i] = match_block_with_threshold_indexes
1168
-
1169
- # Check single-layout block
1170
- for i, single_layout in check_single_layout.items():
1171
- if single_layout:
1172
- index, match_iou = single_layout[-1]
1173
- if match_iou > 0.9 and blocks[index]["layout"] == "double":
1174
- blocks[i]["layout"] = "double"
1175
- double_label_area += (
1176
- blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
1177
- ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
1178
- else:
1179
- single_label_area += (
1180
- blocks[i]["block_bbox"][2] - blocks[i]["block_bbox"][0]
1181
- ) * (blocks[i]["block_bbox"][3] - blocks[i]["block_bbox"][1])
1182
-
1183
- return blocks, (double_label_area > single_label_area)
1184
-
1185
-
1186
- def _get_bbox_direction(input_bbox: List[float], ratio: float = 1.0) -> bool:
1187
- """
1188
- Determine if a bounding box is horizontal or vertical.
1189
-
1190
- Args:
1191
- input_bbox (List[float]): Bounding box [x_min, y_min, x_max, y_max].
1192
- ratio (float): Ratio for determining orientation. Default is 1.0.
1193
-
1194
- Returns:
1195
- bool: True if the bounding box is considered horizontal, False if vertical.
1196
- """
1197
- width = input_bbox[2] - input_bbox[0]
1198
- height = input_bbox[3] - input_bbox[1]
1199
- return width * ratio >= height
1200
-
1201
-
1202
- def _get_projection_iou(
1203
- input_bbox: List[float], match_bbox: List[float], is_horizontal: bool = True
1204
- ) -> float:
1205
- """
1206
- Calculate the IoU of lines between two bounding boxes.
1207
-
1208
- Args:
1209
- input_bbox (List[float]): First bounding box [x_min, y_min, x_max, y_max].
1210
- match_bbox (List[float]): Second bounding box [x_min, y_min, x_max, y_max].
1211
- is_horizontal (bool): Whether to compare horizontally or vertically.
1212
-
1213
- Returns:
1214
- float: Line IoU. Returns 0 if there is no overlap.
1215
- """
1216
- if is_horizontal:
1217
- x_match_min = max(input_bbox[0], match_bbox[0])
1218
- x_match_max = min(input_bbox[2], match_bbox[2])
1219
- overlap = max(0, x_match_max - x_match_min)
1220
- input_width = min(input_bbox[2] - input_bbox[0], match_bbox[2] - match_bbox[0])
731
+ tuple or None: The intersection bounding box in the specified format, or None if there is no intersection.
732
+ """
733
+ bbox1 = np.array(bbox1)
734
+ bbox2 = np.array(bbox2)
735
+ # Convert both bounding boxes to rectangles
736
+ rect1 = bbox1 if len(bbox1.shape) == 1 else convert_points_to_boxes([bbox1])[0]
737
+ rect2 = bbox2 if len(bbox2.shape) == 1 else convert_points_to_boxes([bbox2])[0]
738
+
739
+ # Calculate the intersection rectangle
740
+
741
+ x_min_inter = max(rect1[0], rect2[0])
742
+ y_min_inter = max(rect1[1], rect2[1])
743
+ x_max_inter = min(rect1[2], rect2[2])
744
+ y_max_inter = min(rect1[3], rect2[3])
745
+
746
+ # Check if there is an intersection
747
+ if x_min_inter >= x_max_inter or y_min_inter >= y_max_inter:
748
+ return None
749
+
750
+ if return_format == "bbox":
751
+ return np.array([x_min_inter, y_min_inter, x_max_inter, y_max_inter])
752
+ elif return_format == "poly":
753
+ return np.array(
754
+ [
755
+ [x_min_inter, y_min_inter],
756
+ [x_max_inter, y_min_inter],
757
+ [x_max_inter, y_max_inter],
758
+ [x_min_inter, y_max_inter],
759
+ ],
760
+ dtype=np.int16,
761
+ )
1221
762
  else:
1222
- y_match_min = max(input_bbox[1], match_bbox[1])
1223
- y_match_max = min(input_bbox[3], match_bbox[3])
1224
- overlap = max(0, y_match_max - y_match_min)
1225
- input_width = min(input_bbox[3] - input_bbox[1], match_bbox[3] - match_bbox[1])
1226
-
1227
- return overlap / input_width if input_width > 0 else 0.0
763
+ raise ValueError("return_format must be either 'bbox' or 'poly'.")
1228
764
 
1229
765
 
1230
- def _get_sub_category(
1231
- blocks: List[Dict[str, Any]], title_labels: List[str]
1232
- ) -> Tuple[List[Dict[str, Any]], List[float]]:
766
+ def shrink_supplement_region_bbox(
767
+ supplement_region_bbox,
768
+ ref_region_bbox,
769
+ image_width,
770
+ image_height,
771
+ block_idxes_set,
772
+ block_bboxes,
773
+ ) -> List:
1233
774
  """
1234
- Determine the layout of title and text blocks and collect pre_cuts.
775
+ Shrink the supplement region bbox according to the reference region bbox and match the block bboxes.
1235
776
 
1236
777
  Args:
1237
- blocks (List[Dict[str, Any]]): List of block dictionaries.
1238
- title_labels (List[str]): List of labels considered as titles.
778
+ supplement_region_bbox (list): The supplement region bbox.
779
+ ref_region_bbox (list): The reference region bbox.
780
+ image_width (int): The width of the image.
781
+ image_height (int): The height of the image.
782
+ block_idxes_set (set): The indexes of the blocks that intersect with the region bbox.
783
+ block_bboxes (dict): The dictionary of block bboxes.
1239
784
 
1240
785
  Returns:
1241
- List[Dict[str, Any]]: Updated list of blocks with title-text layout information.
1242
- Dict[float]: Dict of pre_cuts coordinates.
1243
- """
1244
-
1245
- sub_title_labels = ["paragraph_title"]
1246
- vision_labels = ["image", "table", "chart", "figure"]
1247
- vision_title_labels = ["figure_title", "chart_title", "table_title"]
1248
- all_labels = title_labels + sub_title_labels + vision_labels + vision_title_labels
1249
- special_pre_cut_labels = sub_title_labels
1250
-
1251
- # single doc title is irregular,pre cut not applicable
1252
- num_doc_title = 0
1253
- for block in blocks:
1254
- if block["block_label"] == "doc_title":
1255
- num_doc_title += 1
1256
- if num_doc_title == 2:
1257
- special_pre_cut_labels = title_labels + sub_title_labels
1258
- break
1259
- if len(blocks) == 0:
1260
- return blocks, {}
1261
-
1262
- min_x = min(block["block_bbox"][0] for block in blocks)
1263
- min_y = min(block["block_bbox"][1] for block in blocks)
1264
- max_x = max(block["block_bbox"][2] for block in blocks)
1265
- max_y = max(block["block_bbox"][3] for block in blocks)
1266
- region_bbox = (min_x, min_y, max_x, max_y)
1267
- region_x_center = (region_bbox[0] + region_bbox[2]) / 2
1268
- region_y_center = (region_bbox[1] + region_bbox[3]) / 2
1269
- region_width = region_bbox[2] - region_bbox[0]
1270
- region_height = region_bbox[3] - region_bbox[1]
1271
-
1272
- pre_cuts = {}
1273
-
1274
- for i, block1 in enumerate(blocks):
1275
- block1.setdefault("title_text", [])
1276
- block1.setdefault("sub_title", [])
1277
- block1.setdefault("vision_footnote", [])
1278
- block1.setdefault("sub_label", block1["block_label"])
1279
-
1280
- if block1["block_label"] not in all_labels:
1281
- continue
1282
-
1283
- bbox1 = block1["block_bbox"]
1284
- x1, y1, x2, y2 = bbox1
1285
- is_horizontal_1 = _get_bbox_direction(block1["block_bbox"])
1286
- left_up_title_text_distance = float("inf")
1287
- left_up_title_text_index = -1
1288
- left_up_title_text_direction = None
1289
- right_down_title_text_distance = float("inf")
1290
- right_down_title_text_index = -1
1291
- right_down_title_text_direction = None
1292
-
1293
- # pre-cuts
1294
- # Condition 1: Length is greater than half of the layout region
1295
- if is_horizontal_1:
1296
- block_length = x2 - x1
1297
- required_length = region_width / 2
1298
- else:
1299
- block_length = y2 - y1
1300
- required_length = region_height / 2
1301
- if block1["block_label"] in special_pre_cut_labels:
1302
- length_condition = True
1303
- else:
1304
- length_condition = block_length > required_length
1305
-
1306
- # Condition 2: Centered check (must be within ±20 in both horizontal and vertical directions)
1307
- block_x_center = (x1 + x2) / 2
1308
- block_y_center = (y1 + y2) / 2
1309
- tolerance_len = block_length // 5
1310
- if block1["block_label"] in special_pre_cut_labels:
1311
- tolerance_len = block_length // 10
1312
- if is_horizontal_1:
1313
- is_centered = abs(block_x_center - region_x_center) <= tolerance_len
1314
- else:
1315
- is_centered = abs(block_y_center - region_y_center) <= tolerance_len
1316
-
1317
- # Condition 3: Check for surrounding text
1318
- has_left_text = False
1319
- has_right_text = False
1320
- has_above_text = False
1321
- has_below_text = False
1322
- for block2 in blocks:
1323
- if block2["block_label"] != "text":
1324
- continue
1325
- bbox2 = block2["block_bbox"]
1326
- x1_2, y1_2, x2_2, y2_2 = bbox2
1327
- if is_horizontal_1:
1328
- if x2_2 <= x1 and not (y2_2 <= y1 or y1_2 >= y2):
1329
- has_left_text = True
1330
- if x1_2 >= x2 and not (y2_2 <= y1 or y1_2 >= y2):
1331
- has_right_text = True
1332
- else:
1333
- if y2_2 <= y1 and not (x2_2 <= x1 or x1_2 >= x2):
1334
- has_above_text = True
1335
- if y1_2 >= y2 and not (x2_2 <= x1 or x1_2 >= x2):
1336
- has_below_text = True
1337
-
1338
- if (is_horizontal_1 and has_left_text and has_right_text) or (
1339
- not is_horizontal_1 and has_above_text and has_below_text
1340
- ):
1341
- break
1342
-
1343
- no_text_on_sides = (
1344
- not (has_left_text or has_right_text)
1345
- if is_horizontal_1
1346
- else not (has_above_text or has_below_text)
1347
- )
1348
-
1349
- # Add coordinates if all conditions are met
1350
- if is_centered and length_condition and no_text_on_sides:
1351
- if is_horizontal_1:
1352
- pre_cuts.setdefault("y", []).append(y1)
1353
- else:
1354
- pre_cuts.setdefault("x", []).append(x1)
1355
-
1356
- for j, block2 in enumerate(blocks):
1357
- if i == j:
1358
- continue
1359
-
1360
- bbox2 = block2["block_bbox"]
1361
- x1_prime, y1_prime, x2_prime, y2_prime = bbox2
1362
- is_horizontal_2 = _get_bbox_direction(bbox2)
1363
- match_block_iou = _get_projection_iou(
1364
- bbox2,
1365
- bbox1,
1366
- is_horizontal_1,
786
+ list: The new region bbox and the matched block idxes.
787
+ """
788
+ x1, y1, x2, y2 = supplement_region_bbox
789
+ x1_prime, y1_prime, x2_prime, y2_prime = ref_region_bbox
790
+ index_conversion_map = {0: 2, 1: 3, 2: 0, 3: 1}
791
+ edge_distance_list = [
792
+ (x1_prime - x1) / image_width,
793
+ (y1_prime - y1) / image_height,
794
+ (x2 - x2_prime) / image_width,
795
+ (y2 - y2_prime) / image_height,
796
+ ]
797
+ edge_distance_list_tmp = edge_distance_list[:]
798
+ min_distance = min(edge_distance_list)
799
+ src_index = index_conversion_map[edge_distance_list.index(min_distance)]
800
+ if len(block_idxes_set) == 0:
801
+ return supplement_region_bbox, []
802
+ for _ in range(3):
803
+ dst_index = index_conversion_map[src_index]
804
+ tmp_region_bbox = supplement_region_bbox[:]
805
+ tmp_region_bbox[dst_index] = ref_region_bbox[src_index]
806
+ iner_block_idxes, split_block_idxes = [], []
807
+ for block_idx in block_idxes_set:
808
+ overlap_ratio = calculate_overlap_ratio(
809
+ tmp_region_bbox, block_bboxes[block_idx], mode="small"
1367
810
  )
1368
-
1369
- def distance_(is_horizontal, is_left_up):
1370
- if is_horizontal:
1371
- if is_left_up:
1372
- return (y1 - y2_prime + 2) // 5 + x1_prime / 5000
1373
- else:
1374
- return (y1_prime - y2 + 2) // 5 + x1_prime / 5000
1375
-
1376
- else:
1377
- if is_left_up:
1378
- return (x1 - x2_prime + 2) // 5 + y1_prime / 5000
1379
- else:
1380
- return (x1_prime - x2 + 2) // 5 + y1_prime / 5000
1381
-
1382
- block_iou_threshold = 0.1
1383
- if block1["block_label"] in sub_title_labels:
1384
- block_iou_threshold = 0.5
1385
-
1386
- if is_horizontal_1:
1387
- if match_block_iou >= block_iou_threshold:
1388
- left_up_distance = distance_(True, True)
1389
- right_down_distance = distance_(True, False)
1390
- if (
1391
- y2_prime <= y1
1392
- and left_up_distance <= left_up_title_text_distance
1393
- ):
1394
- left_up_title_text_distance = left_up_distance
1395
- left_up_title_text_index = j
1396
- left_up_title_text_direction = is_horizontal_2
1397
- elif (
1398
- y1_prime > y2
1399
- and right_down_distance < right_down_title_text_distance
1400
- ):
1401
- right_down_title_text_distance = right_down_distance
1402
- right_down_title_text_index = j
1403
- right_down_title_text_direction = is_horizontal_2
1404
- else:
1405
- if match_block_iou >= block_iou_threshold:
1406
- left_up_distance = distance_(False, True)
1407
- right_down_distance = distance_(False, False)
1408
- if (
1409
- x2_prime <= x1
1410
- and left_up_distance <= left_up_title_text_distance
1411
- ):
1412
- left_up_title_text_distance = left_up_distance
1413
- left_up_title_text_index = j
1414
- left_up_title_text_direction = is_horizontal_2
1415
- elif (
1416
- x1_prime > x2
1417
- and right_down_distance < right_down_title_text_distance
1418
- ):
1419
- right_down_title_text_distance = right_down_distance
1420
- right_down_title_text_index = j
1421
- right_down_title_text_direction = is_horizontal_2
1422
-
1423
- height = bbox1[3] - bbox1[1]
1424
- width = bbox1[2] - bbox1[0]
1425
- title_text_weight = [0.8, 0.8]
1426
-
1427
- title_text, sub_title, vision_footnote = [], [], []
1428
-
1429
- def get_sub_category_(
1430
- title_text_direction,
1431
- title_text_index,
1432
- label,
1433
- is_left_up=True,
1434
- ):
1435
- direction_ = [1, 3] if is_left_up else [2, 4]
1436
- if (
1437
- title_text_direction == is_horizontal_1
1438
- and title_text_index != -1
1439
- and (label == "text" or label == "paragraph_title")
811
+ if overlap_ratio > REGION_SETTINGS.get(
812
+ "match_block_overlap_ratio_threshold", 0.8
1440
813
  ):
1441
- bbox2 = blocks[title_text_index]["block_bbox"]
1442
- if is_horizontal_1:
1443
- height1 = bbox2[3] - bbox2[1]
1444
- width1 = bbox2[2] - bbox2[0]
1445
- if label == "text":
1446
- if (
1447
- _nearest_edge_distance(bbox1, bbox2)[0] <= 15
1448
- and block1["block_label"] in vision_labels
1449
- and width1 < width
1450
- and height1 < 0.5 * height
1451
- ):
1452
- blocks[title_text_index]["sub_label"] = "vision_footnote"
1453
- vision_footnote.append(bbox2)
1454
- elif (
1455
- height1 < height * title_text_weight[0]
1456
- and (width1 < width or width1 > 1.5 * width)
1457
- and block1["block_label"] in title_labels
1458
- ):
1459
- blocks[title_text_index]["sub_label"] = "title_text"
1460
- title_text.append((direction_[0], bbox2))
1461
- elif (
1462
- label == "paragraph_title"
1463
- and block1["block_label"] in sub_title_labels
1464
- ):
1465
- sub_title.append(bbox2)
1466
- else:
1467
- height1 = bbox2[3] - bbox2[1]
1468
- width1 = bbox2[2] - bbox2[0]
1469
- if label == "text":
1470
- if (
1471
- _nearest_edge_distance(bbox1, bbox2)[0] <= 15
1472
- and block1["block_label"] in vision_labels
1473
- and height1 < height
1474
- and width1 < 0.5 * width
1475
- ):
1476
- blocks[title_text_index]["sub_label"] = "vision_footnote"
1477
- vision_footnote.append(bbox2)
1478
- elif (
1479
- width1 < width * title_text_weight[1]
1480
- and block1["block_label"] in title_labels
1481
- ):
1482
- blocks[title_text_index]["sub_label"] = "title_text"
1483
- title_text.append((direction_[1], bbox2))
1484
- elif (
1485
- label == "paragraph_title"
1486
- and block1["block_label"] in sub_title_labels
1487
- ):
1488
- sub_title.append(bbox2)
1489
-
1490
- if (
1491
- is_horizontal_1
1492
- and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
1493
- > height
1494
- ) or (
1495
- not is_horizontal_1
1496
- and abs(left_up_title_text_distance - right_down_title_text_distance) * 5
1497
- > width
1498
- ):
1499
- if left_up_title_text_distance < right_down_title_text_distance:
1500
- get_sub_category_(
1501
- left_up_title_text_direction,
1502
- left_up_title_text_index,
1503
- blocks[left_up_title_text_index]["block_label"],
1504
- True,
1505
- )
1506
- else:
1507
- get_sub_category_(
1508
- right_down_title_text_direction,
1509
- right_down_title_text_index,
1510
- blocks[right_down_title_text_index]["block_label"],
1511
- False,
1512
- )
1513
- else:
1514
- get_sub_category_(
1515
- left_up_title_text_direction,
1516
- left_up_title_text_index,
1517
- blocks[left_up_title_text_index]["block_label"],
1518
- True,
1519
- )
1520
- get_sub_category_(
1521
- right_down_title_text_direction,
1522
- right_down_title_text_index,
1523
- blocks[right_down_title_text_index]["block_label"],
1524
- False,
1525
- )
1526
-
1527
- if block1["block_label"] in title_labels:
1528
- if blocks[i].get("title_text") == []:
1529
- blocks[i]["title_text"] = title_text
1530
-
1531
- if block1["block_label"] in sub_title_labels:
1532
- if blocks[i].get("sub_title") == []:
1533
- blocks[i]["sub_title"] = sub_title
1534
-
1535
- if block1["block_label"] in vision_labels:
1536
- if blocks[i].get("vision_footnote") == []:
1537
- blocks[i]["vision_footnote"] = vision_footnote
1538
-
1539
- return blocks, pre_cuts
1540
-
1541
-
1542
- def get_layout_ordering(
1543
- parsing_res_list: List[Dict[str, Any]],
1544
- no_mask_labels: List[str] = [],
1545
- ) -> None:
1546
- """
1547
- Process layout parsing results to remove overlapping bounding boxes
1548
- and assign an ordering index based on their positions.
1549
-
1550
- Modifies:
1551
- The 'parsing_res_list' list by adding an 'index' to each block.
1552
-
1553
- Args:
1554
- parsing_res_list (List[Dict[str, Any]]): List of block dictionaries with 'block_bbox' and 'block_label'.
1555
- no_mask_labels (List[str]): Labels for which overlapping removal is not performed.
1556
- """
1557
- title_text_labels = ["doc_title"]
1558
- title_labels = ["doc_title", "paragraph_title"]
1559
- vision_labels = ["image", "table", "seal", "chart", "figure"]
1560
- vision_title_labels = ["table_title", "chart_title", "figure_title"]
1561
-
1562
- parsing_res_list, pre_cuts = _get_sub_category(parsing_res_list, title_text_labels)
1563
-
1564
- parsing_res_by_pre_cuts_list = []
1565
- if len(pre_cuts) > 0:
1566
- block_bboxes = [block["block_bbox"] for block in parsing_res_list]
1567
- for axis, cuts in pre_cuts.items():
1568
- axis_index = 1 if axis == "y" else 0
1569
-
1570
- max_val = max(bbox[axis_index + 2] for bbox in block_bboxes)
1571
-
1572
- intervals = []
1573
- prev = 0
1574
- for cut in sorted(cuts):
1575
- intervals.append((prev, cut))
1576
- prev = cut
1577
- intervals.append((prev, max_val))
1578
-
1579
- for start, end in intervals:
1580
- mask = [
1581
- (bbox[axis_index] >= start) and (bbox[axis_index] < end)
1582
- for bbox in block_bboxes
1583
- ]
1584
- parsing_res_by_pre_cuts_list.append(
1585
- [parsing_res_list[i] for i, m in enumerate(mask) if m]
1586
- )
1587
- else:
1588
- parsing_res_by_pre_cuts_list = [parsing_res_list]
1589
-
1590
- final_parsing_res_list = []
1591
- num_index = 0
1592
- num_sub_index = 0
1593
- for parsing_res_by_pre_cuts in parsing_res_by_pre_cuts_list:
1594
-
1595
- doc_flag = False
1596
- median_width = _get_text_median_width(parsing_res_by_pre_cuts)
1597
- parsing_res_by_pre_cuts, projection_direction = _get_layout_property(
1598
- parsing_res_by_pre_cuts,
1599
- median_width,
1600
- no_mask_labels=no_mask_labels,
1601
- threshold=0.3,
1602
- )
1603
- # Convert bounding boxes to float and remove overlaps
1604
- (
1605
- double_text_blocks,
1606
- title_text_blocks,
1607
- title_blocks,
1608
- vision_blocks,
1609
- vision_title_blocks,
1610
- vision_footnote_blocks,
1611
- other_blocks,
1612
- ) = ([], [], [], [], [], [], [])
1613
-
1614
- drop_indexes = []
1615
-
1616
- for index, block in enumerate(parsing_res_by_pre_cuts):
1617
- label = block["sub_label"]
1618
- block["block_bbox"] = list(map(int, block["block_bbox"]))
1619
-
1620
- if label == "doc_title":
1621
- doc_flag = True
1622
-
1623
- if label in no_mask_labels:
1624
- if block["layout"] == "double":
1625
- double_text_blocks.append(block)
1626
- drop_indexes.append(index)
1627
- elif label == "title_text":
1628
- title_text_blocks.append(block)
1629
- drop_indexes.append(index)
1630
- elif label == "vision_footnote":
1631
- vision_footnote_blocks.append(block)
1632
- drop_indexes.append(index)
1633
- elif label in vision_title_labels:
1634
- vision_title_blocks.append(block)
1635
- drop_indexes.append(index)
1636
- elif label in title_labels:
1637
- title_blocks.append(block)
1638
- drop_indexes.append(index)
1639
- elif label in vision_labels:
1640
- vision_blocks.append(block)
1641
- drop_indexes.append(index)
1642
- else:
1643
- other_blocks.append(block)
1644
- drop_indexes.append(index)
1645
-
1646
- for index in sorted(drop_indexes, reverse=True):
1647
- del parsing_res_by_pre_cuts[index]
1648
-
1649
- if len(parsing_res_by_pre_cuts) > 0:
1650
- # single text label
1651
- if (
1652
- len(double_text_blocks) > len(parsing_res_by_pre_cuts)
1653
- or projection_direction
814
+ iner_block_idxes.append(block_idx)
815
+ elif overlap_ratio > REGION_SETTINGS.get(
816
+ "split_block_overlap_ratio_threshold", 0.4
1654
817
  ):
1655
- parsing_res_by_pre_cuts.extend(title_blocks + double_text_blocks)
1656
- title_blocks = []
1657
- double_text_blocks = []
1658
- block_bboxes = [
1659
- block["block_bbox"] for block in parsing_res_by_pre_cuts
1660
- ]
1661
- block_bboxes.sort(
1662
- key=lambda x: (
1663
- x[0] // max(20, median_width),
1664
- x[1],
1665
- ),
1666
- )
1667
- block_bboxes = np.array(block_bboxes)
1668
- sorted_indices = sort_by_xycut(block_bboxes, direction=1, min_gap=1)
1669
- else:
1670
- block_bboxes = [
1671
- block["block_bbox"] for block in parsing_res_by_pre_cuts
1672
- ]
1673
- block_bboxes.sort(key=lambda x: (x[0] // 20, x[1]))
1674
- block_bboxes = np.array(block_bboxes)
1675
- sorted_indices = sort_by_xycut(block_bboxes, direction=0, min_gap=20)
1676
-
1677
- sorted_boxes = block_bboxes[sorted_indices].tolist()
1678
-
1679
- for block in parsing_res_by_pre_cuts:
1680
- block["index"] = num_index + sorted_boxes.index(block["block_bbox"]) + 1
1681
- block["sub_index"] = (
1682
- num_sub_index + sorted_boxes.index(block["block_bbox"]) + 1
1683
- )
1684
-
1685
- def nearest_match_(input_blocks, distance_type="manhattan", is_add_index=True):
1686
- for block in input_blocks:
1687
- bbox = block["block_bbox"]
1688
- min_distance = float("inf")
1689
- min_distance_config = [
1690
- [float("inf"), float("inf")],
1691
- float("inf"),
1692
- float("inf"),
1693
- ] # for double text
1694
- nearest_gt_index = 0
1695
- for match_block in parsing_res_by_pre_cuts:
1696
- match_bbox = match_block["block_bbox"]
1697
- if distance_type == "nearest_iou_edge_distance":
1698
- distance, min_distance_config = _nearest_iou_edge_distance(
1699
- bbox,
1700
- match_bbox,
1701
- block["sub_label"],
1702
- vision_labels=vision_labels,
1703
- no_mask_labels=no_mask_labels,
1704
- median_width=median_width,
1705
- title_labels=title_labels,
1706
- title_text=block["title_text"],
1707
- sub_title=block["sub_title"],
1708
- min_distance_config=min_distance_config,
1709
- tolerance_len=10,
1710
- )
1711
- elif distance_type == "title_text":
1712
- if (
1713
- match_block["block_label"] in title_labels + ["abstract"]
1714
- and match_block["title_text"] != []
1715
- ):
1716
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1717
- bbox,
1718
- match_block["title_text"][0][1],
1719
- )
1720
- iou_right_down = (
1721
- _calculate_overlap_area_div_minbox_area_ratio(
1722
- bbox,
1723
- match_block["title_text"][-1][1],
1724
- )
1725
- )
1726
- iou = 1 - max(iou_left_up, iou_right_down)
1727
- distance = _manhattan_distance(bbox, match_bbox) * iou
1728
- else:
1729
- distance = float("inf")
1730
- elif distance_type == "manhattan":
1731
- distance = _manhattan_distance(bbox, match_bbox)
1732
- elif distance_type == "vision_footnote":
1733
- if (
1734
- match_block["block_label"] in vision_labels
1735
- and match_block["vision_footnote"] != []
1736
- ):
1737
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1738
- bbox,
1739
- match_block["vision_footnote"][0],
1740
- )
1741
- iou_right_down = (
1742
- _calculate_overlap_area_div_minbox_area_ratio(
1743
- bbox,
1744
- match_block["vision_footnote"][-1],
1745
- )
1746
- )
1747
- iou = 1 - max(iou_left_up, iou_right_down)
1748
- distance = _manhattan_distance(bbox, match_bbox) * iou
1749
- else:
1750
- distance = float("inf")
1751
- elif distance_type == "vision_body":
1752
- if (
1753
- match_block["block_label"] in vision_title_labels
1754
- and block["vision_footnote"] != []
1755
- ):
1756
- iou_left_up = _calculate_overlap_area_div_minbox_area_ratio(
1757
- match_bbox,
1758
- block["vision_footnote"][0],
1759
- )
1760
- iou_right_down = (
1761
- _calculate_overlap_area_div_minbox_area_ratio(
1762
- match_bbox,
1763
- block["vision_footnote"][-1],
1764
- )
1765
- )
1766
- iou = 1 - max(iou_left_up, iou_right_down)
1767
- distance = _manhattan_distance(bbox, match_bbox) * iou
1768
- else:
1769
- distance = float("inf")
1770
- # when reference block cross mulitple columns, its order should be after the blocks above it.
1771
- elif distance_type == "append":
1772
- if match_bbox[3] <= bbox[1]:
1773
- distance = -(match_bbox[2] * 10 + match_bbox[3])
1774
- else:
1775
- distance = float("inf")
1776
- else:
1777
- raise NotImplementedError
1778
-
1779
- if distance < min_distance:
1780
- min_distance = distance
1781
- if is_add_index:
1782
- nearest_gt_index = match_block.get("index", 999)
1783
- else:
1784
- nearest_gt_index = match_block.get("sub_index", 999)
1785
-
1786
- if is_add_index:
1787
- block["index"] = nearest_gt_index
1788
- else:
1789
- block["sub_index"] = nearest_gt_index
1790
-
1791
- parsing_res_by_pre_cuts.append(block)
1792
-
1793
- # double text label
1794
- double_text_blocks.sort(
1795
- key=lambda x: (
1796
- x["block_bbox"][1] // 10,
1797
- x["block_bbox"][0] // median_width,
1798
- x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
1799
- ),
1800
- )
1801
- # filter the reference blocks from all blocks that cross mulitple columns.
1802
- # they should be ordered using "append".
1803
- double_text_reference_blocks = []
1804
- i = 0
1805
- while i < len(double_text_blocks):
1806
- if double_text_blocks[i]["block_label"] == "reference":
1807
- double_text_reference_blocks.append(double_text_blocks.pop(i))
1808
- else:
1809
- i += 1
1810
- nearest_match_(
1811
- double_text_blocks,
1812
- distance_type="nearest_iou_edge_distance",
1813
- )
1814
- nearest_match_(
1815
- double_text_reference_blocks,
1816
- distance_type="append",
1817
- )
1818
- parsing_res_by_pre_cuts.sort(
1819
- key=lambda x: (x["index"], x["block_bbox"][1], x["block_bbox"][0]),
1820
- )
1821
-
1822
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1823
- block["index"] = num_index + idx + 1
1824
- block["sub_index"] = num_sub_index + idx + 1
1825
-
1826
- # title label
1827
- title_blocks.sort(
1828
- key=lambda x: (
1829
- x["block_bbox"][1] // 10,
1830
- x["block_bbox"][0] // median_width,
1831
- x["block_bbox"][1] ** 2 + x["block_bbox"][0] ** 2,
1832
- ),
1833
- )
1834
- nearest_match_(title_blocks, distance_type="nearest_iou_edge_distance")
1835
-
1836
- if doc_flag:
1837
- text_sort_labels = ["doc_title"]
1838
- text_label_priority = {
1839
- label: priority for priority, label in enumerate(text_sort_labels)
1840
- }
1841
- doc_titles = []
1842
- for i, block in enumerate(parsing_res_by_pre_cuts):
1843
- if block["block_label"] == "doc_title":
1844
- doc_titles.append(
1845
- (i, block["block_bbox"][1], block["block_bbox"][0]),
818
+ split_block_idxes.append(block_idx)
819
+
820
+ if len(iner_block_idxes) > 0:
821
+ if len(split_block_idxes) > 0:
822
+ for split_block_idx in split_block_idxes:
823
+ split_block_bbox = block_bboxes[split_block_idx]
824
+ x1, y1, x2, y2 = tmp_region_bbox
825
+ x1_prime, y1_prime, x2_prime, y2_prime = split_block_bbox
826
+ edge_distance_list = [
827
+ (x1_prime - x1) / image_width,
828
+ (y1_prime - y1) / image_height,
829
+ (x2 - x2_prime) / image_width,
830
+ (y2 - y2_prime) / image_height,
831
+ ]
832
+ max_distance = max(edge_distance_list)
833
+ src_index = edge_distance_list.index(max_distance)
834
+ dst_index = index_conversion_map[src_index]
835
+ tmp_region_bbox[dst_index] = split_block_bbox[src_index]
836
+ tmp_region_bbox, iner_idxes = shrink_supplement_region_bbox(
837
+ tmp_region_bbox,
838
+ ref_region_bbox,
839
+ image_width,
840
+ image_height,
841
+ iner_block_idxes,
842
+ block_bboxes,
1846
843
  )
1847
- doc_titles.sort(key=lambda x: (x[1], x[2]))
1848
- first_doc_title_index = doc_titles[0][0]
1849
- parsing_res_by_pre_cuts[first_doc_title_index]["index"] = 1
1850
- parsing_res_by_pre_cuts.sort(
1851
- key=lambda x: (
1852
- x["index"],
1853
- text_label_priority.get(x["block_label"], 9999),
1854
- x["block_bbox"][1],
1855
- x["block_bbox"][0],
1856
- ),
1857
- )
844
+ if len(iner_idxes) == 0:
845
+ continue
846
+ matched_bboxes = [block_bboxes[idx] for idx in iner_block_idxes]
847
+ supplement_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
848
+ break
1858
849
  else:
1859
- parsing_res_by_pre_cuts.sort(
1860
- key=lambda x: (
1861
- x["index"],
1862
- x["block_bbox"][1],
1863
- x["block_bbox"][0],
1864
- ),
1865
- )
1866
-
1867
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1868
- block["index"] = num_index + idx + 1
1869
- block["sub_index"] = num_sub_index + idx + 1
1870
-
1871
- # title-text label
1872
- nearest_match_(title_text_blocks, distance_type="title_text")
1873
-
1874
- def hor_tb_and_ver_lr(x):
1875
- input_bbox = x["block_bbox"]
1876
- is_horizontal = _get_bbox_direction(input_bbox)
1877
- if is_horizontal:
1878
- return input_bbox[1]
1879
- else:
1880
- return input_bbox[0]
1881
-
1882
- parsing_res_by_pre_cuts.sort(
1883
- key=lambda x: (x["index"], hor_tb_and_ver_lr(x)),
1884
- )
1885
-
1886
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1887
- block["index"] = num_index + idx + 1
1888
- block["sub_index"] = num_sub_index + idx + 1
1889
-
1890
- # image,figure,chart,seal label
1891
- nearest_match_(
1892
- vision_blocks,
1893
- distance_type="nearest_iou_edge_distance",
1894
- is_add_index=False,
1895
- )
1896
- parsing_res_by_pre_cuts.sort(
1897
- key=lambda x: (
1898
- x["sub_index"],
1899
- x["block_bbox"][1],
1900
- x["block_bbox"][0],
1901
- ),
1902
- )
1903
-
1904
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1905
- block["sub_index"] = num_sub_index + idx + 1
1906
-
1907
- # image,figure,chart,seal title label
1908
- nearest_match_(
1909
- vision_title_blocks,
1910
- distance_type="nearest_iou_edge_distance",
1911
- is_add_index=False,
1912
- )
1913
- parsing_res_by_pre_cuts.sort(
1914
- key=lambda x: (
1915
- x["sub_index"],
1916
- x["block_bbox"][1],
1917
- x["block_bbox"][0],
1918
- ),
1919
- )
1920
-
1921
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1922
- block["sub_index"] = num_sub_index + idx + 1
1923
-
1924
- # vision footnote label
1925
- nearest_match_(
1926
- vision_footnote_blocks,
1927
- distance_type="vision_footnote",
1928
- is_add_index=False,
1929
- )
1930
- text_label_priority = {"vision_footnote": 9999}
1931
- parsing_res_by_pre_cuts.sort(
1932
- key=lambda x: (
1933
- x["sub_index"],
1934
- text_label_priority.get(x["sub_label"], 0),
1935
- x["block_bbox"][1],
1936
- x["block_bbox"][0],
1937
- ),
1938
- )
1939
-
1940
- for idx, block in enumerate(parsing_res_by_pre_cuts):
1941
- block["sub_index"] = num_sub_index + idx + 1
1942
-
1943
- # header、footnote、header_image... label
1944
- nearest_match_(other_blocks, distance_type="manhattan", is_add_index=False)
1945
-
1946
- # add all parsing result
1947
- final_parsing_res_list.extend(parsing_res_by_pre_cuts)
1948
-
1949
- # update num index
1950
- num_sub_index += len(parsing_res_by_pre_cuts)
1951
- for parsing_res in parsing_res_by_pre_cuts:
1952
- if parsing_res.get("index"):
1953
- num_index += 1
1954
-
1955
- parsing_res_list = [
1956
- {
1957
- "block_label": parsing_res["block_label"],
1958
- "block_content": parsing_res["block_content"],
1959
- "block_bbox": parsing_res["block_bbox"],
1960
- "block_image": parsing_res.get("block_image", None),
1961
- "sub_label": parsing_res["sub_label"],
1962
- "sub_index": parsing_res["sub_index"],
1963
- "index": parsing_res.get("index", None),
1964
- "seg_start_coordinate": parsing_res.get(
1965
- "seg_start_coordinate", float("inf")
1966
- ),
1967
- "seg_end_coordinate": parsing_res.get("seg_end_coordinate", float("-inf")),
1968
- "num_of_lines": parsing_res.get("num_of_lines", 1),
1969
- }
1970
- for parsing_res in final_parsing_res_list
1971
- ]
1972
-
1973
- return parsing_res_list
1974
-
1975
-
1976
- def _manhattan_distance(
1977
- point1: Tuple[float, float],
1978
- point2: Tuple[float, float],
1979
- weight_x: float = 1.0,
1980
- weight_y: float = 1.0,
1981
- ) -> float:
1982
- """
1983
- Calculate the weighted Manhattan distance between two points.
1984
-
1985
- Args:
1986
- point1 (Tuple[float, float]): The first point as (x, y).
1987
- point2 (Tuple[float, float]): The second point as (x, y).
1988
- weight_x (float): The weight for the x-axis distance. Default is 1.0.
1989
- weight_y (float): The weight for the y-axis distance. Default is 1.0.
1990
-
1991
- Returns:
1992
- float: The weighted Manhattan distance between the two points.
1993
- """
1994
- return weight_x * abs(point1[0] - point2[0]) + weight_y * abs(point1[1] - point2[1])
1995
-
1996
-
1997
- def _calculate_horizontal_distance(
1998
- input_bbox: List[int],
1999
- match_bbox: List[int],
2000
- height: int,
2001
- disperse: int,
2002
- title_text: List[Tuple[int, List[int]]],
2003
- ) -> float:
2004
- """
2005
- Calculate the horizontal distance between two bounding boxes, considering title text adjustments.
2006
-
2007
- Args:
2008
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2009
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2010
- height (int): The height of the input bounding box used for normalization.
2011
- disperse (int): The dispersion factor used to normalize the horizontal distance.
2012
- title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
2013
- Format: [(position_indicator, [x1, y1, x2, y2]), ...].
2014
-
2015
- Returns:
2016
- float: The calculated horizontal distance taking into account the title text adjustments.
2017
- """
2018
- x1, y1, x2, y2 = input_bbox
2019
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2020
-
2021
- # Determine vertical distance adjustment based on title text
2022
- if y2 < y1_prime:
2023
- if title_text and title_text[-1][0] == 2:
2024
- y2 += title_text[-1][1][3] - title_text[-1][1][1]
2025
- vertical_adjustment = (y1_prime - y2) * 0.5
2026
- else:
2027
- if title_text and title_text[0][0] == 1:
2028
- y1 -= title_text[0][1][3] - title_text[0][1][1]
2029
- vertical_adjustment = y1 - y2_prime
2030
-
2031
- # Calculate horizontal distance with adjustments
2032
- horizontal_distance = (
2033
- abs(x2_prime - x1) // disperse
2034
- + vertical_adjustment // height
2035
- + vertical_adjustment / 5000
2036
- )
2037
-
2038
- return horizontal_distance
2039
-
850
+ edge_distance_list_tmp = [
851
+ x for x in edge_distance_list_tmp if x != min_distance
852
+ ]
853
+ min_distance = min(edge_distance_list_tmp)
854
+ src_index = index_conversion_map[edge_distance_list.index(min_distance)]
855
+ return supplement_region_bbox, iner_block_idxes
2040
856
 
2041
- def _calculate_vertical_distance(
2042
- input_bbox: List[int],
2043
- match_bbox: List[int],
2044
- width: int,
2045
- disperse: int,
2046
- title_text: List[Tuple[int, List[int]]],
2047
- ) -> float:
2048
- """
2049
- Calculate the vertical distance between two bounding boxes, considering title text adjustments.
2050
857
 
2051
- Args:
2052
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2053
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2054
- width (int): The width of the input bounding box used for normalization.
2055
- disperse (int): The dispersion factor used to normalize the vertical distance.
2056
- title_text (List[Tuple[int, List[int]]]): A list of tuples containing title text information and their bounding box coordinates.
2057
- Format: [(position_indicator, [x1, y1, x2, y2]), ...].
858
+ def update_region_box(bbox, region_box):
859
+ if region_box is None:
860
+ return bbox
2058
861
 
2059
- Returns:
2060
- float: The calculated vertical distance taking into account the title text adjustments.
2061
- """
2062
- x1, y1, x2, y2 = input_bbox
2063
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2064
-
2065
- # Determine horizontal distance adjustment based on title text
2066
- if x1 > x2_prime:
2067
- if title_text and title_text[0][0] == 3:
2068
- x1 -= title_text[0][1][2] - title_text[0][1][0]
2069
- horizontal_adjustment = (x1 - x2_prime) * 0.5
2070
- else:
2071
- if title_text and title_text[-1][0] == 4:
2072
- x2 += title_text[-1][1][2] - title_text[-1][1][0]
2073
- horizontal_adjustment = x1_prime - x2
2074
-
2075
- # Calculate vertical distance with adjustments
2076
- vertical_distance = (
2077
- abs(y2_prime - y1) // disperse
2078
- + horizontal_adjustment // width
2079
- + horizontal_adjustment / 5000
2080
- )
862
+ x1, y1, x2, y2 = bbox
863
+ x1_region, y1_region, x2_region, y2_region = region_box
2081
864
 
2082
- return vertical_distance
865
+ x1_region = int(min(x1, x1_region))
866
+ y1_region = int(min(y1, y1_region))
867
+ x2_region = int(max(x2, x2_region))
868
+ y2_region = int(max(y2, y2_region))
2083
869
 
870
+ region_box = [x1_region, y1_region, x2_region, y2_region]
2084
871
 
2085
- def _nearest_edge_distance(
2086
- input_bbox: List[int],
2087
- match_bbox: List[int],
2088
- weight: List[float] = [1.0, 1.0, 1.0, 1.0],
2089
- label: str = "text",
2090
- no_mask_labels: List[str] = [],
2091
- min_edge_distance_config: List[float] = [],
2092
- tolerance_len: float = 10.0,
2093
- ) -> Tuple[float, List[float]]:
2094
- """
2095
- Calculate the nearest edge distance between two bounding boxes, considering directional weights.
872
+ return region_box
2096
873
 
2097
- Args:
2098
- input_bbox (list): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2099
- match_bbox (list): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2100
- weight (list, optional): Directional weights for the edge distances [left, right, up, down]. Defaults to [1, 1, 1, 1].
2101
- label (str, optional): The label/type of the object in the bounding box (e.g., 'text'). Defaults to 'text'.
2102
- no_mask_labels (list, optional): Labels for which no masking is applied when calculating edge distances. Defaults to an empty list.
2103
- min_edge_distance_config (list, optional): Configuration for minimum edge distances [min_edge_distance_x, min_edge_distance_y].
2104
- Defaults to [float('inf'), float('inf')].
2105
- tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.
2106
874
 
2107
- Returns:
2108
- Tuple[float, List[float]]: A tuple containing:
2109
- - The calculated minimum edge distance between the bounding boxes.
2110
- - A list with the minimum edge distances in the x and y directions.
2111
- """
2112
- match_bbox_iou = _calculate_overlap_area_div_minbox_area_ratio(
2113
- input_bbox,
2114
- match_bbox,
2115
- )
2116
- if match_bbox_iou > 0 and label not in no_mask_labels:
2117
- return 0, [0, 0]
2118
-
2119
- if not min_edge_distance_config:
2120
- min_edge_distance_config = [float("inf"), float("inf")]
2121
- min_edge_distance_x, min_edge_distance_y = min_edge_distance_config
2122
-
2123
- x1, y1, x2, y2 = input_bbox
2124
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2125
-
2126
- direction_num = 0
2127
- distance_x = float("inf")
2128
- distance_y = float("inf")
2129
- distance = [float("inf")] * 4
2130
-
2131
- # input_bbox is to the left of match_bbox
2132
- if x2 < x1_prime:
2133
- direction_num += 1
2134
- distance[0] = x1_prime - x2
2135
- if abs(distance[0] - min_edge_distance_x) <= tolerance_len:
2136
- distance_x = min_edge_distance_x * weight[0]
2137
- else:
2138
- distance_x = distance[0] * weight[0]
2139
- # input_bbox is to the right of match_bbox
2140
- elif x1 > x2_prime:
2141
- direction_num += 1
2142
- distance[1] = x1 - x2_prime
2143
- if abs(distance[1] - min_edge_distance_x) <= tolerance_len:
2144
- distance_x = min_edge_distance_x * weight[1]
2145
- else:
2146
- distance_x = distance[1] * weight[1]
2147
- elif match_bbox_iou > 0:
2148
- distance[0] = 0
2149
- distance_x = 0
2150
-
2151
- # input_bbox is above match_bbox
2152
- if y2 < y1_prime:
2153
- direction_num += 1
2154
- distance[2] = y1_prime - y2
2155
- if abs(distance[2] - min_edge_distance_y) <= tolerance_len:
2156
- distance_y = min_edge_distance_y * weight[2]
2157
- else:
2158
- distance_y = distance[2] * weight[2]
2159
- if label in no_mask_labels:
2160
- distance_y = max(0.1, distance_y) * 10 # for abstract
2161
- # input_bbox is below match_bbox
2162
- elif y1 > y2_prime:
2163
- direction_num += 1
2164
- distance[3] = y1 - y2_prime
2165
- if abs(distance[3] - min_edge_distance_y) <= tolerance_len:
2166
- distance_y = min_edge_distance_y * weight[3]
2167
- else:
2168
- distance_y = distance[3] * weight[3]
2169
- elif match_bbox_iou > 0:
2170
- distance[2] = 0
2171
- distance_y = 0
2172
-
2173
- if direction_num == 2:
2174
- return (distance_x + distance_y), [
2175
- min(distance[0], distance[1]),
2176
- min(distance[2], distance[3]),
875
+ def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
876
+ for formula_res in formula_res_list:
877
+ x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
878
+ poly_points = [
879
+ (x_min, y_min),
880
+ (x_max, y_min),
881
+ (x_max, y_max),
882
+ (x_min, y_max),
2177
883
  ]
2178
- else:
2179
- return min(distance_x, distance_y), [
2180
- min(distance[0], distance[1]),
2181
- min(distance[2], distance[3]),
2182
- ]
2183
-
2184
-
2185
- def _get_weights(label, horizontal):
2186
- """Define weights based on the label and orientation."""
2187
- if label == "doc_title":
2188
- return (
2189
- [1, 0.1, 0.1, 1] if horizontal else [0.2, 0.1, 1, 1]
2190
- ) # left-down , right-left
2191
- elif label in [
2192
- "paragraph_title",
2193
- "table_title",
2194
- "abstract",
2195
- "image",
2196
- "seal",
2197
- "chart",
2198
- "figure",
2199
- ]:
2200
- return [1, 1, 0.1, 1] # down
2201
- else:
2202
- return [1, 1, 1, 0.1] # up
2203
-
2204
-
2205
- def _nearest_iou_edge_distance(
2206
- input_bbox: List[int],
2207
- match_bbox: List[int],
2208
- label: str,
2209
- vision_labels: List[str],
2210
- no_mask_labels: List[str],
2211
- median_width: int = -1,
2212
- title_labels: List[str] = [],
2213
- title_text: List[Tuple[int, List[int]]] = [],
2214
- sub_title: List[List[int]] = [],
2215
- min_distance_config: List[float] = [],
2216
- tolerance_len: float = 10.0,
2217
- ) -> Tuple[float, List[float]]:
2218
- """
2219
- Calculate the nearest IOU edge distance between two bounding boxes, considering label types, title adjustments, and minimum distance configurations.
2220
- This function computes the edge distance between two bounding boxes while considering their overlap (IOU) and various adjustments based on label types,
2221
- title text, and subtitle information. It also applies minimum distance configurations and tolerance adjustments.
2222
-
2223
- Args:
2224
- input_bbox (List[int]): The bounding box coordinates [x1, y1, x2, y2] of the input object.
2225
- match_bbox (List[int]): The bounding box coordinates [x1', y1', x2', y2'] of the object to match against.
2226
- label (str): The label/type of the object in the bounding box (e.g., 'image', 'text', etc.).
2227
- vision_labels (List[str]): List of labels for vision-related objects (e.g., images, icons).
2228
- no_mask_labels (List[str]): Labels for which no masking is applied when calculating edge distances.
2229
- median_width (int, optional): The median width for title dispersion calculation. Defaults to -1.
2230
- title_labels (List[str], optional): Labels that indicate the object is a title. Defaults to an empty list.
2231
- title_text (List[Tuple[int, List[int]]], optional): Text content associated with title labels, in the format [(position_indicator, [x1, y1, x2, y2]), ...].
2232
- sub_title (List[List[int]], optional): List of subtitle bounding boxes to adjust the input_bbox. Defaults to an empty list.
2233
- min_distance_config (List[float], optional): Configuration for minimum distances [min_edge_distance_config, up_edge_distances_config, total_distance].
2234
- tolerance_len (float, optional): The tolerance length for adjusting edge distances. Defaults to 10.0.
2235
-
2236
- Returns:
2237
- Tuple[float, List[float]]: A tuple containing:
2238
- - The calculated distance considering IOU and adjustments.
2239
- - The updated minimum distance configuration.
2240
- """
2241
-
2242
- x1, y1, x2, y2 = input_bbox
2243
- x1_prime, y1_prime, x2_prime, y2_prime = match_bbox
2244
-
2245
- min_edge_distance_config, up_edge_distances_config, total_distance = (
2246
- min_distance_config
2247
- )
2248
-
2249
- iou_distance = 0
2250
-
2251
- if label in vision_labels:
2252
- horizontal1 = horizontal2 = True
2253
- else:
2254
- horizontal1 = _get_bbox_direction(input_bbox)
2255
- horizontal2 = _get_bbox_direction(match_bbox, 3)
2256
-
2257
- if (
2258
- horizontal1 != horizontal2
2259
- or _get_projection_iou(input_bbox, match_bbox, horizontal1) < 0.01
2260
- ):
2261
- iou_distance = 1
2262
-
2263
- if label == "doc_title":
2264
- # Calculate distance for titles
2265
- disperse = max(1, median_width)
2266
- tolerance_len = max(tolerance_len, disperse)
2267
-
2268
- # Adjust input_bbox based on sub_title
2269
- if sub_title:
2270
- for sub in sub_title:
2271
- x1_, y1_, x2_, y2_ = sub
2272
- x1, y1, x2, y2 = (
2273
- min(x1, x1_),
2274
- min(y1, y1_),
2275
- min(x2, x2_),
2276
- max(y2, y2_),
884
+ ocr_res["dt_polys"].append(poly_points)
885
+ formula_res_text: str = formula_res["rec_formula"]
886
+ ocr_res["rec_texts"].append(formula_res_text)
887
+ if ocr_res["rec_boxes"].size == 0:
888
+ ocr_res["rec_boxes"] = np.array(formula_res["dt_polys"])
889
+ else:
890
+ ocr_res["rec_boxes"] = np.vstack(
891
+ (ocr_res["rec_boxes"], [formula_res["dt_polys"]])
2277
892
  )
2278
- input_bbox = [x1, y1, x2, y2]
2279
-
2280
- if title_text:
2281
- for sub in title_text:
2282
- x1_, y1_, x2_, y2_ = sub[1]
2283
- if horizontal1:
2284
- x1, y1, x2, y2 = (
2285
- min(x1, x1_),
2286
- min(y1, y1_),
2287
- min(x2, x2_),
2288
- max(y2, y2_),
2289
- )
2290
- else:
2291
- x1, y1, x2, y2 = (
2292
- min(x1, x1_),
2293
- min(y1, y1_),
2294
- max(x2, x2_),
2295
- min(y2, y2_),
2296
- )
2297
- input_bbox = [x1, y1, x2, y2]
2298
-
2299
- # Calculate edge distance
2300
- weight = _get_weights(label, horizontal1)
2301
- if label == "abstract":
2302
- tolerance_len *= 2
2303
-
2304
- edge_distance, edge_distance_config = _nearest_edge_distance(
2305
- input_bbox,
2306
- match_bbox,
2307
- weight,
2308
- label=label,
2309
- no_mask_labels=no_mask_labels,
2310
- min_edge_distance_config=min_edge_distance_config,
2311
- tolerance_len=tolerance_len,
2312
- )
2313
-
2314
- # Weights for combining distances
2315
- iou_edge_weight = [10**8, 10**4, 1, 0.0001]
2316
-
2317
- # Calculate up and left edge distances
2318
- up_edge_distance = y1_prime
2319
- left_edge_distance = x1_prime
2320
- if (
2321
- label in no_mask_labels or label in title_labels or label in vision_labels
2322
- ) and y1 > y2_prime:
2323
- up_edge_distance = -y2_prime
2324
- left_edge_distance = -x2_prime
2325
-
2326
- min_up_edge_distance = up_edge_distances_config
2327
- if abs(min_up_edge_distance - up_edge_distance) <= tolerance_len:
2328
- up_edge_distance = min_up_edge_distance
2329
-
2330
- # Calculate total distance
2331
- distance = (
2332
- iou_distance * iou_edge_weight[0]
2333
- + edge_distance * iou_edge_weight[1]
2334
- + up_edge_distance * iou_edge_weight[2]
2335
- + left_edge_distance * iou_edge_weight[3]
2336
- )
2337
-
2338
- # Update minimum distance configuration if a smaller distance is found
2339
- if total_distance > distance:
2340
- edge_distance_config = [
2341
- edge_distance_config[0],
2342
- edge_distance_config[1],
2343
- ]
2344
- min_distance_config = [
2345
- edge_distance_config,
2346
- up_edge_distance,
2347
- distance,
2348
- ]
2349
-
2350
- return distance, min_distance_config
2351
-
2352
-
2353
- def get_show_color(label: str) -> Tuple:
2354
- label_colors = {
2355
- # Medium Blue (from 'titles_list')
2356
- "paragraph_title": (102, 102, 255, 100),
2357
- "doc_title": (255, 248, 220, 100), # Cornsilk
2358
- # Light Yellow (from 'tables_caption_list')
2359
- "table_title": (255, 255, 102, 100),
2360
- # Sky Blue (from 'imgs_caption_list')
2361
- "figure_title": (102, 178, 255, 100),
2362
- "chart_title": (221, 160, 221, 100), # Plum
2363
- "vision_footnote": (144, 238, 144, 100), # Light Green
2364
- # Deep Purple (from 'texts_list')
2365
- "text": (153, 0, 76, 100),
2366
- # Bright Green (from 'interequations_list')
2367
- "formula": (0, 255, 0, 100),
2368
- "abstract": (255, 239, 213, 100), # Papaya Whip
2369
- # Medium Green (from 'lists_list' and 'indexs_list')
2370
- "content": (40, 169, 92, 100),
2371
- # Neutral Gray (from 'dropped_bbox_list')
2372
- "seal": (158, 158, 158, 100),
2373
- # Olive Yellow (from 'tables_body_list')
2374
- "table": (204, 204, 0, 100),
2375
- # Bright Green (from 'imgs_body_list')
2376
- "image": (153, 255, 51, 100),
2377
- # Bright Green (from 'imgs_body_list')
2378
- "figure": (153, 255, 51, 100),
2379
- "chart": (216, 191, 216, 100), # Thistle
2380
- # Pale Yellow-Green (from 'tables_footnote_list')
2381
- "reference": (229, 255, 204, 100),
2382
- "algorithm": (255, 250, 240, 100), # Floral White
2383
- }
893
+ ocr_res["rec_labels"].append("formula")
894
+ ocr_res["rec_polys"].append(poly_points)
895
+ ocr_res["rec_scores"].append(1)
896
+
897
+
898
def caculate_bbox_area(bbox):
    """Return the absolute area of an axis-aligned bounding box.

    Args:
        bbox: Sequence of four numbers ``(x1, y1, x2, y2)``; any order of
            corners is accepted since the result is taken as an absolute value.

    Returns:
        float: ``abs((x2 - x1) * (y2 - y1))``.
    """
    # NOTE(review): the name keeps the upstream spelling ("caculate") because
    # callers elsewhere in the package depend on it.
    left, top, right, bottom = (float(coord) for coord in bbox)
    return abs((right - left) * (bottom - top))
902
+
903
+
904
def get_show_color(label: str, order_label=False) -> Tuple:
    """Map a layout label to an RGBA display color.

    Args:
        label: Region/label name to look up.
        order_label: When True, use the reading-order palette; otherwise use
            the region-type palette. Defaults to False.

    Returns:
        Tuple: An ``(R, G, B, A)`` tuple; a neutral gray is returned for any
        label not present in the selected palette.
    """
    if order_label:
        palette = {
            "doc_title": (255, 248, 220, 100),  # Cornsilk
            "doc_title_text": (255, 239, 213, 100),  # Papaya Whip
            "paragraph_title": (102, 102, 255, 100),  # Medium Blue
            "sub_paragraph_title": (102, 178, 255, 100),  # Sky Blue
            "vision": (153, 255, 51, 100),  # Bright Green
            "vision_title": (144, 238, 144, 100),  # Light Green
            "vision_footnote": (144, 238, 144, 100),  # Light Green
            "normal_text": (153, 0, 76, 100),  # Deep Purple
            "cross_layout": (53, 218, 207, 100),  # Teal
            "cross_reference": (221, 160, 221, 100),  # Plum
        }
    else:
        palette = {
            "paragraph_title": (102, 102, 255, 100),  # Medium Blue
            "doc_title": (255, 248, 220, 100),  # Cornsilk
            "table_title": (255, 255, 102, 100),  # Light Yellow
            "figure_title": (102, 178, 255, 100),  # Sky Blue
            "chart_title": (221, 160, 221, 100),  # Plum
            "vision_footnote": (144, 238, 144, 100),  # Light Green
            "text": (153, 0, 76, 100),  # Deep Purple
            "formula": (0, 255, 0, 100),  # Bright Green
            "abstract": (255, 239, 213, 100),  # Papaya Whip
            "content": (40, 169, 92, 100),  # Medium Green
            "seal": (158, 158, 158, 100),  # Neutral Gray
            "table": (204, 204, 0, 100),  # Olive Yellow
            "image": (153, 255, 51, 100),  # Bright Green
            "figure": (153, 255, 51, 100),  # Bright Green
            "chart": (216, 191, 216, 100),  # Thistle
            "reference": (229, 255, 204, 100),  # Pale Yellow-Green
            "algorithm": (255, 250, 240, 100),  # Floral White
        }
    # Unknown labels fall back to a neutral gray.
    return palette.get(label, (158, 158, 158, 100))