paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +29 -73
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/ts/funcs.py +19 -8
  44. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  45. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  46. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  48. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  49. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  51. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  52. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  53. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  54. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  57. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  58. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  59. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  60. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  61. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  63. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  64. paddlex/inference/models/formula_recognition/predictor.py +8 -2
  65. paddlex/inference/models/formula_recognition/processors.py +90 -77
  66. paddlex/inference/models/formula_recognition/result.py +28 -27
  67. paddlex/inference/models/image_feature/processors.py +3 -4
  68. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  69. paddlex/inference/models/object_detection/predictor.py +2 -0
  70. paddlex/inference/models/object_detection/processors.py +28 -3
  71. paddlex/inference/models/object_detection/utils.py +2 -0
  72. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  73. paddlex/inference/models/text_detection/predictor.py +8 -0
  74. paddlex/inference/models/text_detection/processors.py +44 -10
  75. paddlex/inference/models/text_detection/result.py +0 -10
  76. paddlex/inference/models/text_recognition/result.py +1 -1
  77. paddlex/inference/pipelines/__init__.py +9 -5
  78. paddlex/inference/pipelines/_parallel.py +172 -0
  79. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  80. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  81. paddlex/inference/pipelines/base.py +14 -4
  82. paddlex/inference/pipelines/components/faisser.py +1 -1
  83. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  84. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  85. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  86. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  87. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  88. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  89. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  90. paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
  91. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  92. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
  93. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  94. paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
  95. paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
  96. paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
  97. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  98. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
  99. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
  100. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  101. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  102. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  103. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  104. paddlex/inference/pipelines/ocr/result.py +21 -18
  105. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  106. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  107. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  108. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  109. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
  110. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  112. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  113. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  114. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  115. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  116. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  117. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  118. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  119. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  120. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  121. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  122. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  123. paddlex/inference/serving/basic_serving/_app.py +46 -13
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  127. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  128. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  129. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  130. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  131. paddlex/inference/serving/infra/utils.py +20 -22
  132. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  133. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  134. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  135. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  136. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  137. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  138. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  139. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  140. paddlex/inference/utils/hpi.py +30 -16
  141. paddlex/inference/utils/hpi_model_info_collection.json +666 -162
  142. paddlex/inference/utils/io/readers.py +12 -12
  143. paddlex/inference/utils/misc.py +20 -0
  144. paddlex/inference/utils/mkldnn_blocklist.py +59 -0
  145. paddlex/inference/utils/official_models.py +140 -5
  146. paddlex/inference/utils/pp_option.py +74 -9
  147. paddlex/model.py +2 -2
  148. paddlex/modules/__init__.py +1 -1
  149. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  150. paddlex/modules/base/__init__.py +1 -1
  151. paddlex/modules/base/evaluator.py +5 -5
  152. paddlex/modules/base/trainer.py +1 -1
  153. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  154. paddlex/modules/doc_vlm/evaluator.py +2 -2
  155. paddlex/modules/doc_vlm/exportor.py +2 -2
  156. paddlex/modules/doc_vlm/model_list.py +1 -1
  157. paddlex/modules/doc_vlm/trainer.py +2 -2
  158. paddlex/modules/face_recognition/evaluator.py +2 -2
  159. paddlex/modules/formula_recognition/evaluator.py +5 -2
  160. paddlex/modules/formula_recognition/model_list.py +3 -0
  161. paddlex/modules/formula_recognition/trainer.py +3 -0
  162. paddlex/modules/general_recognition/evaluator.py +1 -1
  163. paddlex/modules/image_classification/evaluator.py +2 -2
  164. paddlex/modules/image_classification/model_list.py +1 -0
  165. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  166. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  167. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  168. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  169. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  170. paddlex/modules/object_detection/evaluator.py +2 -2
  171. paddlex/modules/object_detection/model_list.py +2 -0
  172. paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
  173. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  174. paddlex/modules/table_recognition/evaluator.py +2 -2
  175. paddlex/modules/text_detection/evaluator.py +2 -2
  176. paddlex/modules/text_detection/model_list.py +2 -0
  177. paddlex/modules/text_recognition/evaluator.py +2 -2
  178. paddlex/modules/text_recognition/model_list.py +2 -0
  179. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  180. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  181. paddlex/modules/ts_classification/evaluator.py +2 -2
  182. paddlex/modules/ts_forecast/evaluator.py +2 -2
  183. paddlex/modules/video_classification/evaluator.py +2 -2
  184. paddlex/modules/video_detection/evaluator.py +2 -2
  185. paddlex/ops/__init__.py +8 -5
  186. paddlex/paddlex_cli.py +19 -13
  187. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  188. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  189. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  190. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  191. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  192. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  193. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  194. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  195. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  196. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  197. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  198. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  200. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  201. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  202. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  203. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  204. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  205. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  206. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  207. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  208. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  209. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  210. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  211. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  212. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  213. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  214. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  215. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  216. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  217. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  218. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  219. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  220. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  221. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  222. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  223. paddlex/repo_apis/base/config.py +1 -1
  224. paddlex/repo_manager/core.py +3 -3
  225. paddlex/repo_manager/meta.py +6 -2
  226. paddlex/repo_manager/repo.py +17 -16
  227. paddlex/utils/custom_device_list.py +26 -2
  228. paddlex/utils/deps.py +3 -3
  229. paddlex/utils/device.py +5 -13
  230. paddlex/utils/env.py +4 -0
  231. paddlex/utils/flags.py +11 -4
  232. paddlex/utils/fonts/__init__.py +34 -4
  233. paddlex/utils/misc.py +1 -1
  234. paddlex/utils/subclass_register.py +2 -2
  235. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
  236. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
  237. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
  238. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
  239. {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
  240. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
@@ -15,9 +15,10 @@ from __future__ import annotations
 
 import copy
 import re
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+from PIL import Image
 
 from ....utils import logging
 from ....utils.deps import pipeline_requires_extra
@@ -26,18 +27,30 @@ from ...common.reader import ReadImage
 from ...models.object_detection.result import DetResult
 from ...utils.hpi import HPIConfig
 from ...utils.pp_option import PaddlePredictorOption
+from .._parallel import AutoParallelImageSimpleInferencePipeline
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
+from .layout_objects import LayoutBlock, LayoutRegion
 from .result_v2 import LayoutParsingResultV2
-from .utils import gather_imgs, get_single_block_parsing_res, get_sub_regions_ocr_res
-
-
-@pipeline_requires_extra("ocr")
-class LayoutParsingPipelineV2(BasePipeline):
+from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, REGION_SETTINGS
+from .utils import (
+    caculate_bbox_area,
+    calculate_minimum_enclosing_bbox,
+    calculate_overlap_ratio,
+    convert_formula_res_to_ocr_format,
+    gather_imgs,
+    get_bbox_intersection,
+    get_sub_regions_ocr_res,
+    remove_overlap_blocks,
+    shrink_supplement_region_bbox,
+    update_region_box,
+)
+from .xycut_enhanced import xycut_enhanced
+
+
+class _LayoutParsingPipelineV2(BasePipeline):
     """Layout Parsing Pipeline V2"""
 
-    entities = ["PP-StructureV3"]
-
     def __init__(
         self,
         config: dict,
@@ -53,9 +66,9 @@ class LayoutParsingPipelineV2(BasePipeline):
             device (str, optional): Device to run the predictions on. Defaults to None.
             pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
             use_hpip (bool, optional): Whether to use the high-performance
-                inference plugin (HPIP). Defaults to False.
+                inference plugin (HPIP) by default. Defaults to False.
             hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
-                The high-performance inference configuration dictionary.
+                The default high-performance inference configuration dictionary.
                 Defaults to None.
         """
 
@@ -68,8 +81,7 @@ class LayoutParsingPipelineV2(BasePipeline):
 
         self.inintial_predictor(config)
 
-        self.batch_sampler = ImageBatchSampler(batch_size=1)
-
+        self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
         self.img_reader = ReadImage(format="BGR")
 
     def inintial_predictor(self, config: dict) -> None:
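A quick illustration of the batch_size change above (a sketch, not part of the diff): the image batch sampler now reads config.get("batch_size", 1), so a top-level batch_size entry in the pipeline config takes effect without code changes. The local YAML file name below is hypothetical.

    # Sketch: export the pipeline config, add a top-level `batch_size: 4` to the
    # YAML, then load it. The export command is the usual PaddleX CLI route:
    #   paddlex --get_pipeline_config PP-StructureV3 --save_path ./
    from paddlex import create_pipeline

    pipeline = create_pipeline(pipeline="./PP-StructureV3.yaml")  # hypothetical local config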
@@ -83,13 +95,20 @@
         """
 
         self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
-        self.use_general_ocr = config.get("use_general_ocr", True)
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
+        self.use_region_detection = config.get(
+            "use_region_detection",
+            True,
+        )
         self.use_formula_recognition = config.get(
             "use_formula_recognition",
             True,
         )
+        self.use_chart_recognition = config.get(
+            "use_chart_recognition",
+            False,
+        )
 
         if self.use_doc_preprocessor:
             doc_preprocessor_config = config.get("SubPipelines", {}).get(
@@ -101,6 +120,16 @@
             self.doc_preprocessor_pipeline = self.create_pipeline(
                 doc_preprocessor_config,
             )
+        if self.use_region_detection:
+            region_detection_config = config.get("SubModules", {}).get(
+                "RegionDetection",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.region_detection_model = self.create_model(
+                region_detection_config,
+            )
 
         layout_det_config = config.get("SubModules", {}).get(
             "LayoutDetection",
@@ -123,14 +152,13 @@
             layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
         self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
 
-        if self.use_general_ocr or self.use_table_recognition:
-            general_ocr_config = config.get("SubPipelines", {}).get(
-                "GeneralOCR",
-                {"pipeline_config_error": "config error for general_ocr_pipeline!"},
-            )
-            self.general_ocr_pipeline = self.create_pipeline(
-                general_ocr_config,
-            )
+        general_ocr_config = config.get("SubPipelines", {}).get(
+            "GeneralOCR",
+            {"pipeline_config_error": "config error for general_ocr_pipeline!"},
+        )
+        self.general_ocr_pipeline = self.create_pipeline(
+            general_ocr_config,
+        )
 
         if self.use_seal_recognition:
             seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -165,6 +193,17 @@
                 formula_recognition_config,
             )
 
+        if self.use_chart_recognition:
+            chart_recognition_config = config.get("SubModules", {}).get(
+                "ChartRecognition",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.chart_recognition_model = self.create_model(
+                chart_recognition_config,
+            )
+
         return
 
     def get_text_paragraphs_ocr_res(
  def get_text_paragraphs_ocr_res(
@@ -209,12 +248,6 @@ class LayoutParsingPipelineV2(BasePipeline):
209
248
  )
210
249
  return False
211
250
 
212
- if input_params["use_general_ocr"] and not self.use_general_ocr:
213
- logging.error(
214
- "Set use_general_ocr, but the models for general OCR are not initialized.",
215
- )
216
- return False
217
-
218
251
  if input_params["use_seal_recognition"] and not self.use_seal_recognition:
219
252
  logging.error(
220
253
  "Set use_seal_recognition, but the models for seal recognition are not initialized.",
@@ -229,159 +262,584 @@
 
         return True
 
-    def get_layout_parsing_res(
+    def standardized_data(
         self,
         image: list,
+        region_det_res: DetResult,
         layout_det_res: DetResult,
         overall_ocr_res: OCRResult,
-        table_res_list: list,
-        seal_res_list: list,
         formula_res_list: list,
-        imgs_in_doc: list,
-        text_det_limit_side_len: Optional[int] = None,
-        text_det_limit_type: Optional[str] = None,
-        text_det_thresh: Optional[float] = None,
-        text_det_box_thresh: Optional[float] = None,
-        text_det_unclip_ratio: Optional[float] = None,
-        text_rec_score_thresh: Optional[float] = None,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
     ) -> list:
         """
         Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
         Args:
             image (list): The input image.
-            layout_det_res (DetResult): The detection result containing the layout information of the document.
-            overall_ocr_res (OCRResult): The overall OCR result containing text information.
-            table_res_list (list): A list of table recognition results.
-            seal_res_list (list): A list of seal recognition results.
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
             formula_res_list (list): A list of formula recognition results.
-            text_det_limit_side_len (Optional[int], optional): The maximum side length of the text detection region. Defaults to None.
-            text_det_limit_type (Optional[str], optional): The type of limit for the text detection region. Defaults to None.
-            text_det_thresh (Optional[float], optional): The confidence threshold for text detection. Defaults to None.
-            text_det_box_thresh (Optional[float], optional): The confidence threshold for text detection bounding boxes. Defaults to None
-            text_det_unclip_ratio (Optional[float], optional): The unclip ratio for text detection. Defaults to None.
+            text_rec_model (Any): The text recognition model.
             text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
         Returns:
             list: A list of dictionaries representing the layout parsing result.
         """
+
         matched_ocr_dict = {}
-        image = np.array(image)
+        region_to_block_map = {}
+        block_to_ocr_map = {}
         object_boxes = []
         footnote_list = []
-        max_bottom_text_coordinate = 0
+        paragraph_title_list = []
+        bottom_text_y_max = 0
+        max_block_area = 0.0
+        doc_title_num = 0
+
+        base_region_bbox = [65535, 65535, 0, 0]
+        layout_det_res = remove_overlap_blocks(
+            layout_det_res,
+            threshold=0.5,
+            smaller=True,
+        )
 
-        for object_box_idx, box_info in enumerate(layout_det_res["boxes"]):
+        # convert formula_res_list to OCRResult format
+        convert_formula_res_to_ocr_format(formula_res_list, overall_ocr_res)
+
+        # match layout boxes and ocr boxes and get some information for layout_order_config
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
             box = box_info["coordinate"]
             label = box_info["label"].lower()
             object_boxes.append(box)
+            _, _, _, y2 = box
+
+            # update the region box and max_block_area according to the layout boxes
+            base_region_bbox = update_region_box(box, base_region_bbox)
+            max_block_area = max(max_block_area, caculate_bbox_area(box))
+
+            # update_layout_order_config_block_index(layout_order_config, label, box_idx)
 
             # set the label of footnote to text, when it is above the text boxes
             if label == "footnote":
-                footnote_list.append(object_box_idx)
-            if label == "text" and box[3] > max_bottom_text_coordinate:
-                max_bottom_text_coordinate = box[3]
+                footnote_list.append(box_idx)
+            elif label == "paragraph_title":
+                paragraph_title_list.append(box_idx)
+            if label == "text":
+                bottom_text_y_max = max(y2, bottom_text_y_max)
+            if label == "doc_title":
+                doc_title_num += 1
 
             if label not in ["formula", "table", "seal"]:
-                _, matched_idxs = get_sub_regions_ocr_res(
+                _, matched_idxes = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
                 )
-                for matched_idx in matched_idxs:
+                block_to_ocr_map[box_idx] = matched_idxes
+                for matched_idx in matched_idxes:
                     if matched_ocr_dict.get(matched_idx, None) is None:
-                        matched_ocr_dict[matched_idx] = [object_box_idx]
+                        matched_ocr_dict[matched_idx] = [box_idx]
                     else:
-                        matched_ocr_dict[matched_idx].append(object_box_idx)
+                        matched_ocr_dict[matched_idx].append(box_idx)
 
+        # fix the footnote label
         for footnote_idx in footnote_list:
             if (
                 layout_det_res["boxes"][footnote_idx]["coordinate"][3]
-                < max_bottom_text_coordinate
+                < bottom_text_y_max
             ):
                 layout_det_res["boxes"][footnote_idx]["label"] = "text"
 
-        already_processed = set()
-        for matched_idx, layout_box_ids in matched_ocr_dict.items():
-            if len(layout_box_ids) <= 1:
-                continue
-
-            # one ocr is matched to multiple layout boxes, split the text into multiple lines
-            for idx in layout_box_ids:
-                if idx in already_processed:
-                    continue
-
-                already_processed.add(idx)
-                wht_im = np.ones(image.shape, dtype=image.dtype) * 255
-                box = object_boxes[idx]
-                x1, y1, x2, y2 = [int(i) for i in box]
-                wht_im[y1:y2, x1:x2, :] = image[y1:y2, x1:x2, :]
-                sub_ocr_res = next(
-                    self.general_ocr_pipeline(
-                        wht_im,
-                        text_det_limit_side_len=text_det_limit_side_len,
-                        text_det_limit_type=text_det_limit_type,
-                        text_det_thresh=text_det_thresh,
-                        text_det_box_thresh=text_det_box_thresh,
-                        text_det_unclip_ratio=text_det_unclip_ratio,
-                        text_rec_score_thresh=text_rec_score_thresh,
+        # check if there is only one paragraph title and without doc_title
+        only_one_paragraph_title = len(paragraph_title_list) == 1 and doc_title_num == 0
+        if only_one_paragraph_title:
+            paragraph_title_block_area = caculate_bbox_area(
+                layout_det_res["boxes"][paragraph_title_list[0]]["coordinate"]
+            )
+            title_area_max_block_threshold = BLOCK_SETTINGS.get(
+                "title_conversion_area_ratio_threshold", 0.3
+            )
+            if (
+                paragraph_title_block_area
+                > max_block_area * title_area_max_block_threshold
+            ):
+                layout_det_res["boxes"][paragraph_title_list[0]]["label"] = "doc_title"
+
+        # Replace the OCR information of the hurdles.
+        for overall_ocr_idx, layout_box_ids in matched_ocr_dict.items():
+            if len(layout_box_ids) > 1:
+                matched_no = 0
+                overall_ocr_box = copy.deepcopy(
+                    overall_ocr_res["rec_boxes"][overall_ocr_idx]
+                )
+                overall_ocr_dt_poly = copy.deepcopy(
+                    overall_ocr_res["dt_polys"][overall_ocr_idx]
+                )
+                for box_idx in layout_box_ids:
+                    layout_box = layout_det_res["boxes"][box_idx]["coordinate"]
+                    crop_box = get_bbox_intersection(overall_ocr_box, layout_box)
+                    for ocr_idx in block_to_ocr_map[box_idx]:
+                        ocr_box = overall_ocr_res["rec_boxes"][ocr_idx]
+                        iou = calculate_overlap_ratio(ocr_box, crop_box, "small")
+                        if iou > 0.8:
+                            overall_ocr_res["rec_texts"][ocr_idx] = ""
+                    x1, y1, x2, y2 = [int(i) for i in crop_box]
+                    crop_img = np.array(image)[y1:y2, x1:x2]
+                    crop_img_rec_res = list(text_rec_model([crop_img]))[0]
+                    crop_img_dt_poly = get_bbox_intersection(
+                        overall_ocr_dt_poly, layout_box, return_format="poly"
                     )
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    text_rec_score_thresh = (
+                        text_rec_score_thresh
+                        if text_rec_score_thresh is not None
+                        else (self.general_ocr_pipeline.text_rec_score_thresh)
+                    )
+                    if crop_img_rec_score >= text_rec_score_thresh:
+                        matched_no += 1
+                        if matched_no == 1:
+                            # the first matched ocr be replaced by the first matched layout box
+                            overall_ocr_res["dt_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_boxes"][overall_ocr_idx] = crop_box
+                            overall_ocr_res["rec_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_scores"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_score
+                            overall_ocr_res["rec_texts"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_text
+                        else:
+                            # the other matched ocr be appended to the overall ocr result
+                            overall_ocr_res["dt_polys"].append(crop_img_dt_poly)
+                            if len(overall_ocr_res["rec_boxes"]) == 0:
+                                overall_ocr_res["rec_boxes"] = np.array([crop_box])
+                            else:
+                                overall_ocr_res["rec_boxes"] = np.vstack(
+                                    (overall_ocr_res["rec_boxes"], crop_box)
+                                )
+                            overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                            overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                            overall_ocr_res["rec_labels"].append("text")
+                            block_to_ocr_map[box_idx].remove(overall_ocr_idx)
+                            block_to_ocr_map[box_idx].append(
+                                len(overall_ocr_res["rec_texts"]) - 1
+                            )
+
+        # use layout bbox to do ocr recognition when there is no matched ocr
+        for layout_box_idx, overall_ocr_idxes in block_to_ocr_map.items():
+            has_text = False
+            for idx in overall_ocr_idxes:
+                if overall_ocr_res["rec_texts"][idx] != "":
+                    has_text = True
+                    break
+            if not has_text and layout_det_res["boxes"][layout_box_idx][
+                "label"
+            ] not in BLOCK_LABEL_MAP.get("vision_labels", []):
+                crop_box = layout_det_res["boxes"][layout_box_idx]["coordinate"]
+                x1, y1, x2, y2 = [int(i) for i in crop_box]
+                crop_img = np.array(image)[y1:y2, x1:x2]
+                crop_img_rec_res = next(text_rec_model([crop_img]))
+                crop_img_dt_poly = get_bbox_intersection(
+                    crop_box, crop_box, return_format="poly"
                 )
-                _, matched_idxs = get_sub_regions_ocr_res(
-                    overall_ocr_res, [box], return_match_idx=True
+                crop_img_rec_score = crop_img_rec_res["rec_score"]
+                crop_img_rec_text = crop_img_rec_res["rec_text"]
+                text_rec_score_thresh = (
+                    text_rec_score_thresh
+                    if text_rec_score_thresh is not None
+                    else (self.general_ocr_pipeline.text_rec_score_thresh)
                 )
-                for matched_idx in sorted(matched_idxs, reverse=True):
-                    del overall_ocr_res["dt_polys"][matched_idx]
-                    del overall_ocr_res["rec_texts"][matched_idx]
-                    overall_ocr_res["rec_boxes"] = np.delete(
-                        overall_ocr_res["rec_boxes"], matched_idx, axis=0
+                if crop_img_rec_score >= text_rec_score_thresh:
+                    if len(overall_ocr_res["rec_boxes"]) == 0:
+                        overall_ocr_res["rec_boxes"] = np.array([crop_box])
+                    else:
+                        overall_ocr_res["rec_boxes"] = np.vstack(
+                            (overall_ocr_res["rec_boxes"], crop_box)
+                        )
+                    overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                    overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                    overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                    overall_ocr_res["rec_labels"].append("text")
+                    block_to_ocr_map[layout_box_idx].append(
+                        len(overall_ocr_res["rec_texts"]) - 1
                     )
-                    del overall_ocr_res["rec_polys"][matched_idx]
-                    del overall_ocr_res["rec_scores"][matched_idx]
 
-                if sub_ocr_res["rec_boxes"].size > 0:
-                    sub_ocr_res["rec_labels"] = ["text"] * len(sub_ocr_res["rec_texts"])
+        # when there is no layout detection result but there is ocr result, convert ocr detection result to layout detection result
+        if len(layout_det_res["boxes"]) == 0 and len(overall_ocr_res["rec_boxes"]) > 0:
+            for idx, ocr_rec_box in enumerate(overall_ocr_res["rec_boxes"]):
+                base_region_bbox = update_region_box(ocr_rec_box, base_region_bbox)
+                layout_det_res["boxes"].append(
+                    {
+                        "label": "text",
+                        "coordinate": ocr_rec_box,
+                        "score": overall_ocr_res["rec_scores"][idx],
+                    }
+                )
+                block_to_ocr_map[idx] = [idx]
 
-                    overall_ocr_res["dt_polys"].extend(sub_ocr_res["dt_polys"])
-                    overall_ocr_res["rec_texts"].extend(sub_ocr_res["rec_texts"])
-                    overall_ocr_res["rec_boxes"] = np.concatenate(
-                        [overall_ocr_res["rec_boxes"], sub_ocr_res["rec_boxes"]], axis=0
+        mask_labels = (
+            BLOCK_LABEL_MAP.get("unordered_labels", [])
+            + BLOCK_LABEL_MAP.get("header_labels", [])
+            + BLOCK_LABEL_MAP.get("footer_labels", [])
+        )
+        block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
+        region_det_res["boxes"] = sorted(
+            region_det_res["boxes"],
+            key=lambda item: caculate_bbox_area(item["coordinate"]),
+        )
+        if len(region_det_res["boxes"]) == 0:
+            region_det_res["boxes"] = [
+                {
+                    "coordinate": base_region_bbox,
+                    "label": "SupplementaryRegion",
+                    "score": 1,
+                }
+            ]
+            region_to_block_map[0] = range(len(block_bboxes))
+        else:
+            block_idxes_set = set(range(len(block_bboxes)))
+            # match block to region
+            for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                matched_idxes = []
+                region_to_block_map[region_idx] = []
+                region_bbox = region_info["coordinate"]
+                for block_idx in block_idxes_set:
+                    if layout_det_res["boxes"][block_idx]["label"] in mask_labels:
+                        continue
+                    overlap_ratio = calculate_overlap_ratio(
+                        region_bbox, block_bboxes[block_idx], mode="small"
                     )
-                    overall_ocr_res["rec_polys"].extend(sub_ocr_res["rec_polys"])
-                    overall_ocr_res["rec_scores"].extend(sub_ocr_res["rec_scores"])
-                    overall_ocr_res["rec_labels"].extend(sub_ocr_res["rec_labels"])
-
-        for formula_res in formula_res_list:
-            x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-            poly_points = [
-                (x_min, y_min),
-                (x_max, y_min),
-                (x_max, y_max),
-                (x_min, y_max),
+                    if overlap_ratio > REGION_SETTINGS.get(
+                        "match_block_overlap_ratio_threshold", 0.8
+                    ):
+                        matched_idxes.append(block_idx)
+                old_region_bbox_matched_idxes = []
+                if len(matched_idxes) > 0:
+                    while len(old_region_bbox_matched_idxes) != len(matched_idxes):
+                        old_region_bbox_matched_idxes = copy.deepcopy(matched_idxes)
+                        matched_idxes = []
+                        matched_bboxes = [
+                            block_bboxes[idx] for idx in old_region_bbox_matched_idxes
+                        ]
+                        new_region_bbox = calculate_minimum_enclosing_bbox(
+                            matched_bboxes
+                        )
+                        for block_idx in block_idxes_set:
+                            if (
+                                layout_det_res["boxes"][block_idx]["label"]
+                                in mask_labels
+                            ):
+                                continue
+                            overlap_ratio = calculate_overlap_ratio(
+                                new_region_bbox, block_bboxes[block_idx], mode="small"
+                            )
+                            if overlap_ratio > REGION_SETTINGS.get(
+                                "match_block_overlap_ratio_threshold", 0.8
+                            ):
+                                matched_idxes.append(block_idx)
+                    for block_idx in matched_idxes:
+                        block_idxes_set.remove(block_idx)
+                    region_to_block_map[region_idx] = matched_idxes
+                    region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
+            # Supplement region when there is no matched block
+            while len(block_idxes_set) > 0:
+                unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
+                if len(unmatched_bboxes) == 0:
+                    break
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    unmatched_bboxes
+                )
+                matched_idxes = []
+                # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
+                for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                    if len(region_to_block_map[region_idx]) == 0:
+                        continue
+                    region_bbox = region_info["coordinate"]
+                    overlap_ratio = calculate_overlap_ratio(
+                        supplement_region_bbox, region_bbox
+                    )
+                    if overlap_ratio > 0:
+                        supplement_region_bbox, matched_idxes = (
+                            shrink_supplement_region_bbox(
+                                supplement_region_bbox,
+                                region_bbox,
+                                image.shape[1],
+                                image.shape[0],
+                                block_idxes_set,
+                                block_bboxes,
+                            )
+                        )
+
+                matched_idxes = [
+                    idx
+                    for idx in matched_idxes
+                    if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                ]
+                if len(matched_idxes) == 0:
+                    matched_idxes = [
+                        idx
+                        for idx in block_idxes_set
+                        if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                    ]
+                    if len(matched_idxes) == 0:
+                        break
+                matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    matched_bboxes
+                )
+                region_idx = len(region_det_res["boxes"])
+                region_to_block_map[region_idx] = list(matched_idxes)
+                for block_idx in matched_idxes:
+                    block_idxes_set.remove(block_idx)
+                region_det_res["boxes"].append(
+                    {
+                        "coordinate": supplement_region_bbox,
+                        "label": "SupplementaryRegion",
+                        "score": 1,
+                    }
+                )
+
+            mask_idxes = [
+                idx
+                for idx in range(len(layout_det_res["boxes"]))
+                if layout_det_res["boxes"][idx]["label"] in mask_labels
             ]
-            overall_ocr_res["dt_polys"].append(poly_points)
-            overall_ocr_res["rec_texts"].append(f"${formula_res['rec_formula']}$")
-            overall_ocr_res["rec_boxes"] = np.vstack(
-                (overall_ocr_res["rec_boxes"], [formula_res["dt_polys"]])
+            for idx in mask_idxes:
+                bbox = layout_det_res["boxes"][idx]["coordinate"]
+                region_idx = len(region_det_res["boxes"])
+                region_to_block_map[region_idx] = [idx]
+                region_det_res["boxes"].append(
+                    {
+                        "coordinate": bbox,
+                        "label": "SupplementaryRegion",
+                        "score": 1,
+                    }
+                )
+
+        region_block_ocr_idx_map = dict(
+            region_to_block_map=region_to_block_map,
+            block_to_ocr_map=block_to_ocr_map,
+        )
+
+        return region_block_ocr_idx_map, region_det_res, layout_det_res
+
+    def get_layout_parsing_objects(
+        self,
+        image: list,
+        region_block_ocr_idx_map: dict,
+        region_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        layout_det_res: DetResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Extract structured information from OCR and layout detection results.
+
+        Args:
+            image (list): The input image.
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
+            seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
+            text_rec_model (Any): A model for text recognition.
+            text_rec_score_thresh (Union[float, None]): The minimum score required for a recognized character to be considered valid. If None, use the default value specified during initialization. Default is None.
+
+        Returns:
+            list: A list of structured boxes where each item is a dictionary containing:
+                - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
+                - The label as a key with either table HTML or image data and text.
+                - "block_bbox": The coordinates of the layout box.
+        """
+
+        table_index = 0
+        seal_index = 0
+        chart_index = 0
+        layout_parsing_blocks: List[LayoutBlock] = []
+
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
+
+            label = box_info["label"]
+            block_bbox = box_info["coordinate"]
+            rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
+
+            block = LayoutBlock(label=label, bbox=block_bbox)
+
+            if label == "table" and len(table_res_list) > 0:
+                block.content = table_res_list[table_index]["pred_html"]
+                table_index += 1
+            elif label == "seal" and len(seal_res_list) > 0:
+                block.content = "\n".join(seal_res_list[seal_index]["rec_texts"])
+                seal_index += 1
+            elif label == "chart" and len(chart_res_list) > 0:
+                block.content = chart_res_list[chart_index]
+                chart_index += 1
+            else:
+                if label == "formula":
+                    _, ocr_idx_list = get_sub_regions_ocr_res(
+                        overall_ocr_res, [block_bbox], return_match_idx=True
+                    )
+                    region_block_ocr_idx_map["block_to_ocr_map"][box_idx] = ocr_idx_list
+                else:
+                    ocr_idx_list = region_block_ocr_idx_map["block_to_ocr_map"].get(
+                        box_idx, []
+                    )
+                for box_no in ocr_idx_list:
+                    rec_res["boxes"].append(overall_ocr_res["rec_boxes"][box_no])
+                    rec_res["rec_texts"].append(
+                        overall_ocr_res["rec_texts"][box_no],
+                    )
+                    rec_res["rec_labels"].append(
+                        overall_ocr_res["rec_labels"][box_no],
+                    )
+                block.update_text_content(
+                    image=image,
+                    ocr_rec_res=rec_res,
+                    text_rec_model=text_rec_model,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )
+
+            if (
+                label
+                in ["seal", "table", "formula", "chart"]
+                + BLOCK_LABEL_MAP["image_labels"]
+            ):
+                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
+                img_path = (
+                    f"imgs/img_in_{block.label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
+                )
+                img = Image.fromarray(image[y_min:y_max, x_min:x_max, ::-1])
+                block.image = {"path": img_path, "img": img}
+
+            layout_parsing_blocks.append(block)
+
+        page_region_bbox = [65535, 65535, 0, 0]
+        layout_parsing_regions: List[LayoutRegion] = []
+        for region_idx, region_info in enumerate(region_det_res["boxes"]):
+            region_bbox = np.array(region_info["coordinate"]).astype("int")
+            region_blocks = [
+                layout_parsing_blocks[idx]
+                for idx in region_block_ocr_idx_map["region_to_block_map"][region_idx]
+            ]
+            if region_blocks:
+                page_region_bbox = update_region_box(region_bbox, page_region_bbox)
+                region = LayoutRegion(bbox=region_bbox, blocks=region_blocks)
+                layout_parsing_regions.append(region)
+
+        layout_parsing_page = LayoutRegion(
+            bbox=np.array(page_region_bbox).astype("int"), blocks=layout_parsing_regions
+        )
+
+        return layout_parsing_page
+
+    def sort_layout_parsing_blocks(
+        self, layout_parsing_page: LayoutRegion
+    ) -> List[LayoutBlock]:
+        layout_parsing_regions = xycut_enhanced(layout_parsing_page)
+        parsing_res_list = []
+        for region in layout_parsing_regions:
+            layout_parsing_blocks = xycut_enhanced(region)
+            parsing_res_list.extend(layout_parsing_blocks)
+
+        return parsing_res_list
+
+    def get_layout_parsing_res(
+        self,
+        image: list,
+        region_det_res: DetResult,
+        layout_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        formula_res_list: list,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
+        Args:
+            image (list): The input image.
+            layout_det_res (DetResult): The detection result containing the layout information of the document.
+            overall_ocr_res (OCRResult): The overall OCR result containing text information.
+            table_res_list (list): A list of table recognition results.
+            seal_res_list (list): A list of seal recognition results.
+            formula_res_list (list): A list of formula recognition results.
+            text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
+        Returns:
+            list: A list of dictionaries representing the layout parsing result.
+        """
+
+        # Standardize data
+        region_block_ocr_idx_map, region_det_res, layout_det_res = (
+            self.standardized_data(
+                image=image,
+                region_det_res=region_det_res,
+                layout_det_res=layout_det_res,
+                overall_ocr_res=overall_ocr_res,
+                formula_res_list=formula_res_list,
+                text_rec_model=self.general_ocr_pipeline.text_rec_model,
+                text_rec_score_thresh=text_rec_score_thresh,
             )
-            overall_ocr_res["rec_labels"].append("formula")
-            overall_ocr_res["rec_polys"].append(poly_points)
-            overall_ocr_res["rec_scores"].append(1)
+        )
 
-        parsing_res_list = get_single_block_parsing_res(
-            self.general_ocr_pipeline,
+        # Format layout parsing block
+        layout_parsing_page = self.get_layout_parsing_objects(
+            image=image,
+            region_block_ocr_idx_map=region_block_ocr_idx_map,
+            region_det_res=region_det_res,
             overall_ocr_res=overall_ocr_res,
             layout_det_res=layout_det_res,
             table_res_list=table_res_list,
             seal_res_list=seal_res_list,
+            chart_res_list=chart_res_list,
+            text_rec_model=self.general_ocr_pipeline.text_rec_model,
+            text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
         )
 
+        parsing_res_list = self.sort_layout_parsing_blocks(layout_parsing_page)
+
+        index = 1
+        for block in parsing_res_list:
+            if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]:
+                block.order_index = index
+                index += 1
+
         return parsing_res_list
 
     def get_model_settings(
         self,
         use_doc_orientation_classify: Union[bool, None],
         use_doc_unwarping: Union[bool, None],
-        use_general_ocr: Union[bool, None],
         use_seal_recognition: Union[bool, None],
         use_table_recognition: Union[bool, None],
         use_formula_recognition: Union[bool, None],
+        use_chart_recognition: Union[bool, None],
+        use_region_detection: Union[bool, None],
     ) -> dict:
         """
         Get the model settings based on the provided parameters or default values.
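The region-to-block matching above relies on calculate_overlap_ratio with mode="small". As a standalone sketch (my reimplementation for illustration, not the package's code), the "small" mode measures intersection area against the smaller of the two boxes, so a block fully contained in a region scores 1.0 and clears the 0.8 threshold read from REGION_SETTINGS:

    def overlap_ratio_small(a, b):
        """Intersection area over the smaller box's area; boxes are (x1, y1, x2, y2)."""
        ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
        ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
        inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
        smaller = min((a[2] - a[0]) * (a[3] - a[1]), (b[2] - b[0]) * (b[3] - b[1]))
        return inter / smaller if smaller > 0 else 0.0

    # A block fully inside a region scores 1.0, well above the 0.8 threshold.
    assert overlap_ratio_small((0, 0, 10, 10), (2, 2, 8, 8)) == 1.0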
@@ -389,7 +847,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         Args:
             use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
             use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
-            use_general_ocr (Union[bool, None]): Enables general OCR if True. Defaults to system setting if None.
             use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
             use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
             use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
@@ -406,9 +863,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         else:
             use_doc_preprocessor = False
 
-        if use_general_ocr is None:
-            use_general_ocr = self.use_general_ocr
-
         if use_seal_recognition is None:
             use_seal_recognition = self.use_seal_recognition
 
@@ -418,24 +872,32 @@ class LayoutParsingPipelineV2(BasePipeline):
         if use_formula_recognition is None:
             use_formula_recognition = self.use_formula_recognition
 
+        if use_region_detection is None:
+            use_region_detection = self.use_region_detection
+
+        if use_chart_recognition is None:
+            use_chart_recognition = self.use_chart_recognition
+
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             use_formula_recognition=use_formula_recognition,
+            use_chart_recognition=use_chart_recognition,
+            use_region_detection=use_region_detection,
         )
 
     def predict(
         self,
         input: Union[str, list[str], np.ndarray, list[np.ndarray]],
-        use_doc_orientation_classify: Union[bool, None] = None,
-        use_doc_unwarping: Union[bool, None] = None,
+        use_doc_orientation_classify: Union[bool, None] = False,
+        use_doc_unwarping: Union[bool, None] = False,
         use_textline_orientation: Optional[bool] = None,
-        use_general_ocr: Union[bool, None] = None,
         use_seal_recognition: Union[bool, None] = None,
         use_table_recognition: Union[bool, None] = None,
         use_formula_recognition: Union[bool, None] = None,
+        use_chart_recognition: Union[bool, None] = False,
+        use_region_detection: Union[bool, None] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
         layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
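A usage sketch under the new predict signature (parameter names are from the diff; the pipeline name and input path are illustrative): document orientation classification and unwarping now default to False, chart recognition is opt-in, and passing None for use_region_detection falls back to the pipeline config.

    from paddlex import create_pipeline

    pipeline = create_pipeline(pipeline="PP-StructureV3")
    output = pipeline.predict(
        "page_1.png",                        # illustrative input
        use_doc_orientation_classify=False,  # new default in 3.0.2
        use_doc_unwarping=False,             # new default in 3.0.2
        use_chart_recognition=True,          # opt in; defaults to False
        use_region_detection=None,           # fall back to the config value
    )
    for res in output:
        res.save_to_json(save_path="output")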
@@ -452,7 +914,10 @@ class LayoutParsingPipelineV2(BasePipeline):
         seal_det_box_thresh: Union[float, None] = None,
         seal_det_unclip_ratio: Union[float, None] = None,
         seal_rec_score_thresh: Union[float, None] = None,
-        use_table_cells_ocr_results: bool = False,
+        use_wired_table_cells_trans_to_html: bool = False,
+        use_wireless_table_cells_trans_to_html: bool = False,
+        use_table_orientation_classify: bool = True,
+        use_ocr_results_with_table_cells: bool = True,
         use_e2e_wired_table_rec_model: bool = False,
         use_e2e_wireless_table_rec_model: bool = True,
         **kwargs,
@@ -464,10 +929,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
+            use_region_detection (Optional[bool]): Whether to use region detection.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
             layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
             layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -488,7 +953,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
             seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
             seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
-            use_table_cells_ocr_results (bool): whether to use OCR results with cells.
+            use_wired_table_cells_trans_to_html (bool): Whether to use wired table cells trans to HTML.
+            use_wireless_table_cells_trans_to_html (bool): Whether to use wireless table cells trans to HTML.
+            use_table_orientation_classify (bool): Whether to use table orientation classification.
+            use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells.
             use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
             use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
             **kwargs (Any): Additional settings to extend functionality.
@@ -496,154 +964,207 @@ class LayoutParsingPipelineV2(BasePipeline):
         Returns:
             LayoutParsingResultV2: The predicted layout parsing result.
         """
-
         model_settings = self.get_model_settings(
             use_doc_orientation_classify,
             use_doc_unwarping,
-            use_general_ocr,
             use_seal_recognition,
             use_table_recognition,
             use_formula_recognition,
+            use_chart_recognition,
+            use_region_detection,
         )

         if not self.check_model_settings_valid(model_settings):
             yield {"error": "the input params for model settings are invalid!"}

         for batch_data in self.batch_sampler(input):
-            image_array = self.img_reader(batch_data.instances)[0]
+            image_arrays = self.img_reader(batch_data.instances)

             if model_settings["use_doc_preprocessor"]:
-                doc_preprocessor_res = next(
+                doc_preprocessor_results = list(
                     self.doc_preprocessor_pipeline(
-                        image_array,
+                        image_arrays,
                         use_doc_orientation_classify=use_doc_orientation_classify,
                         use_doc_unwarping=use_doc_unwarping,
-                    ),
+                    )
                 )
             else:
-                doc_preprocessor_res = {"output_img": image_array}
+                doc_preprocessor_results = [{"output_img": arr} for arr in image_arrays]

-            doc_preprocessor_image = doc_preprocessor_res["output_img"]
+            doc_preprocessor_images = [
+                item["output_img"] for item in doc_preprocessor_results
+            ]

-            layout_det_res = next(
+            layout_det_results = list(
                 self.layout_det_model(
-                    doc_preprocessor_image,
+                    doc_preprocessor_images,
                     threshold=layout_threshold,
                     layout_nms=layout_nms,
                     layout_unclip_ratio=layout_unclip_ratio,
                     layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                 )
             )
-            imgs_in_doc = gather_imgs(doc_preprocessor_image, layout_det_res["boxes"])
+            imgs_in_doc = [
+                gather_imgs(img, res["boxes"])
+                for img, res in zip(doc_preprocessor_images, layout_det_results)
+            ]
+
+            if model_settings["use_region_detection"]:
+                region_det_results = list(
+                    self.region_detection_model(
+                        doc_preprocessor_images,
+                        layout_nms=True,
+                        layout_merge_bboxes_mode="small",
+                    ),
+                )
+            else:
+                region_det_results = [{"boxes": []} for _ in doc_preprocessor_images]

             if model_settings["use_formula_recognition"]:
-                formula_res_all = next(
+                formula_res_all = list(
                     self.formula_recognition_pipeline(
-                        doc_preprocessor_image,
+                        doc_preprocessor_images,
                         use_layout_detection=False,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
-                        layout_det_res=layout_det_res,
+                        layout_det_res=layout_det_results,
                     ),
                 )
-                formula_res_list = formula_res_all["formula_res_list"]
+                formula_res_lists = [
+                    item["formula_res_list"] for item in formula_res_all
+                ]
             else:
-                formula_res_list = []
+                formula_res_lists = [[] for _ in doc_preprocessor_images]

-            for formula_res in formula_res_list:
-                x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
-
-            if (
-                model_settings["use_general_ocr"]
-                or model_settings["use_table_recognition"]
+            for doc_preprocessor_image, formula_res_list in zip(
+                doc_preprocessor_images, formula_res_lists
             ):
-                overall_ocr_res = next(
-                    self.general_ocr_pipeline(
-                        doc_preprocessor_image,
-                        use_textline_orientation=use_textline_orientation,
-                        text_det_limit_side_len=text_det_limit_side_len,
-                        text_det_limit_type=text_det_limit_type,
-                        text_det_thresh=text_det_thresh,
-                        text_det_box_thresh=text_det_box_thresh,
-                        text_det_unclip_ratio=text_det_unclip_ratio,
-                        text_rec_score_thresh=text_rec_score_thresh,
-                    ),
-                )
-            else:
-                overall_ocr_res = {}
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
+
+            overall_ocr_results = list(
+                self.general_ocr_pipeline(
+                    doc_preprocessor_images,
+                    use_textline_orientation=use_textline_orientation,
+                    text_det_limit_side_len=text_det_limit_side_len,
+                    text_det_limit_type=text_det_limit_type,
+                    text_det_thresh=text_det_thresh,
+                    text_det_box_thresh=text_det_box_thresh,
+                    text_det_unclip_ratio=text_det_unclip_ratio,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                ),
+            )

-            overall_ocr_res["rec_labels"] = ["text"] * len(overall_ocr_res["rec_texts"])
+            for overall_ocr_res in overall_ocr_results:
+                overall_ocr_res["rec_labels"] = ["text"] * len(
+                    overall_ocr_res["rec_texts"]
+                )

             if model_settings["use_table_recognition"]:
-                table_contents = copy.deepcopy(overall_ocr_res)
-                for formula_res in formula_res_list:
-                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                    poly_points = [
-                        (x_min, y_min),
-                        (x_max, y_min),
-                        (x_max, y_max),
-                        (x_min, y_max),
-                    ]
-                    table_contents["dt_polys"].append(poly_points)
-                    table_contents["rec_texts"].append(
-                        f"${formula_res['rec_formula']}$"
-                    )
-                    table_contents["rec_boxes"] = np.vstack(
-                        (table_contents["rec_boxes"], [formula_res["dt_polys"]])
+                table_res_lists = []
+                for (
+                    layout_det_res,
+                    doc_preprocessor_image,
+                    overall_ocr_res,
+                    formula_res_list,
+                    imgs_in_doc_for_img,
+                ) in zip(
+                    layout_det_results,
+                    doc_preprocessor_images,
+                    overall_ocr_results,
+                    formula_res_lists,
+                    imgs_in_doc,
+                ):
+                    table_contents_for_img = copy.deepcopy(overall_ocr_res)
+                    for formula_res in formula_res_list:
+                        x_min, y_min, x_max, y_max = list(
+                            map(int, formula_res["dt_polys"])
+                        )
+                        poly_points = [
+                            (x_min, y_min),
+                            (x_max, y_min),
+                            (x_max, y_max),
+                            (x_min, y_max),
+                        ]
+                        table_contents_for_img["dt_polys"].append(poly_points)
+                        rec_formula = formula_res["rec_formula"]
+                        if not rec_formula.startswith("$") or not rec_formula.endswith(
+                            "$"
+                        ):
+                            rec_formula = f"${rec_formula}$"
+                        table_contents_for_img["rec_texts"].append(f"{rec_formula}")
+                        if table_contents_for_img["rec_boxes"].size == 0:
+                            table_contents_for_img["rec_boxes"] = np.array(
+                                [formula_res["dt_polys"]]
+                            )
+                        else:
+                            table_contents_for_img["rec_boxes"] = np.vstack(
+                                (
+                                    table_contents_for_img["rec_boxes"],
+                                    [formula_res["dt_polys"]],
+                                )
+                            )
+                        table_contents_for_img["rec_polys"].append(poly_points)
+                        table_contents_for_img["rec_scores"].append(1)
+
+                    for img in imgs_in_doc_for_img:
+                        img_path = img["path"]
+                        x_min, y_min, x_max, y_max = img["coordinate"]
+                        poly_points = [
+                            (x_min, y_min),
+                            (x_max, y_min),
+                            (x_max, y_max),
+                            (x_min, y_max),
+                        ]
+                        table_contents_for_img["dt_polys"].append(poly_points)
+                        table_contents_for_img["rec_texts"].append(
+                            f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
+                        )
+                        if table_contents_for_img["rec_boxes"].size == 0:
+                            table_contents_for_img["rec_boxes"] = np.array(
+                                [img["coordinate"]]
+                            )
+                        else:
+                            table_contents_for_img["rec_boxes"] = np.vstack(
+                                (table_contents_for_img["rec_boxes"], img["coordinate"])
+                            )
+                        table_contents_for_img["rec_polys"].append(poly_points)
+                        table_contents_for_img["rec_scores"].append(img["score"])
+
+                    table_res_all = list(
+                        self.table_recognition_pipeline(
+                            doc_preprocessor_image,
+                            use_doc_orientation_classify=False,
+                            use_doc_unwarping=False,
+                            use_layout_detection=False,
+                            use_ocr_model=False,
+                            overall_ocr_res=table_contents_for_img,
+                            layout_det_res=layout_det_res,
+                            cell_sort_by_y_projection=True,
+                            use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
+                            use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
+                            use_table_orientation_classify=use_table_orientation_classify,
+                            use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
+                            use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
+                            use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
+                        ),
                     )
-                    table_contents["rec_polys"].append(poly_points)
-                    table_contents["rec_scores"].append(1)
-
-                for img in imgs_in_doc:
-                    img_path = img["path"]
-                    x_min, y_min, x_max, y_max = img["coordinate"]
-                    poly_points = [
-                        (x_min, y_min),
-                        (x_max, y_min),
-                        (x_max, y_max),
-                        (x_min, y_max),
+                    single_table_res_lists = [
+                        item["table_res_list"] for item in table_res_all
                     ]
-                    table_contents["dt_polys"].append(poly_points)
-                    table_contents["rec_texts"].append(
-                        f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
-                    )
-                    if table_contents["rec_boxes"].size == 0:
-                        table_contents["rec_boxes"] = np.array([img["coordinate"]])
-                    else:
-                        table_contents["rec_boxes"] = np.vstack(
-                            (table_contents["rec_boxes"], img["coordinate"])
-                        )
-                    table_contents["rec_polys"].append(poly_points)
-                    table_contents["rec_scores"].append(img["score"])
-
-                table_res_all = next(
-                    self.table_recognition_pipeline(
-                        doc_preprocessor_image,
-                        use_doc_orientation_classify=False,
-                        use_doc_unwarping=False,
-                        use_layout_detection=False,
-                        use_ocr_model=False,
-                        overall_ocr_res=table_contents,
-                        layout_det_res=layout_det_res,
-                        cell_sort_by_y_projection=True,
-                        use_table_cells_ocr_results=use_table_cells_ocr_results,
-                        use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
-                        use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
-                    ),
-                )
-                table_res_list = table_res_all["table_res_list"]
+                    table_res_lists.extend(single_table_res_lists)
             else:
-                table_res_list = []
+                table_res_lists = [[] for _ in doc_preprocessor_images]

             if model_settings["use_seal_recognition"]:
-                seal_res_all = next(
+                seal_res_all = list(
                     self.seal_recognition_pipeline(
-                        doc_preprocessor_image,
+                        doc_preprocessor_images,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
                         use_layout_detection=False,
-                        layout_det_res=layout_det_res,
+                        layout_det_res=layout_det_results,
                         seal_det_limit_side_len=seal_det_limit_side_len,
                         seal_det_limit_type=seal_det_limit_type,
                         seal_det_thresh=seal_det_thresh,
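Two guards in the new table-contents assembly above are easy to miss: formula texts are wrapped in `$...$` only when the delimiters are missing, and the first box appended to an empty `rec_boxes` must be assigned with `np.array` because `np.vstack` cannot stack a shape-`(0,)` array with a `(1, 4)` row. A self-contained sketch of both, with hypothetical values:

```python
import numpy as np

# Guard 1: avoid double-wrapping LaTeX in "$...$".
rec_formula = r"\frac{a}{b}"             # hypothetical recognition output
if not rec_formula.startswith("$") or not rec_formula.endswith("$"):
    rec_formula = f"${rec_formula}$"
assert rec_formula == r"$\frac{a}{b}$"

# Guard 2: seed an empty accumulator before switching to np.vstack.
rec_boxes = np.array([])                 # page with no recognized text: shape (0,)
formula_box = [12, 30, 180, 60]          # hypothetical x_min, y_min, x_max, y_max
if rec_boxes.size == 0:
    rec_boxes = np.array([formula_box])  # first row: shape (1, 4)
else:
    rec_boxes = np.vstack((rec_boxes, [formula_box]))
print(rec_boxes.shape)                   # (1, 4)
```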
@@ -652,46 +1173,85 @@ class LayoutParsingPipelineV2(BasePipeline):
                         seal_rec_score_thresh=seal_rec_score_thresh,
                     ),
                 )
-                seal_res_list = seal_res_all["seal_res_list"]
+                seal_res_lists = [item["seal_res_list"] for item in seal_res_all]
             else:
-                seal_res_list = []
+                seal_res_lists = [[] for _ in doc_preprocessor_images]

-            parsing_res_list = self.get_layout_parsing_res(
+            for (
+                input_path,
+                page_index,
                 doc_preprocessor_image,
-                layout_det_res=layout_det_res,
-                overall_ocr_res=overall_ocr_res,
-                table_res_list=table_res_list,
-                seal_res_list=seal_res_list,
-                formula_res_list=formula_res_list,
-                imgs_in_doc=imgs_in_doc,
-                text_det_limit_side_len=text_det_limit_side_len,
-                text_det_limit_type=text_det_limit_type,
-                text_det_thresh=text_det_thresh,
-                text_det_box_thresh=text_det_box_thresh,
-                text_det_unclip_ratio=text_det_unclip_ratio,
-                text_rec_score_thresh=text_rec_score_thresh,
-            )
+                doc_preprocessor_res,
+                layout_det_res,
+                region_det_res,
+                overall_ocr_res,
+                table_res_list,
+                seal_res_list,
+                formula_res_list,
+                imgs_in_doc_for_img,
+            ) in zip(
+                batch_data.input_paths,
+                batch_data.page_indexes,
+                doc_preprocessor_images,
+                doc_preprocessor_results,
+                layout_det_results,
+                region_det_results,
+                overall_ocr_results,
+                table_res_lists,
+                seal_res_lists,
+                formula_res_lists,
+                imgs_in_doc,
+            ):
+                chart_res_list = []
+                if model_settings["use_chart_recognition"]:
+                    chart_imgs_list = []
+                    for bbox in layout_det_res["boxes"]:
+                        if bbox["label"] == "chart":
+                            x_min, y_min, x_max, y_max = bbox["coordinate"]
+                            chart_img = doc_preprocessor_image[
+                                int(y_min) : int(y_max), int(x_min) : int(x_max), :
+                            ]
+                            chart_imgs_list.append({"image": chart_img})
+
+                    for chart_res_batch in self.chart_recognition_model(
+                        input=chart_imgs_list
+                    ):
+                        chart_res_list.append(chart_res_batch["result"])
+
+                parsing_res_list = self.get_layout_parsing_res(
+                    doc_preprocessor_image,
+                    region_det_res=region_det_res,
+                    layout_det_res=layout_det_res,
+                    overall_ocr_res=overall_ocr_res,
+                    table_res_list=table_res_list,
+                    seal_res_list=seal_res_list,
+                    chart_res_list=chart_res_list,
+                    formula_res_list=formula_res_list,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )

-            for formula_res in formula_res_list:
-                x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
-                    "input_img"
-                ]
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
+                        "input_img"
+                    ]

-            single_img_res = {
-                "input_path": batch_data.input_paths[0],
-                "page_index": batch_data.page_indexes[0],
-                "doc_preprocessor_res": doc_preprocessor_res,
-                "layout_det_res": layout_det_res,
-                "overall_ocr_res": overall_ocr_res,
-                "table_res_list": table_res_list,
-                "seal_res_list": seal_res_list,
-                "formula_res_list": formula_res_list,
-                "parsing_res_list": parsing_res_list,
-                "imgs_in_doc": imgs_in_doc,
-                "model_settings": model_settings,
-            }
-            yield LayoutParsingResultV2(single_img_res)
+                single_img_res = {
+                    "input_path": input_path,
+                    "page_index": page_index,
+                    "doc_preprocessor_res": doc_preprocessor_res,
+                    "layout_det_res": layout_det_res,
+                    "region_det_res": region_det_res,
+                    "overall_ocr_res": overall_ocr_res,
+                    "table_res_list": table_res_list,
+                    "seal_res_list": seal_res_list,
+                    "chart_res_list": chart_res_list,
+                    "formula_res_list": formula_res_list,
+                    "parsing_res_list": parsing_res_list,
+                    "imgs_in_doc": imgs_in_doc_for_img,
+                    "model_settings": model_settings,
+                }
+                yield LayoutParsingResultV2(single_img_res)

     def concatenate_markdown_pages(self, markdown_list: list) -> tuple:
         """
@@ -747,3 +1307,15 @@ class LayoutParsingPipelineV2(BasePipeline):
         )

         return markdown_texts
+
+
+@pipeline_requires_extra("ocr")
+class LayoutParsingPipelineV2(AutoParallelImageSimpleInferencePipeline):
+    entities = ["PP-StructureV3"]
+
+    @property
+    def _pipeline_cls(self):
+        return _LayoutParsingPipelineV2
+
+    def _get_batch_size(self, config):
+        return config.get("batch_size", 1)
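The new public wrapper exposes the batched implementation under the old class name and, via `_get_batch_size`, reads an optional `batch_size` key from the pipeline config, defaulting to 1. A minimal illustration of that lookup; the config fragment is hypothetical:

```python
# Hypothetical config fragment; only the lookup mirrors _get_batch_size above.
config = {"pipeline_name": "PP-StructureV3", "batch_size": 4}

def get_batch_size(config: dict) -> int:
    return config.get("batch_size", 1)

print(get_batch_size(config))  # 4
print(get_batch_size({}))      # 1 when the key is absent
```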