paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
Files changed (233)
  1. paddlex/.version +1 -1
  2. paddlex/__init__.py +1 -1
  3. paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
  4. paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
  5. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
  6. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
  7. paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
  8. paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
  9. paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
  10. paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
  11. paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
  12. paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
  13. paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
  14. paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
  15. paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
  16. paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
  17. paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
  18. paddlex/configs/pipelines/OCR.yaml +7 -6
  19. paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
  20. paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
  21. paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
  22. paddlex/configs/pipelines/doc_understanding.yaml +1 -1
  23. paddlex/configs/pipelines/formula_recognition.yaml +2 -2
  24. paddlex/configs/pipelines/layout_parsing.yaml +3 -2
  25. paddlex/configs/pipelines/seal_recognition.yaml +1 -0
  26. paddlex/configs/pipelines/table_recognition.yaml +2 -1
  27. paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
  28. paddlex/hpip_links.html +20 -20
  29. paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
  30. paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
  31. paddlex/inference/common/result/mixin.py +19 -12
  32. paddlex/inference/models/base/predictor/base_predictor.py +2 -8
  33. paddlex/inference/models/common/static_infer.py +11 -59
  34. paddlex/inference/models/common/tokenizer/__init__.py +2 -0
  35. paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
  36. paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
  37. paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
  38. paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
  39. paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
  40. paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
  41. paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
  42. paddlex/inference/models/common/tokenizer/vocab.py +7 -7
  43. paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
  44. paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
  45. paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
  46. paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
  47. paddlex/inference/models/common/vlm/generation/utils.py +1 -1
  48. paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
  49. paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
  50. paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
  51. paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
  52. paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
  53. paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
  54. paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
  55. paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
  56. paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
  57. paddlex/inference/models/doc_vlm/predictor.py +79 -24
  58. paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
  59. paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
  60. paddlex/inference/models/doc_vlm/processors/common.py +189 -0
  61. paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
  62. paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
  63. paddlex/inference/models/formula_recognition/predictor.py +7 -1
  64. paddlex/inference/models/formula_recognition/processors.py +92 -79
  65. paddlex/inference/models/formula_recognition/result.py +28 -27
  66. paddlex/inference/models/image_feature/processors.py +3 -4
  67. paddlex/inference/models/keypoint_detection/predictor.py +3 -0
  68. paddlex/inference/models/object_detection/predictor.py +2 -0
  69. paddlex/inference/models/object_detection/processors.py +28 -3
  70. paddlex/inference/models/object_detection/utils.py +2 -0
  71. paddlex/inference/models/table_structure_recognition/result.py +0 -10
  72. paddlex/inference/models/text_detection/predictor.py +8 -0
  73. paddlex/inference/models/text_detection/processors.py +44 -10
  74. paddlex/inference/models/text_detection/result.py +0 -10
  75. paddlex/inference/pipelines/__init__.py +9 -5
  76. paddlex/inference/pipelines/_parallel.py +172 -0
  77. paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
  78. paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
  79. paddlex/inference/pipelines/base.py +14 -4
  80. paddlex/inference/pipelines/components/faisser.py +1 -1
  81. paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
  82. paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
  83. paddlex/inference/pipelines/formula_recognition/result.py +1 -11
  84. paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
  85. paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
  86. paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
  87. paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
  88. paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
  89. paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
  90. paddlex/inference/pipelines/layout_parsing/result.py +4 -17
  91. paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
  92. paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
  93. paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
  94. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
  95. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
  96. paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
  97. paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
  98. paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
  99. paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
  100. paddlex/inference/pipelines/ocr/pipeline.py +127 -70
  101. paddlex/inference/pipelines/ocr/result.py +19 -16
  102. paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
  103. paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
  104. paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
  105. paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
  106. paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
  107. paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
  108. paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
  109. paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
  110. paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
  111. paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
  112. paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
  113. paddlex/inference/pipelines/table_recognition/result.py +1 -1
  114. paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
  115. paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
  116. paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
  117. paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
  118. paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
  119. paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
  120. paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
  121. paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
  122. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
  123. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
  124. paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
  125. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
  126. paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
  127. paddlex/inference/serving/infra/utils.py +20 -22
  128. paddlex/inference/serving/schemas/formula_recognition.py +1 -1
  129. paddlex/inference/serving/schemas/layout_parsing.py +1 -2
  130. paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
  131. paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
  132. paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
  133. paddlex/inference/serving/schemas/seal_recognition.py +1 -1
  134. paddlex/inference/serving/schemas/table_recognition.py +2 -6
  135. paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
  136. paddlex/inference/utils/hpi.py +8 -1
  137. paddlex/inference/utils/hpi_model_info_collection.json +81 -2
  138. paddlex/inference/utils/io/readers.py +12 -12
  139. paddlex/inference/utils/mkldnn_blocklist.py +25 -0
  140. paddlex/inference/utils/official_models.py +14 -0
  141. paddlex/inference/utils/pp_option.py +29 -8
  142. paddlex/model.py +2 -2
  143. paddlex/modules/__init__.py +1 -1
  144. paddlex/modules/anomaly_detection/evaluator.py +2 -2
  145. paddlex/modules/base/__init__.py +1 -1
  146. paddlex/modules/base/evaluator.py +5 -5
  147. paddlex/modules/base/trainer.py +1 -1
  148. paddlex/modules/doc_vlm/dataset_checker.py +2 -2
  149. paddlex/modules/doc_vlm/evaluator.py +2 -2
  150. paddlex/modules/doc_vlm/exportor.py +2 -2
  151. paddlex/modules/doc_vlm/model_list.py +1 -1
  152. paddlex/modules/doc_vlm/trainer.py +2 -2
  153. paddlex/modules/face_recognition/evaluator.py +2 -2
  154. paddlex/modules/formula_recognition/evaluator.py +5 -2
  155. paddlex/modules/formula_recognition/model_list.py +3 -0
  156. paddlex/modules/formula_recognition/trainer.py +3 -0
  157. paddlex/modules/general_recognition/evaluator.py +1 -1
  158. paddlex/modules/image_classification/evaluator.py +2 -2
  159. paddlex/modules/image_classification/model_list.py +1 -0
  160. paddlex/modules/instance_segmentation/evaluator.py +1 -1
  161. paddlex/modules/keypoint_detection/evaluator.py +1 -1
  162. paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
  163. paddlex/modules/multilabel_classification/evaluator.py +2 -2
  164. paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
  165. paddlex/modules/object_detection/evaluator.py +2 -2
  166. paddlex/modules/object_detection/model_list.py +2 -0
  167. paddlex/modules/semantic_segmentation/evaluator.py +2 -2
  168. paddlex/modules/table_recognition/evaluator.py +2 -2
  169. paddlex/modules/text_detection/evaluator.py +2 -2
  170. paddlex/modules/text_detection/model_list.py +2 -0
  171. paddlex/modules/text_recognition/evaluator.py +2 -2
  172. paddlex/modules/text_recognition/model_list.py +2 -0
  173. paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
  174. paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
  175. paddlex/modules/ts_classification/evaluator.py +2 -2
  176. paddlex/modules/ts_forecast/evaluator.py +2 -2
  177. paddlex/modules/video_classification/evaluator.py +2 -2
  178. paddlex/modules/video_detection/evaluator.py +2 -2
  179. paddlex/ops/__init__.py +2 -2
  180. paddlex/paddlex_cli.py +19 -13
  181. paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
  182. paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
  183. paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
  184. paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
  185. paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
  186. paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
  187. paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
  188. paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
  189. paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
  190. paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
  191. paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
  192. paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
  193. paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
  194. paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
  195. paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
  196. paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
  197. paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
  198. paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
  199. paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
  200. paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
  201. paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
  202. paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
  203. paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
  204. paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
  205. paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
  206. paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
  207. paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
  208. paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
  209. paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
  210. paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
  211. paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
  212. paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
  213. paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
  214. paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
  215. paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
  216. paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
  217. paddlex/repo_apis/base/config.py +1 -1
  218. paddlex/repo_manager/core.py +3 -3
  219. paddlex/repo_manager/meta.py +6 -2
  220. paddlex/repo_manager/repo.py +17 -16
  221. paddlex/utils/custom_device_list.py +26 -2
  222. paddlex/utils/deps.py +1 -1
  223. paddlex/utils/device.py +15 -8
  224. paddlex/utils/env.py +4 -0
  225. paddlex/utils/flags.py +2 -4
  226. paddlex/utils/fonts/__init__.py +34 -4
  227. paddlex/utils/misc.py +1 -1
  228. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
  229. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
  230. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
  231. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
  232. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
  233. {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
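The remainder of this page is the diff for paddlex/inference/pipelines/layout_parsing/pipeline_v2.py (file 89 above, +893 -260), the reworked layout-parsing pipeline behind PP-StructureV3. The version bump itself is carried by paddlex/.version and paddlex/__init__.py (files 1 and 2). A minimal post-upgrade check, assuming the new wheel was installed with "pip install --upgrade paddlex==3.0.1" (a hypothetical command line, not taken from this diff):

# Sketch: confirm the installed wheel matches this diff's target version.
# importlib.metadata reads the installed dist-info, i.e. the METADATA/RECORD
# files renamed above from paddlex-3.0.0rc1.dist-info to paddlex-3.0.1.dist-info.
from importlib.metadata import version

assert version("paddlex") == "3.0.1", version("paddlex")
print("paddlex", version("paddlex"))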
@@ -15,9 +15,10 @@ from __future__ import annotations
15
15
 
16
16
  import copy
17
17
  import re
18
- from typing import Any, Dict, Optional, Tuple, Union
18
+ from typing import Any, Dict, List, Optional, Tuple, Union
19
19
 
20
20
  import numpy as np
21
+ from PIL import Image
21
22
 
22
23
  from ....utils import logging
23
24
  from ....utils.deps import pipeline_requires_extra
@@ -26,18 +27,31 @@ from ...common.reader import ReadImage
26
27
  from ...models.object_detection.result import DetResult
27
28
  from ...utils.hpi import HPIConfig
28
29
  from ...utils.pp_option import PaddlePredictorOption
30
+ from .._parallel import AutoParallelImageSimpleInferencePipeline
29
31
  from ..base import BasePipeline
30
32
  from ..ocr.result import OCRResult
31
- from .result_v2 import LayoutParsingResultV2
32
- from .utils import gather_imgs, get_single_block_parsing_res, get_sub_regions_ocr_res
33
-
34
-
35
- @pipeline_requires_extra("ocr")
36
- class LayoutParsingPipelineV2(BasePipeline):
33
+ from .result_v2 import LayoutParsingBlock, LayoutParsingRegion, LayoutParsingResultV2
34
+ from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, LINE_SETTINGS, REGION_SETTINGS
35
+ from .utils import (
36
+ caculate_bbox_area,
37
+ calculate_minimum_enclosing_bbox,
38
+ calculate_overlap_ratio,
39
+ convert_formula_res_to_ocr_format,
40
+ format_line,
41
+ gather_imgs,
42
+ get_bbox_intersection,
43
+ get_sub_regions_ocr_res,
44
+ group_boxes_into_lines,
45
+ remove_overlap_blocks,
46
+ shrink_supplement_region_bbox,
47
+ split_boxes_by_projection,
48
+ update_region_box,
49
+ )
50
+
51
+
52
+ class _LayoutParsingPipelineV2(BasePipeline):
37
53
  """Layout Parsing Pipeline V2"""
38
54
 
39
- entities = ["PP-StructureV3"]
40
-
41
55
  def __init__(
42
56
  self,
43
57
  config: dict,
@@ -53,9 +67,9 @@ class LayoutParsingPipelineV2(BasePipeline):
53
67
  device (str, optional): Device to run the predictions on. Defaults to None.
54
68
  pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
55
69
  use_hpip (bool, optional): Whether to use the high-performance
56
- inference plugin (HPIP). Defaults to False.
70
+ inference plugin (HPIP) by default. Defaults to False.
57
71
  hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
58
- The high-performance inference configuration dictionary.
72
+ The default high-performance inference configuration dictionary.
59
73
  Defaults to None.
60
74
  """
61
75
 
@@ -68,8 +82,7 @@ class LayoutParsingPipelineV2(BasePipeline):
68
82
 
69
83
  self.inintial_predictor(config)
70
84
 
71
- self.batch_sampler = ImageBatchSampler(batch_size=1)
72
-
85
+ self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
73
86
  self.img_reader = ReadImage(format="BGR")
74
87
 
75
88
  def inintial_predictor(self, config: dict) -> None:
@@ -83,13 +96,20 @@ class LayoutParsingPipelineV2(BasePipeline):
83
96
  """
84
97
 
85
98
  self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
86
- self.use_general_ocr = config.get("use_general_ocr", True)
87
99
  self.use_table_recognition = config.get("use_table_recognition", True)
88
100
  self.use_seal_recognition = config.get("use_seal_recognition", True)
101
+ self.use_region_detection = config.get(
102
+ "use_region_detection",
103
+ True,
104
+ )
89
105
  self.use_formula_recognition = config.get(
90
106
  "use_formula_recognition",
91
107
  True,
92
108
  )
109
+ self.use_chart_recognition = config.get(
110
+ "use_chart_recognition",
111
+ False,
112
+ )
93
113
 
94
114
  if self.use_doc_preprocessor:
95
115
  doc_preprocessor_config = config.get("SubPipelines", {}).get(
@@ -101,6 +121,16 @@ class LayoutParsingPipelineV2(BasePipeline):
101
121
  self.doc_preprocessor_pipeline = self.create_pipeline(
102
122
  doc_preprocessor_config,
103
123
  )
124
+ if self.use_region_detection:
125
+ region_detection_config = config.get("SubModules", {}).get(
126
+ "RegionDetection",
127
+ {
128
+ "model_config_error": "config error for block_region_detection_model!"
129
+ },
130
+ )
131
+ self.region_detection_model = self.create_model(
132
+ region_detection_config,
133
+ )
104
134
 
105
135
  layout_det_config = config.get("SubModules", {}).get(
106
136
  "LayoutDetection",
@@ -123,14 +153,13 @@ class LayoutParsingPipelineV2(BasePipeline):
123
153
  layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
124
154
  self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
125
155
 
126
- if self.use_general_ocr or self.use_table_recognition:
127
- general_ocr_config = config.get("SubPipelines", {}).get(
128
- "GeneralOCR",
129
- {"pipeline_config_error": "config error for general_ocr_pipeline!"},
130
- )
131
- self.general_ocr_pipeline = self.create_pipeline(
132
- general_ocr_config,
133
- )
156
+ general_ocr_config = config.get("SubPipelines", {}).get(
157
+ "GeneralOCR",
158
+ {"pipeline_config_error": "config error for general_ocr_pipeline!"},
159
+ )
160
+ self.general_ocr_pipeline = self.create_pipeline(
161
+ general_ocr_config,
162
+ )
134
163
 
135
164
  if self.use_seal_recognition:
136
165
  seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -165,6 +194,17 @@ class LayoutParsingPipelineV2(BasePipeline):
165
194
  formula_recognition_config,
166
195
  )
167
196
 
197
+ if self.use_chart_recognition:
198
+ chart_recognition_config = config.get("SubModules", {}).get(
199
+ "ChartRecognition",
200
+ {
201
+ "model_config_error": "config error for block_region_detection_model!"
202
+ },
203
+ )
204
+ self.chart_recognition_model = self.create_model(
205
+ chart_recognition_config,
206
+ )
207
+
168
208
  return
169
209
 
170
210
  def get_text_paragraphs_ocr_res(
@@ -209,12 +249,6 @@ class LayoutParsingPipelineV2(BasePipeline):
209
249
  )
210
250
  return False
211
251
 
212
- if input_params["use_general_ocr"] and not self.use_general_ocr:
213
- logging.error(
214
- "Set use_general_ocr, but the models for general OCR are not initialized.",
215
- )
216
- return False
217
-
218
252
  if input_params["use_seal_recognition"] and not self.use_seal_recognition:
219
253
  logging.error(
220
254
  "Set use_seal_recognition, but the models for seal recognition are not initialized.",
@@ -229,159 +263,643 @@ class LayoutParsingPipelineV2(BasePipeline):
229
263
 
230
264
  return True
231
265
 
232
- def get_layout_parsing_res(
266
+ def standardized_data(
233
267
  self,
234
268
  image: list,
269
+ region_det_res: DetResult,
235
270
  layout_det_res: DetResult,
236
271
  overall_ocr_res: OCRResult,
237
- table_res_list: list,
238
- seal_res_list: list,
239
272
  formula_res_list: list,
240
- imgs_in_doc: list,
241
- text_det_limit_side_len: Optional[int] = None,
242
- text_det_limit_type: Optional[str] = None,
243
- text_det_thresh: Optional[float] = None,
244
- text_det_box_thresh: Optional[float] = None,
245
- text_det_unclip_ratio: Optional[float] = None,
246
- text_rec_score_thresh: Optional[float] = None,
273
+ text_rec_model: Any,
274
+ text_rec_score_thresh: Union[float, None] = None,
247
275
  ) -> list:
248
276
  """
249
277
  Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
250
278
  Args:
251
279
  image (list): The input image.
252
- layout_det_res (DetResult): The detection result containing the layout information of the document.
253
- overall_ocr_res (OCRResult): The overall OCR result containing text information.
254
- table_res_list (list): A list of table recognition results.
255
- seal_res_list (list): A list of seal recognition results.
280
+ overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
281
+ - "input_img": The image on which OCR was performed.
282
+ - "dt_boxes": A list of detected text box coordinates.
283
+ - "rec_texts": A list of recognized text corresponding to the detected boxes.
284
+
285
+ layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
286
+ - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
287
+
288
+ table_res_list (list): A list of table detection results, where each item is a dictionary containing:
289
+ - "block_bbox": The bounding box of the table layout.
290
+ - "pred_html": The predicted HTML representation of the table.
291
+
256
292
  formula_res_list (list): A list of formula recognition results.
257
- text_det_limit_side_len (Optional[int], optional): The maximum side length of the text detection region. Defaults to None.
258
- text_det_limit_type (Optional[str], optional): The type of limit for the text detection region. Defaults to None.
259
- text_det_thresh (Optional[float], optional): The confidence threshold for text detection. Defaults to None.
260
- text_det_box_thresh (Optional[float], optional): The confidence threshold for text detection bounding boxes. Defaults to None
261
- text_det_unclip_ratio (Optional[float], optional): The unclip ratio for text detection. Defaults to None.
293
+ text_rec_model (Any): The text recognition model.
262
294
  text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
263
295
  Returns:
264
296
  list: A list of dictionaries representing the layout parsing result.
265
297
  """
298
+
266
299
  matched_ocr_dict = {}
267
- image = np.array(image)
300
+ region_to_block_map = {}
301
+ block_to_ocr_map = {}
268
302
  object_boxes = []
269
303
  footnote_list = []
270
- max_bottom_text_coordinate = 0
304
+ paragraph_title_list = []
305
+ bottom_text_y_max = 0
306
+ max_block_area = 0.0
307
+ doc_title_num = 0
308
+
309
+ base_region_bbox = [65535, 65535, 0, 0]
310
+ layout_det_res = remove_overlap_blocks(
311
+ layout_det_res,
312
+ threshold=0.5,
313
+ smaller=True,
314
+ )
271
315
 
272
- for object_box_idx, box_info in enumerate(layout_det_res["boxes"]):
316
+ # convert formula_res_list to OCRResult format
317
+ convert_formula_res_to_ocr_format(formula_res_list, overall_ocr_res)
318
+
319
+ # match layout boxes and ocr boxes and get some information for layout_order_config
320
+ for box_idx, box_info in enumerate(layout_det_res["boxes"]):
273
321
  box = box_info["coordinate"]
274
322
  label = box_info["label"].lower()
275
323
  object_boxes.append(box)
324
+ _, _, _, y2 = box
325
+
326
+ # update the region box and max_block_area according to the layout boxes
327
+ base_region_bbox = update_region_box(box, base_region_bbox)
328
+ max_block_area = max(max_block_area, caculate_bbox_area(box))
329
+
330
+ # update_layout_order_config_block_index(layout_order_config, label, box_idx)
276
331
 
277
332
  # set the label of footnote to text, when it is above the text boxes
278
333
  if label == "footnote":
279
- footnote_list.append(object_box_idx)
280
- if label == "text" and box[3] > max_bottom_text_coordinate:
281
- max_bottom_text_coordinate = box[3]
334
+ footnote_list.append(box_idx)
335
+ elif label == "paragraph_title":
336
+ paragraph_title_list.append(box_idx)
337
+ if label == "text":
338
+ bottom_text_y_max = max(y2, bottom_text_y_max)
339
+ if label == "doc_title":
340
+ doc_title_num += 1
282
341
 
283
342
  if label not in ["formula", "table", "seal"]:
284
- _, matched_idxs = get_sub_regions_ocr_res(
343
+ _, matched_idxes = get_sub_regions_ocr_res(
285
344
  overall_ocr_res, [box], return_match_idx=True
286
345
  )
287
- for matched_idx in matched_idxs:
346
+ block_to_ocr_map[box_idx] = matched_idxes
347
+ for matched_idx in matched_idxes:
288
348
  if matched_ocr_dict.get(matched_idx, None) is None:
289
- matched_ocr_dict[matched_idx] = [object_box_idx]
349
+ matched_ocr_dict[matched_idx] = [box_idx]
290
350
  else:
291
- matched_ocr_dict[matched_idx].append(object_box_idx)
351
+ matched_ocr_dict[matched_idx].append(box_idx)
292
352
 
353
+ # fix the footnote label
293
354
  for footnote_idx in footnote_list:
294
355
  if (
295
356
  layout_det_res["boxes"][footnote_idx]["coordinate"][3]
296
- < max_bottom_text_coordinate
357
+ < bottom_text_y_max
297
358
  ):
298
359
  layout_det_res["boxes"][footnote_idx]["label"] = "text"
299
360
 
300
- already_processed = set()
301
- for matched_idx, layout_box_ids in matched_ocr_dict.items():
302
- if len(layout_box_ids) <= 1:
303
- continue
304
-
305
- # one ocr is matched to multiple layout boxes, split the text into multiple lines
306
- for idx in layout_box_ids:
307
- if idx in already_processed:
308
- continue
309
-
310
- already_processed.add(idx)
311
- wht_im = np.ones(image.shape, dtype=image.dtype) * 255
312
- box = object_boxes[idx]
313
- x1, y1, x2, y2 = [int(i) for i in box]
314
- wht_im[y1:y2, x1:x2, :] = image[y1:y2, x1:x2, :]
315
- sub_ocr_res = next(
316
- self.general_ocr_pipeline(
317
- wht_im,
318
- text_det_limit_side_len=text_det_limit_side_len,
319
- text_det_limit_type=text_det_limit_type,
320
- text_det_thresh=text_det_thresh,
321
- text_det_box_thresh=text_det_box_thresh,
322
- text_det_unclip_ratio=text_det_unclip_ratio,
323
- text_rec_score_thresh=text_rec_score_thresh,
361
+ # check if there is only one paragraph title and without doc_title
362
+ only_one_paragraph_title = len(paragraph_title_list) == 1 and doc_title_num == 0
363
+ if only_one_paragraph_title:
364
+ paragraph_title_block_area = caculate_bbox_area(
365
+ layout_det_res["boxes"][paragraph_title_list[0]]["coordinate"]
366
+ )
367
+ title_area_max_block_threshold = BLOCK_SETTINGS.get(
368
+ "title_conversion_area_ratio_threshold", 0.3
369
+ )
370
+ if (
371
+ paragraph_title_block_area
372
+ > max_block_area * title_area_max_block_threshold
373
+ ):
374
+ layout_det_res["boxes"][paragraph_title_list[0]]["label"] = "doc_title"
375
+
376
+ # Replace the OCR information of the hurdles.
377
+ for overall_ocr_idx, layout_box_ids in matched_ocr_dict.items():
378
+ if len(layout_box_ids) > 1:
379
+ matched_no = 0
380
+ overall_ocr_box = copy.deepcopy(
381
+ overall_ocr_res["rec_boxes"][overall_ocr_idx]
382
+ )
383
+ overall_ocr_dt_poly = copy.deepcopy(
384
+ overall_ocr_res["dt_polys"][overall_ocr_idx]
385
+ )
386
+ for box_idx in layout_box_ids:
387
+ layout_box = layout_det_res["boxes"][box_idx]["coordinate"]
388
+ crop_box = get_bbox_intersection(overall_ocr_box, layout_box)
389
+ for ocr_idx in block_to_ocr_map[box_idx]:
390
+ ocr_box = overall_ocr_res["rec_boxes"][ocr_idx]
391
+ iou = calculate_overlap_ratio(ocr_box, crop_box, "small")
392
+ if iou > 0.8:
393
+ overall_ocr_res["rec_texts"][ocr_idx] = ""
394
+ x1, y1, x2, y2 = [int(i) for i in crop_box]
395
+ crop_img = np.array(image)[y1:y2, x1:x2]
396
+ crop_img_rec_res = list(text_rec_model([crop_img]))[0]
397
+ crop_img_dt_poly = get_bbox_intersection(
398
+ overall_ocr_dt_poly, layout_box, return_format="poly"
399
+ )
400
+ crop_img_rec_score = crop_img_rec_res["rec_score"]
401
+ crop_img_rec_text = crop_img_rec_res["rec_text"]
402
+ text_rec_score_thresh = (
403
+ text_rec_score_thresh
404
+ if text_rec_score_thresh is not None
405
+ else (self.general_ocr_pipeline.text_rec_score_thresh)
324
406
  )
407
+ if crop_img_rec_score >= text_rec_score_thresh:
408
+ matched_no += 1
409
+ if matched_no == 1:
410
+ # the first matched ocr be replaced by the first matched layout box
411
+ overall_ocr_res["dt_polys"][
412
+ overall_ocr_idx
413
+ ] = crop_img_dt_poly
414
+ overall_ocr_res["rec_boxes"][overall_ocr_idx] = crop_box
415
+ overall_ocr_res["rec_polys"][
416
+ overall_ocr_idx
417
+ ] = crop_img_dt_poly
418
+ overall_ocr_res["rec_scores"][
419
+ overall_ocr_idx
420
+ ] = crop_img_rec_score
421
+ overall_ocr_res["rec_texts"][
422
+ overall_ocr_idx
423
+ ] = crop_img_rec_text
424
+ else:
425
+ # the other matched ocr be appended to the overall ocr result
426
+ overall_ocr_res["dt_polys"].append(crop_img_dt_poly)
427
+ overall_ocr_res["rec_boxes"] = np.vstack(
428
+ (overall_ocr_res["rec_boxes"], crop_box)
429
+ )
430
+ overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
431
+ overall_ocr_res["rec_scores"].append(crop_img_rec_score)
432
+ overall_ocr_res["rec_texts"].append(crop_img_rec_text)
433
+ overall_ocr_res["rec_labels"].append("text")
434
+ block_to_ocr_map[box_idx].remove(overall_ocr_idx)
435
+ block_to_ocr_map[box_idx].append(
436
+ len(overall_ocr_res["rec_texts"]) - 1
437
+ )
438
+
439
+ # use layout bbox to do ocr recognition when there is no matched ocr
440
+ for layout_box_idx, overall_ocr_idxes in block_to_ocr_map.items():
441
+ has_text = False
442
+ for idx in overall_ocr_idxes:
443
+ if overall_ocr_res["rec_texts"][idx] != "":
444
+ has_text = True
445
+ break
446
+ if not has_text and layout_det_res["boxes"][layout_box_idx][
447
+ "label"
448
+ ] not in BLOCK_LABEL_MAP.get("vision_labels", []):
449
+ crop_box = layout_det_res["boxes"][layout_box_idx]["coordinate"]
450
+ x1, y1, x2, y2 = [int(i) for i in crop_box]
451
+ crop_img = np.array(image)[y1:y2, x1:x2]
452
+ crop_img_rec_res = next(text_rec_model([crop_img]))
453
+ crop_img_dt_poly = get_bbox_intersection(
454
+ crop_box, crop_box, return_format="poly"
325
455
  )
326
- _, matched_idxs = get_sub_regions_ocr_res(
327
- overall_ocr_res, [box], return_match_idx=True
456
+ crop_img_rec_score = crop_img_rec_res["rec_score"]
457
+ crop_img_rec_text = crop_img_rec_res["rec_text"]
458
+ text_rec_score_thresh = (
459
+ text_rec_score_thresh
460
+ if text_rec_score_thresh is not None
461
+ else (self.general_ocr_pipeline.text_rec_score_thresh)
328
462
  )
329
- for matched_idx in sorted(matched_idxs, reverse=True):
330
- del overall_ocr_res["dt_polys"][matched_idx]
331
- del overall_ocr_res["rec_texts"][matched_idx]
332
- overall_ocr_res["rec_boxes"] = np.delete(
333
- overall_ocr_res["rec_boxes"], matched_idx, axis=0
463
+ if crop_img_rec_score >= text_rec_score_thresh:
464
+ overall_ocr_res["rec_boxes"] = np.vstack(
465
+ (overall_ocr_res["rec_boxes"], crop_box)
466
+ )
467
+ overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
468
+ overall_ocr_res["rec_scores"].append(crop_img_rec_score)
469
+ overall_ocr_res["rec_texts"].append(crop_img_rec_text)
470
+ overall_ocr_res["rec_labels"].append("text")
471
+ block_to_ocr_map[layout_box_idx].append(
472
+ len(overall_ocr_res["rec_texts"]) - 1
334
473
  )
335
- del overall_ocr_res["rec_polys"][matched_idx]
336
- del overall_ocr_res["rec_scores"][matched_idx]
337
474
 
338
- if sub_ocr_res["rec_boxes"].size > 0:
339
- sub_ocr_res["rec_labels"] = ["text"] * len(sub_ocr_res["rec_texts"])
475
+ # when there is no layout detection result but there is ocr result, convert ocr detection result to layout detection result
476
+ if len(layout_det_res["boxes"]) == 0 and len(overall_ocr_res["rec_boxes"]) > 0:
477
+ for idx, ocr_rec_box in enumerate(overall_ocr_res["rec_boxes"]):
478
+ base_region_bbox = update_region_box(ocr_rec_box, base_region_bbox)
479
+ layout_det_res["boxes"].append(
480
+ {
481
+ "label": "text",
482
+ "coordinate": ocr_rec_box,
483
+ "score": overall_ocr_res["rec_scores"][idx],
484
+ }
485
+ )
486
+ block_to_ocr_map[idx] = [idx]
340
487
 
341
- overall_ocr_res["dt_polys"].extend(sub_ocr_res["dt_polys"])
342
- overall_ocr_res["rec_texts"].extend(sub_ocr_res["rec_texts"])
343
- overall_ocr_res["rec_boxes"] = np.concatenate(
344
- [overall_ocr_res["rec_boxes"], sub_ocr_res["rec_boxes"]], axis=0
488
+ block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
489
+ region_det_res["boxes"] = sorted(
490
+ region_det_res["boxes"],
491
+ key=lambda item: caculate_bbox_area(item["coordinate"]),
492
+ )
493
+ if len(region_det_res["boxes"]) == 0:
494
+ region_det_res["boxes"] = [
495
+ {
496
+ "coordinate": base_region_bbox,
497
+ "label": "SupplementaryRegion",
498
+ "score": 1,
499
+ }
500
+ ]
501
+ region_to_block_map[0] = range(len(block_bboxes))
502
+ else:
503
+ block_idxes_set = set(range(len(block_bboxes)))
504
+ # match block to region
505
+ for region_idx, region_info in enumerate(region_det_res["boxes"]):
506
+ matched_idxes = []
507
+ region_to_block_map[region_idx] = []
508
+ region_bbox = region_info["coordinate"]
509
+ for block_idx in block_idxes_set:
510
+ overlap_ratio = calculate_overlap_ratio(
511
+ region_bbox, block_bboxes[block_idx], mode="small"
345
512
  )
346
- overall_ocr_res["rec_polys"].extend(sub_ocr_res["rec_polys"])
347
- overall_ocr_res["rec_scores"].extend(sub_ocr_res["rec_scores"])
348
- overall_ocr_res["rec_labels"].extend(sub_ocr_res["rec_labels"])
349
-
350
- for formula_res in formula_res_list:
351
- x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
352
- poly_points = [
353
- (x_min, y_min),
354
- (x_max, y_min),
355
- (x_max, y_max),
356
- (x_min, y_max),
513
+ if overlap_ratio > REGION_SETTINGS.get(
514
+ "match_block_overlap_ratio_threshold", 0.8
515
+ ):
516
+ region_to_block_map[region_idx].append(block_idx)
517
+ matched_idxes.append(block_idx)
518
+ if len(matched_idxes) > 0:
519
+ for block_idx in matched_idxes:
520
+ block_idxes_set.remove(block_idx)
521
+ matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
522
+ new_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
523
+ region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
524
+ # Supplement region when there is no matched block
525
+ if len(block_idxes_set) > 0:
526
+ while len(block_idxes_set) > 0:
527
+ matched_idxes = []
528
+ unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
529
+ supplement_region_bbox = calculate_minimum_enclosing_bbox(
530
+ unmatched_bboxes
531
+ )
532
+ # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
533
+ for region_info in region_det_res["boxes"]:
534
+ region_bbox = region_info["coordinate"]
535
+ overlap_ratio = calculate_overlap_ratio(
536
+ supplement_region_bbox, region_bbox
537
+ )
538
+ if overlap_ratio > 0:
539
+ supplement_region_bbox, matched_idxes = (
540
+ shrink_supplement_region_bbox(
541
+ supplement_region_bbox,
542
+ region_bbox,
543
+ image.shape[1],
544
+ image.shape[0],
545
+ block_idxes_set,
546
+ block_bboxes,
547
+ )
548
+ )
549
+ if len(matched_idxes) == 0:
550
+ matched_idxes = list(block_idxes_set)
551
+ region_idx = len(region_det_res["boxes"])
552
+ region_to_block_map[region_idx] = list(matched_idxes)
553
+ for block_idx in matched_idxes:
554
+ block_idxes_set.remove(block_idx)
555
+ region_det_res["boxes"].append(
556
+ {
557
+ "coordinate": supplement_region_bbox,
558
+ "label": "SupplementaryRegion",
559
+ "score": 1,
560
+ }
561
+ )
562
+
563
+ region_block_ocr_idx_map = dict(
564
+ region_to_block_map=region_to_block_map,
565
+ block_to_ocr_map=block_to_ocr_map,
566
+ )
567
+
568
+ return region_block_ocr_idx_map, region_det_res, layout_det_res
569
+
570
+ def sort_line_by_projection(
571
+ self,
572
+ line: List[List[Union[List[int], str]]],
573
+ input_img: np.ndarray,
574
+ text_rec_model: Any,
575
+ text_rec_score_thresh: Union[float, None] = None,
576
+ direction: str = "vertical",
577
+ ) -> None:
578
+ """
579
+ Sort a line of text spans based on their vertical position within the layout bounding box.
580
+
581
+ Args:
582
+ line (list): A list of spans, where each span is a list containing a bounding box and text.
583
+ input_img (ndarray): The input image used for OCR.
584
+ general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
585
+
586
+ Returns:
587
+ list: The sorted line of text spans.
588
+ """
589
+ sort_index = 0 if direction == "horizontal" else 1
590
+ splited_boxes = split_boxes_by_projection(line, direction)
591
+ splited_lines = []
592
+ if len(line) != len(splited_boxes):
593
+ splited_boxes.sort(key=lambda span: span[0][sort_index])
594
+ for span in splited_boxes:
595
+ bbox, text, label = span
596
+ if label == "text":
597
+ crop_img = input_img[
598
+ int(bbox[1]) : int(bbox[3]),
599
+ int(bbox[0]) : int(bbox[2]),
600
+ ]
601
+ crop_img_rec_res = list(text_rec_model([crop_img]))[0]
602
+ crop_img_rec_score = crop_img_rec_res["rec_score"]
603
+ crop_img_rec_text = crop_img_rec_res["rec_text"]
604
+ text = (
605
+ crop_img_rec_text
606
+ if crop_img_rec_score >= text_rec_score_thresh
607
+ else ""
608
+ )
609
+ span[1] = text
610
+
611
+ splited_lines.append(span)
612
+ else:
613
+ splited_lines = line
614
+
615
+ return splited_lines
616
+
617
+ def get_block_rec_content(
618
+ self,
619
+ image: list,
620
+ ocr_rec_res: dict,
621
+ block: LayoutParsingBlock,
622
+ text_rec_model: Any,
623
+ text_rec_score_thresh: Union[float, None] = None,
624
+ ) -> str:
625
+
626
+ if len(ocr_rec_res["rec_texts"]) == 0:
627
+ block.content = ""
628
+ return block
629
+
630
+ lines, text_direction, text_line_height = group_boxes_into_lines(
631
+ ocr_rec_res,
632
+ LINE_SETTINGS.get("line_height_iou_threshold", 0.8),
633
+ )
634
+
635
+ # format line
636
+ text_lines = []
637
+ need_new_line_num = 0
638
+ # words start coordinate and stop coordinate in the line
639
+ words_start_index = 0 if text_direction == "horizontal" else 1
640
+ words_stop_index = words_start_index + 2
641
+ lines_start_index = 1 if text_direction == "horizontal" else 3
642
+ line_width_list = []
643
+
644
+ if block.label == "reference":
645
+ rec_boxes = ocr_rec_res["boxes"]
646
+ block_start_coordinate = min([box[words_start_index] for box in rec_boxes])
647
+ block_stop_coordinate = max([box[words_stop_index] for box in rec_boxes])
648
+ else:
649
+ block_start_coordinate = block.bbox[words_start_index]
650
+ block_stop_coordinate = block.bbox[words_stop_index]
651
+
652
+ for idx, line in enumerate(lines):
653
+ line.sort(
654
+ key=lambda span: (
655
+ span[0][words_start_index] // 2,
656
+ (
657
+ span[0][lines_start_index]
658
+ if text_direction == "horizontal"
659
+ else -span[0][lines_start_index]
660
+ ),
661
+ )
662
+ )
663
+
664
+ line_width = line[-1][0][words_stop_index] - line[0][0][words_start_index]
665
+ line_width_list.append(line_width)
666
+ # merge formula and text
667
+ ocr_labels = [span[2] for span in line]
668
+ if "formula" in ocr_labels:
669
+ line = self.sort_line_by_projection(
670
+ line, image, text_rec_model, text_rec_score_thresh, text_direction
671
+ )
672
+
673
+ line_text, need_new_line = format_line(
674
+ line,
675
+ text_direction,
676
+ np.max(line_width_list),
677
+ block_start_coordinate,
678
+ block_stop_coordinate,
679
+ line_gap_limit=text_line_height * 1.5,
680
+ block_label=block.label,
681
+ )
682
+ if need_new_line:
683
+ need_new_line_num += 1
684
+ if idx == 0:
685
+ line_start_coordinate = line[0][0][0]
686
+ block.seg_start_coordinate = line_start_coordinate
687
+ elif idx == len(lines) - 1:
688
+ line_end_coordinate = line[-1][0][2]
689
+ block.seg_end_coordinate = line_end_coordinate
690
+ text_lines.append(line_text)
691
+
692
+ delim = LINE_SETTINGS["delimiter_map"].get(block.label, "")
693
+ if need_new_line_num > len(text_lines) * 0.5 and delim == "":
694
+ text_lines = [text.replace("\n", "") for text in text_lines]
695
+ delim = "\n"
696
+ content = delim.join(text_lines)
697
+ block.content = content
698
+ block.num_of_lines = len(text_lines)
699
+ block.direction = text_direction
700
+ block.text_line_height = text_line_height
701
+ block.text_line_width = np.mean(line_width_list)
702
+
703
+ return block
704
+
705
+ def get_layout_parsing_blocks(
706
+ self,
707
+ image: list,
708
+ region_block_ocr_idx_map: dict,
709
+ region_det_res: DetResult,
710
+ overall_ocr_res: OCRResult,
711
+ layout_det_res: DetResult,
712
+ table_res_list: list,
713
+ seal_res_list: list,
714
+ chart_res_list: list,
715
+ text_rec_model: Any,
716
+ text_rec_score_thresh: Union[float, None] = None,
717
+ ) -> list:
718
+ """
719
+ Extract structured information from OCR and layout detection results.
720
+
721
+ Args:
722
+ image (list): The input image.
723
+ overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
724
+ - "input_img": The image on which OCR was performed.
725
+ - "dt_boxes": A list of detected text box coordinates.
726
+ - "rec_texts": A list of recognized text corresponding to the detected boxes.
727
+
728
+ layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
729
+ - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
730
+
731
+ table_res_list (list): A list of table detection results, where each item is a dictionary containing:
732
+ - "block_bbox": The bounding box of the table layout.
733
+ - "pred_html": The predicted HTML representation of the table.
734
+
735
+ seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
736
+ text_rec_model (Any): A model for text recognition.
737
+ text_rec_score_thresh (Union[float, None]): The minimum score required for a recognized character to be considered valid. If None, use the default value specified during initialization. Default is None.
738
+
739
+ Returns:
740
+ list: A list of structured boxes where each item is a dictionary containing:
741
+ - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
742
+ - The label as a key with either table HTML or image data and text.
743
+ - "block_bbox": The coordinates of the layout box.
744
+ """
745
+
746
+ table_index = 0
747
+ seal_index = 0
748
+ chart_index = 0
749
+ layout_parsing_blocks: List[LayoutParsingBlock] = []
750
+
751
+ for box_idx, box_info in enumerate(layout_det_res["boxes"]):
752
+
753
+ label = box_info["label"]
754
+ block_bbox = box_info["coordinate"]
755
+ rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
756
+
757
+ block = LayoutParsingBlock(label=label, bbox=block_bbox)
758
+
759
+ if label == "table" and len(table_res_list) > 0:
760
+ block.content = table_res_list[table_index]["pred_html"]
761
+ table_index += 1
762
+ elif label == "seal" and len(seal_res_list) > 0:
763
+ block.content = "\n".join(seal_res_list[seal_index]["rec_texts"])
764
+ seal_index += 1
765
+ elif label == "chart" and len(chart_res_list) > 0:
766
+ block.content = chart_res_list[chart_index]
767
+ chart_index += 1
768
+ else:
769
+ if label == "formula":
770
+ _, ocr_idx_list = get_sub_regions_ocr_res(
771
+ overall_ocr_res, [block_bbox], return_match_idx=True
772
+ )
773
+ region_block_ocr_idx_map["block_to_ocr_map"][box_idx] = ocr_idx_list
774
+ else:
775
+ ocr_idx_list = region_block_ocr_idx_map["block_to_ocr_map"].get(
776
+ box_idx, []
777
+ )
778
+ for box_no in ocr_idx_list:
779
+ rec_res["boxes"].append(overall_ocr_res["rec_boxes"][box_no])
780
+ rec_res["rec_texts"].append(
781
+ overall_ocr_res["rec_texts"][box_no],
782
+ )
783
+ rec_res["rec_labels"].append(
784
+ overall_ocr_res["rec_labels"][box_no],
785
+ )
786
+ block = self.get_block_rec_content(
787
+ image=image,
788
+ block=block,
789
+ ocr_rec_res=rec_res,
790
+ text_rec_model=text_rec_model,
791
+ text_rec_score_thresh=text_rec_score_thresh,
792
+ )
793
+
794
+ if (
795
+ label
796
+ in ["seal", "table", "formula", "chart"]
797
+ + BLOCK_LABEL_MAP["image_labels"]
798
+ ):
799
+ x_min, y_min, x_max, y_max = list(map(int, block_bbox))
800
+ img_path = (
801
+ f"imgs/img_in_{block.label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
802
+ )
803
+ img = Image.fromarray(image[y_min:y_max, x_min:x_max, ::-1])
804
+ block.image = {"path": img_path, "img": img}
805
+
806
+ layout_parsing_blocks.append(block)
807
+
808
+ region_list: List[LayoutParsingRegion] = []
809
+ for region_idx, region_info in enumerate(region_det_res["boxes"]):
810
+ region_bbox = region_info["coordinate"]
811
+ region_blocks = [
812
+ layout_parsing_blocks[idx]
813
+ for idx in region_block_ocr_idx_map["region_to_block_map"][region_idx]
357
814
  ]
358
- overall_ocr_res["dt_polys"].append(poly_points)
359
- overall_ocr_res["rec_texts"].append(f"${formula_res['rec_formula']}$")
360
- overall_ocr_res["rec_boxes"] = np.vstack(
361
- (overall_ocr_res["rec_boxes"], [formula_res["dt_polys"]])
815
+ region = LayoutParsingRegion(
816
+ bbox=region_bbox,
817
+ blocks=region_blocks,
818
+ image_shape=image.shape[:2],
362
819
  )
363
- overall_ocr_res["rec_labels"].append("formula")
364
- overall_ocr_res["rec_polys"].append(poly_points)
365
- overall_ocr_res["rec_scores"].append(1)
820
+ region_list.append(region)
821
+
822
+ region_list = sorted(
823
+ region_list,
824
+ key=lambda r: (r.weighted_distance),
825
+ )
826
+
827
+ return region_list
366
828
 
367
- parsing_res_list = get_single_block_parsing_res(
368
- self.general_ocr_pipeline,
829
+ def get_layout_parsing_res(
830
+ self,
831
+ image: list,
832
+ region_det_res: DetResult,
833
+ layout_det_res: DetResult,
834
+ overall_ocr_res: OCRResult,
835
+ table_res_list: list,
836
+ seal_res_list: list,
837
+ chart_res_list: list,
838
+ formula_res_list: list,
839
+ text_rec_score_thresh: Union[float, None] = None,
840
+ ) -> list:
841
+ """
842
+ Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
843
+ Args:
844
+ image (list): The input image.
845
+ layout_det_res (DetResult): The detection result containing the layout information of the document.
846
+ overall_ocr_res (OCRResult): The overall OCR result containing text information.
847
+ table_res_list (list): A list of table recognition results.
848
+ seal_res_list (list): A list of seal recognition results.
849
+ formula_res_list (list): A list of formula recognition results.
850
+ text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
851
+ Returns:
852
+ list: A list of dictionaries representing the layout parsing result.
853
+ """
854
+
855
+ # Standardize data
856
+ region_block_ocr_idx_map, region_det_res, layout_det_res = (
857
+ self.standardized_data(
858
+ image=image,
859
+ region_det_res=region_det_res,
860
+ layout_det_res=layout_det_res,
861
+ overall_ocr_res=overall_ocr_res,
862
+ formula_res_list=formula_res_list,
863
+ text_rec_model=self.general_ocr_pipeline.text_rec_model,
864
+ text_rec_score_thresh=text_rec_score_thresh,
865
+ )
866
+ )
867
+
868
+ # Format layout parsing block
869
+ region_list = self.get_layout_parsing_blocks(
870
+ image=image,
871
+ region_block_ocr_idx_map=region_block_ocr_idx_map,
872
+ region_det_res=region_det_res,
369
873
  overall_ocr_res=overall_ocr_res,
370
874
  layout_det_res=layout_det_res,
371
875
  table_res_list=table_res_list,
372
876
  seal_res_list=seal_res_list,
877
+ chart_res_list=chart_res_list,
878
+ text_rec_model=self.general_ocr_pipeline.text_rec_model,
879
+ text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
373
880
  )
374
881
 
882
+ parsing_res_list = []
883
+ for region in region_list:
884
+ parsing_res_list.extend(region.sort())
885
+
886
+ index = 1
887
+ for block in parsing_res_list:
888
+ if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]:
889
+ block.order_index = index
890
+ index += 1
891
+
375
892
  return parsing_res_list
376
893
 
377
894
  def get_model_settings(
378
895
  self,
379
896
  use_doc_orientation_classify: Union[bool, None],
380
897
  use_doc_unwarping: Union[bool, None],
381
- use_general_ocr: Union[bool, None],
382
898
  use_seal_recognition: Union[bool, None],
383
899
  use_table_recognition: Union[bool, None],
384
900
  use_formula_recognition: Union[bool, None],
901
+ use_chart_recognition: Union[bool, None],
902
+ use_region_detection: Union[bool, None],
385
903
  ) -> dict:
386
904
  """
387
905
  Get the model settings based on the provided parameters or default values.
@@ -389,7 +907,6 @@ class LayoutParsingPipelineV2(BasePipeline):
389
907
  Args:
390
908
  use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
391
909
  use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
392
- use_general_ocr (Union[bool, None]): Enables general OCR if True. Defaults to system setting if None.
393
910
  use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
394
911
  use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
395
912
  use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
@@ -406,9 +923,6 @@ class LayoutParsingPipelineV2(BasePipeline):
406
923
  else:
407
924
  use_doc_preprocessor = False
408
925
 
409
- if use_general_ocr is None:
410
- use_general_ocr = self.use_general_ocr
411
-
412
926
  if use_seal_recognition is None:
413
927
  use_seal_recognition = self.use_seal_recognition
414
928
 
@@ -418,24 +932,32 @@ class LayoutParsingPipelineV2(BasePipeline):
418
932
  if use_formula_recognition is None:
419
933
  use_formula_recognition = self.use_formula_recognition
420
934
 
935
+ if use_region_detection is None:
936
+ use_region_detection = self.use_region_detection
937
+
938
+ if use_chart_recognition is None:
939
+ use_chart_recognition = self.use_chart_recognition
940
+
421
941
  return dict(
422
942
  use_doc_preprocessor=use_doc_preprocessor,
423
- use_general_ocr=use_general_ocr,
424
943
              use_seal_recognition=use_seal_recognition,
              use_table_recognition=use_table_recognition,
              use_formula_recognition=use_formula_recognition,
+             use_chart_recognition=use_chart_recognition,
+             use_region_detection=use_region_detection,
          )

      def predict(
          self,
          input: Union[str, list[str], np.ndarray, list[np.ndarray]],
-         use_doc_orientation_classify: Union[bool, None] = None,
-         use_doc_unwarping: Union[bool, None] = None,
+         use_doc_orientation_classify: Union[bool, None] = False,
+         use_doc_unwarping: Union[bool, None] = False,
          use_textline_orientation: Optional[bool] = None,
-         use_general_ocr: Union[bool, None] = None,
          use_seal_recognition: Union[bool, None] = None,
          use_table_recognition: Union[bool, None] = None,
          use_formula_recognition: Union[bool, None] = None,
+         use_chart_recognition: Union[bool, None] = False,
+         use_region_detection: Union[bool, None] = None,
          layout_threshold: Optional[Union[float, dict]] = None,
          layout_nms: Optional[bool] = None,
          layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
@@ -452,7 +974,10 @@ class LayoutParsingPipelineV2(BasePipeline):
          seal_det_box_thresh: Union[float, None] = None,
          seal_det_unclip_ratio: Union[float, None] = None,
          seal_rec_score_thresh: Union[float, None] = None,
-         use_table_cells_ocr_results: bool = False,
+         use_wired_table_cells_trans_to_html: bool = False,
+         use_wireless_table_cells_trans_to_html: bool = False,
+         use_table_orientation_classify: bool = True,
+         use_ocr_results_with_table_cells: bool = True,
          use_e2e_wired_table_rec_model: bool = False,
          use_e2e_wireless_table_rec_model: bool = True,
          **kwargs,
@@ -464,10 +989,10 @@ class LayoutParsingPipelineV2(BasePipeline):
              use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
              use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
              use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-             use_general_ocr (Optional[bool]): Whether to use general OCR.
              use_seal_recognition (Optional[bool]): Whether to use seal recognition.
              use_table_recognition (Optional[bool]): Whether to use table recognition.
              use_formula_recognition (Optional[bool]): Whether to use formula recognition.
+             use_region_detection (Optional[bool]): Whether to use region detection.
              layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
              layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
              layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -488,7 +1013,10 @@ class LayoutParsingPipelineV2(BasePipeline):
              seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
              seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
              seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
-             use_table_cells_ocr_results (bool): Whether to use OCR results with cells.
+             use_wired_table_cells_trans_to_html (bool): Whether to convert wired table cells directly to HTML.
+             use_wireless_table_cells_trans_to_html (bool): Whether to convert wireless table cells directly to HTML.
+             use_table_orientation_classify (bool): Whether to use table orientation classification.
+             use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells.
              use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
              use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
              **kwargs (Any): Additional settings to extend functionality.
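The signature changes above are the user-facing part of this diff: `use_general_ocr` is gone, document orientation classification and unwarping now default to `False`, and chart recognition, region detection, and four finer-grained table controls are new (`use_ocr_results_with_table_cells` appears to be the renamed successor of the removed `use_table_cells_ocr_results`). A minimal usage sketch; flag values are illustrative, not recommendations:

```python
# Sketch of calling the revised predict() signature (values illustrative).
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="PP-StructureV3")

output = pipeline.predict(
    "document.png",                       # path, list of paths, or ndarray(s)
    use_doc_orientation_classify=False,   # now defaults to False
    use_doc_unwarping=False,              # now defaults to False
    use_chart_recognition=False,          # new flag in this release
    use_region_detection=True,            # new flag in this release
    use_table_orientation_classify=True,  # new table control
)
for res in output:
    res.print()
```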
@@ -500,150 +1028,204 @@ class LayoutParsingPipelineV2(BasePipeline):
          model_settings = self.get_model_settings(
              use_doc_orientation_classify,
              use_doc_unwarping,
-             use_general_ocr,
              use_seal_recognition,
              use_table_recognition,
              use_formula_recognition,
+             use_chart_recognition,
+             use_region_detection,
          )

          if not self.check_model_settings_valid(model_settings):
              yield {"error": "the input params for model settings are invalid!"}

          for batch_data in self.batch_sampler(input):
-             image_array = self.img_reader(batch_data.instances)[0]
+             image_arrays = self.img_reader(batch_data.instances)

              if model_settings["use_doc_preprocessor"]:
-                 doc_preprocessor_res = next(
+                 doc_preprocessor_results = list(
                      self.doc_preprocessor_pipeline(
-                         image_array,
+                         image_arrays,
                          use_doc_orientation_classify=use_doc_orientation_classify,
                          use_doc_unwarping=use_doc_unwarping,
-                     ),
+                     )
                  )
              else:
-                 doc_preprocessor_res = {"output_img": image_array}
+                 doc_preprocessor_results = [{"output_img": arr} for arr in image_arrays]

-             doc_preprocessor_image = doc_preprocessor_res["output_img"]
+             doc_preprocessor_images = [
+                 item["output_img"] for item in doc_preprocessor_results
+             ]

-             layout_det_res = next(
+             layout_det_results = list(
                  self.layout_det_model(
-                     doc_preprocessor_image,
+                     doc_preprocessor_images,
                      threshold=layout_threshold,
                      layout_nms=layout_nms,
                      layout_unclip_ratio=layout_unclip_ratio,
                      layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                  )
              )
-             imgs_in_doc = gather_imgs(doc_preprocessor_image, layout_det_res["boxes"])
+             imgs_in_doc = [
+                 gather_imgs(img, res["boxes"])
+                 for img, res in zip(doc_preprocessor_images, layout_det_results)
+             ]
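The refactor above switches the loop body from one page at a time (`next(...)` pulling a single result from a generator) to whole batches (`list(...)` draining one result per page). A self-contained sketch of the same consumption pattern, with a hypothetical stand-in generator in place of the real PaddleX components:

```python
# Single-image vs. batched consumption of a generator-based predictor.
from typing import Iterable, Iterator

def fake_model(images: Iterable[str]) -> Iterator[dict]:
    """Stand-in for a PaddleX component: yields one result per input image."""
    for img in images:
        yield {"input": img, "boxes": []}

# Old style: one image in, next() pulls exactly one result.
single_res = next(fake_model(["page_1.png"]))

# New style: the whole batch goes in, list() drains one result per page.
batch_results = list(fake_model(["page_1.png", "page_2.png", "page_3.png"]))
assert len(batch_results) == 3
```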
+
+             if model_settings["use_region_detection"]:
+                 region_det_results = list(
+                     self.region_detection_model(
+                         doc_preprocessor_images,
+                         layout_nms=True,
+                         layout_merge_bboxes_mode="small",
+                     ),
+                 )
+             else:
+                 region_det_results = [{"boxes": []} for _ in doc_preprocessor_images]

              if model_settings["use_formula_recognition"]:
-                 formula_res_all = next(
+                 formula_res_all = list(
                      self.formula_recognition_pipeline(
-                         doc_preprocessor_image,
+                         doc_preprocessor_images,
                          use_layout_detection=False,
                          use_doc_orientation_classify=False,
                          use_doc_unwarping=False,
-                         layout_det_res=layout_det_res,
+                         layout_det_res=layout_det_results,
                      ),
                  )
-                 formula_res_list = formula_res_all["formula_res_list"]
+                 formula_res_lists = [
+                     item["formula_res_list"] for item in formula_res_all
+                 ]
              else:
-                 formula_res_list = []
-
-             for formula_res in formula_res_list:
-                 x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                 doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
+                 formula_res_lists = [[] for _ in doc_preprocessor_images]

-             if (
-                 model_settings["use_general_ocr"]
-                 or model_settings["use_table_recognition"]
+             for doc_preprocessor_image, formula_res_list in zip(
+                 doc_preprocessor_images, formula_res_lists
              ):
-                 overall_ocr_res = next(
-                     self.general_ocr_pipeline(
-                         doc_preprocessor_image,
-                         use_textline_orientation=use_textline_orientation,
-                         text_det_limit_side_len=text_det_limit_side_len,
-                         text_det_limit_type=text_det_limit_type,
-                         text_det_thresh=text_det_thresh,
-                         text_det_box_thresh=text_det_box_thresh,
-                         text_det_unclip_ratio=text_det_unclip_ratio,
-                         text_rec_score_thresh=text_rec_score_thresh,
-                     ),
-                 )
-             else:
-                 overall_ocr_res = {}
+                 for formula_res in formula_res_list:
+                     x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                     doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
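The masking runs before OCR: each recognized formula box is painted white so the general OCR pass does not re-read formula regions as body text (the original pixels are written back later in the diff, after parsing). A small numpy sketch of the masking step, with an illustrative box:

```python
# Sketch of the formula-masking step: paint a formula box white so the
# downstream OCR pass skips LaTeX regions. Box coordinates are illustrative.
import numpy as np

image = np.random.randint(0, 255, size=(600, 400, 3), dtype=np.uint8)
formula_box = (40, 60, 360, 120)  # hypothetical (x_min, y_min, x_max, y_max)

x_min, y_min, x_max, y_max = map(int, formula_box)
image[y_min:y_max, x_min:x_max, :] = 255  # whited-out region, as in the diff
```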
+
+             overall_ocr_results = list(
+                 self.general_ocr_pipeline(
+                     doc_preprocessor_images,
+                     use_textline_orientation=use_textline_orientation,
+                     text_det_limit_side_len=text_det_limit_side_len,
+                     text_det_limit_type=text_det_limit_type,
+                     text_det_thresh=text_det_thresh,
+                     text_det_box_thresh=text_det_box_thresh,
+                     text_det_unclip_ratio=text_det_unclip_ratio,
+                     text_rec_score_thresh=text_rec_score_thresh,
+                 ),
+             )

-             overall_ocr_res["rec_labels"] = ["text"] * len(overall_ocr_res["rec_texts"])
+             for overall_ocr_res in overall_ocr_results:
+                 overall_ocr_res["rec_labels"] = ["text"] * len(
+                     overall_ocr_res["rec_texts"]
+                 )

              if model_settings["use_table_recognition"]:
-                 table_contents = copy.deepcopy(overall_ocr_res)
-                 for formula_res in formula_res_list:
-                     x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                     poly_points = [
-                         (x_min, y_min),
-                         (x_max, y_min),
-                         (x_max, y_max),
-                         (x_min, y_max),
-                     ]
-                     table_contents["dt_polys"].append(poly_points)
-                     table_contents["rec_texts"].append(
-                         f"${formula_res['rec_formula']}$"
-                     )
-                     table_contents["rec_boxes"] = np.vstack(
-                         (table_contents["rec_boxes"], [formula_res["dt_polys"]])
+                 table_res_lists = []
+                 for (
+                     layout_det_res,
+                     doc_preprocessor_image,
+                     overall_ocr_res,
+                     formula_res_list,
+                     imgs_in_doc_for_img,
+                 ) in zip(
+                     layout_det_results,
+                     doc_preprocessor_images,
+                     overall_ocr_results,
+                     formula_res_lists,
+                     imgs_in_doc,
+                 ):
+                     table_contents_for_img = copy.deepcopy(overall_ocr_res)
+                     for formula_res in formula_res_list:
+                         x_min, y_min, x_max, y_max = list(
+                             map(int, formula_res["dt_polys"])
+                         )
+                         poly_points = [
+                             (x_min, y_min),
+                             (x_max, y_min),
+                             (x_max, y_max),
+                             (x_min, y_max),
+                         ]
+                         table_contents_for_img["dt_polys"].append(poly_points)
+                         rec_formula = formula_res["rec_formula"]
+                         if not rec_formula.startswith("$") or not rec_formula.endswith(
+                             "$"
+                         ):
+                             rec_formula = f"${rec_formula}$"
+                         table_contents_for_img["rec_texts"].append(f"{rec_formula}")
+                         if table_contents_for_img["rec_boxes"].size == 0:
+                             table_contents_for_img["rec_boxes"] = np.array(
+                                 [formula_res["dt_polys"]]
+                             )
+                         else:
+                             table_contents_for_img["rec_boxes"] = np.vstack(
+                                 (
+                                     table_contents_for_img["rec_boxes"],
+                                     [formula_res["dt_polys"]],
+                                 )
+                             )
+                         table_contents_for_img["rec_polys"].append(poly_points)
+                         table_contents_for_img["rec_scores"].append(1)
+
+                     for img in imgs_in_doc_for_img:
+                         img_path = img["path"]
+                         x_min, y_min, x_max, y_max = img["coordinate"]
+                         poly_points = [
+                             (x_min, y_min),
+                             (x_max, y_min),
+                             (x_max, y_max),
+                             (x_min, y_max),
+                         ]
+                         table_contents_for_img["dt_polys"].append(poly_points)
+                         table_contents_for_img["rec_texts"].append(
+                             f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
+                         )
+                         if table_contents_for_img["rec_boxes"].size == 0:
+                             table_contents_for_img["rec_boxes"] = np.array(
+                                 [img["coordinate"]]
+                             )
+                         else:
+                             table_contents_for_img["rec_boxes"] = np.vstack(
+                                 (table_contents_for_img["rec_boxes"], img["coordinate"])
+                             )
+                         table_contents_for_img["rec_polys"].append(poly_points)
+                         table_contents_for_img["rec_scores"].append(img["score"])
+
+                     table_res_all = list(
+                         self.table_recognition_pipeline(
+                             doc_preprocessor_image,
+                             use_doc_orientation_classify=False,
+                             use_doc_unwarping=False,
+                             use_layout_detection=False,
+                             use_ocr_model=False,
+                             overall_ocr_res=table_contents_for_img,
+                             layout_det_res=layout_det_res,
+                             cell_sort_by_y_projection=True,
+                             use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
+                             use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
+                             use_table_orientation_classify=use_table_orientation_classify,
+                             use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
+                             use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
+                             use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
+                         ),
                      )
-                     table_contents["rec_polys"].append(poly_points)
-                     table_contents["rec_scores"].append(1)
-
-                 for img in imgs_in_doc:
-                     img_path = img["path"]
-                     x_min, y_min, x_max, y_max = img["coordinate"]
-                     poly_points = [
-                         (x_min, y_min),
-                         (x_max, y_min),
-                         (x_max, y_max),
-                         (x_min, y_max),
+                     single_table_res_lists = [
+                         item["table_res_list"] for item in table_res_all
                      ]
-                     table_contents["dt_polys"].append(poly_points)
-                     table_contents["rec_texts"].append(
-                         f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
-                     )
-                     if table_contents["rec_boxes"].size == 0:
-                         table_contents["rec_boxes"] = np.array([img["coordinate"]])
-                     else:
-                         table_contents["rec_boxes"] = np.vstack(
-                             (table_contents["rec_boxes"], img["coordinate"])
-                         )
-                     table_contents["rec_polys"].append(poly_points)
-                     table_contents["rec_scores"].append(img["score"])
-
-                 table_res_all = next(
-                     self.table_recognition_pipeline(
-                         doc_preprocessor_image,
-                         use_doc_orientation_classify=False,
-                         use_doc_unwarping=False,
-                         use_layout_detection=False,
-                         use_ocr_model=False,
-                         overall_ocr_res=table_contents,
-                         layout_det_res=layout_det_res,
-                         cell_sort_by_y_projection=True,
-                         use_table_cells_ocr_results=use_table_cells_ocr_results,
-                         use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
-                         use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
-                     ),
-                 )
-                 table_res_list = table_res_all["table_res_list"]
+                     table_res_lists.extend(single_table_res_lists)
              else:
-                 table_res_list = []
+                 table_res_lists = [[] for _ in doc_preprocessor_images]

              if model_settings["use_seal_recognition"]:
-                 seal_res_all = next(
+                 seal_res_all = list(
                      self.seal_recognition_pipeline(
-                         doc_preprocessor_image,
+                         doc_preprocessor_images,
                          use_doc_orientation_classify=False,
                          use_doc_unwarping=False,
                          use_layout_detection=False,
-                         layout_det_res=layout_det_res,
+                         layout_det_res=layout_det_results,
                          seal_det_limit_side_len=seal_det_limit_side_len,
                          seal_det_limit_type=seal_det_limit_type,
                          seal_det_thresh=seal_det_thresh,
@@ -652,46 +1234,85 @@ class LayoutParsingPipelineV2(BasePipeline):
                          seal_rec_score_thresh=seal_rec_score_thresh,
                      ),
                  )
-                 seal_res_list = seal_res_all["seal_res_list"]
+                 seal_res_lists = [item["seal_res_list"] for item in seal_res_all]
              else:
-                 seal_res_list = []
+                 seal_res_lists = [[] for _ in doc_preprocessor_images]

-             parsing_res_list = self.get_layout_parsing_res(
+             for (
+                 input_path,
+                 page_index,
                  doc_preprocessor_image,
-                 layout_det_res=layout_det_res,
-                 overall_ocr_res=overall_ocr_res,
-                 table_res_list=table_res_list,
-                 seal_res_list=seal_res_list,
-                 formula_res_list=formula_res_list,
-                 imgs_in_doc=imgs_in_doc,
-                 text_det_limit_side_len=text_det_limit_side_len,
-                 text_det_limit_type=text_det_limit_type,
-                 text_det_thresh=text_det_thresh,
-                 text_det_box_thresh=text_det_box_thresh,
-                 text_det_unclip_ratio=text_det_unclip_ratio,
-                 text_rec_score_thresh=text_rec_score_thresh,
-             )
+                 doc_preprocessor_res,
+                 layout_det_res,
+                 region_det_res,
+                 overall_ocr_res,
+                 table_res_list,
+                 seal_res_list,
+                 formula_res_list,
+                 imgs_in_doc_for_img,
+             ) in zip(
+                 batch_data.input_paths,
+                 batch_data.page_indexes,
+                 doc_preprocessor_images,
+                 doc_preprocessor_results,
+                 layout_det_results,
+                 region_det_results,
+                 overall_ocr_results,
+                 table_res_lists,
+                 seal_res_lists,
+                 formula_res_lists,
+                 imgs_in_doc,
+             ):
+                 chart_res_list = []
+                 if model_settings["use_chart_recognition"]:
+                     chart_imgs_list = []
+                     for bbox in layout_det_res["boxes"]:
+                         if bbox["label"] == "chart":
+                             x_min, y_min, x_max, y_max = bbox["coordinate"]
+                             chart_img = doc_preprocessor_image[
+                                 int(y_min) : int(y_max), int(x_min) : int(x_max), :
+                             ]
+                             chart_imgs_list.append({"image": chart_img})
+
+                     for chart_res_batch in self.chart_recognition_model(
+                         input=chart_imgs_list
+                     ):
+                         chart_res_list.append(chart_res_batch["result"])
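Chart handling is new in this release: layout boxes labeled `chart` are cropped from the preprocessed page and fed to the chart recognition model (presumably PP-Chart2Table, whose config is added elsewhere in this diff). A sketch of the cropping step, with illustrative boxes:

```python
# Sketch of the chart-cropping step: cut out each layout box labeled "chart".
import numpy as np

page = np.zeros((800, 600, 3), dtype=np.uint8)
layout_boxes = [
    {"label": "text", "coordinate": (0, 0, 600, 100)},
    {"label": "chart", "coordinate": (50.3, 120.8, 550.1, 480.6)},  # illustrative
]

chart_crops = []
for box in layout_boxes:
    if box["label"] == "chart":
        x_min, y_min, x_max, y_max = box["coordinate"]
        # Detector coordinates are floats; numpy slicing needs ints.
        chart_crops.append(page[int(y_min):int(y_max), int(x_min):int(x_max), :])
```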
+
+                 parsing_res_list = self.get_layout_parsing_res(
+                     doc_preprocessor_image,
+                     region_det_res=region_det_res,
+                     layout_det_res=layout_det_res,
+                     overall_ocr_res=overall_ocr_res,
+                     table_res_list=table_res_list,
+                     seal_res_list=seal_res_list,
+                     chart_res_list=chart_res_list,
+                     formula_res_list=formula_res_list,
+                     text_rec_score_thresh=text_rec_score_thresh,
+                 )

-             for formula_res in formula_res_list:
-                 x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                 doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
-                     "input_img"
-                 ]
+                 for formula_res in formula_res_list:
+                     x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                     doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
+                         "input_img"
+                     ]

-             single_img_res = {
-                 "input_path": batch_data.input_paths[0],
-                 "page_index": batch_data.page_indexes[0],
-                 "doc_preprocessor_res": doc_preprocessor_res,
-                 "layout_det_res": layout_det_res,
-                 "overall_ocr_res": overall_ocr_res,
-                 "table_res_list": table_res_list,
-                 "seal_res_list": seal_res_list,
-                 "formula_res_list": formula_res_list,
-                 "parsing_res_list": parsing_res_list,
-                 "imgs_in_doc": imgs_in_doc,
-                 "model_settings": model_settings,
-             }
-             yield LayoutParsingResultV2(single_img_res)
+                 single_img_res = {
+                     "input_path": input_path,
+                     "page_index": page_index,
+                     "doc_preprocessor_res": doc_preprocessor_res,
+                     "layout_det_res": layout_det_res,
+                     "region_det_res": region_det_res,
+                     "overall_ocr_res": overall_ocr_res,
+                     "table_res_list": table_res_list,
+                     "seal_res_list": seal_res_list,
+                     "chart_res_list": chart_res_list,
+                     "formula_res_list": formula_res_list,
+                     "parsing_res_list": parsing_res_list,
+                     "imgs_in_doc": imgs_in_doc_for_img,
+                     "model_settings": model_settings,
+                 }
+                 yield LayoutParsingResultV2(single_img_res)
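Per-page assembly now zips every intermediate list and yields one `LayoutParsingResultV2` per page, so multi-page PDFs stream results page by page instead of one result per batch. A consumption sketch; the `save_to_*` helpers are assumed from the standard PaddleX result mixins:

```python
# Sketch: one result object per page; keys mirror single_img_res above.
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="PP-StructureV3")

for res in pipeline.predict("report.pdf"):
    print(res["page_index"], len(res["parsing_res_list"]))
    res.save_to_json(save_path="output/")      # per-page structured dump
    res.save_to_markdown(save_path="output/")  # per-page markdown rendering
```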

      def concatenate_markdown_pages(self, markdown_list: list) -> tuple:
          """
@@ -747,3 +1368,15 @@ class LayoutParsingPipelineV2(BasePipeline):
          )

          return markdown_texts
+
+
+ @pipeline_requires_extra("ocr")
+ class LayoutParsingPipelineV2(AutoParallelImageSimpleInferencePipeline):
+     entities = ["PP-StructureV3"]
+
+     @property
+     def _pipeline_cls(self):
+         return _LayoutParsingPipelineV2
+
+     def _get_batch_size(self, config):
+         return config.get("batch_size", 1)
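The trailing addition re-exposes the public name as a thin wrapper around the renamed implementation class `_LayoutParsingPipelineV2`, so the `PP-StructureV3` entity gains auto-parallel batch inference without touching the pipeline logic. Reduced to a generic sketch; the base class here is a stand-in for `AutoParallelImageSimpleInferencePipeline`, whose real scheduling is not shown in this diff:

```python
# Generic delegation sketch: the wrapper contributes only the inner class and
# the batch size; a (stand-in) base class owns batching and scheduling.
class _InnerPipeline:
    def predict(self, inputs):
        for item in inputs:
            yield {"input": item}

class ParallelBase:
    """Stand-in for the auto-parallel base; delegates directly here."""
    def __init__(self, config: dict):
        self._pipeline = self._pipeline_cls()
        self._batch_size = self._get_batch_size(config)

    def predict(self, inputs):
        # A real base would shard inputs into batches of _batch_size
        # and fan them out to workers.
        yield from self._pipeline.predict(inputs)

class PublicPipeline(ParallelBase):
    entities = ["example"]

    @property
    def _pipeline_cls(self):
        return _InnerPipeline

    def _get_batch_size(self, config):
        return config.get("batch_size", 1)

pipe = PublicPipeline({"batch_size": 2})
assert [r["input"] for r in pipe.predict(["a.png", "b.png"])] == ["a.png", "b.png"]
```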