paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +11 -59
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +7 -1
- paddlex/inference/models/formula_recognition/processors.py +92 -79
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +19 -16
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +8 -1
- paddlex/inference/utils/hpi_model_info_collection.json +81 -2
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/official_models.py +14 -0
- paddlex/inference/utils/pp_option.py +29 -8
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +2 -2
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +1 -1
- paddlex/utils/device.py +15 -8
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +2 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
@@ -15,9 +15,10 @@ from __future__ import annotations
 
 import copy
 import re
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+from PIL import Image
 
 from ....utils import logging
 from ....utils.deps import pipeline_requires_extra
@@ -26,18 +27,31 @@ from ...common.reader import ReadImage
 from ...models.object_detection.result import DetResult
 from ...utils.hpi import HPIConfig
 from ...utils.pp_option import PaddlePredictorOption
+from .._parallel import AutoParallelImageSimpleInferencePipeline
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
-from .result_v2 import LayoutParsingResultV2
-from .
-
-
-
-class LayoutParsingPipelineV2(BasePipeline):
+from .result_v2 import LayoutParsingBlock, LayoutParsingRegion, LayoutParsingResultV2
+from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, LINE_SETTINGS, REGION_SETTINGS
+from .utils import (
+    caculate_bbox_area,
+    calculate_minimum_enclosing_bbox,
+    calculate_overlap_ratio,
+    convert_formula_res_to_ocr_format,
+    format_line,
+    gather_imgs,
+    get_bbox_intersection,
+    get_sub_regions_ocr_res,
+    group_boxes_into_lines,
+    remove_overlap_blocks,
+    shrink_supplement_region_bbox,
+    split_boxes_by_projection,
+    update_region_box,
+)
+
+
+class _LayoutParsingPipelineV2(BasePipeline):
     """Layout Parsing Pipeline V2"""
 
-    entities = ["PP-StructureV3"]
-
     def __init__(
         self,
         config: dict,
@@ -53,9 +67,9 @@ class LayoutParsingPipelineV2(BasePipeline):
             device (str, optional): Device to run the predictions on. Defaults to None.
             pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
             use_hpip (bool, optional): Whether to use the high-performance
-                inference plugin (HPIP). Defaults to False.
+                inference plugin (HPIP) by default. Defaults to False.
             hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
-                The high-performance inference configuration dictionary.
+                The default high-performance inference configuration dictionary.
                 Defaults to None.
         """
 
@@ -68,8 +82,7 @@ class LayoutParsingPipelineV2(BasePipeline):
 
         self.inintial_predictor(config)
 
-        self.batch_sampler = ImageBatchSampler(batch_size=1)
-
+        self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
         self.img_reader = ReadImage(format="BGR")
 
     def inintial_predictor(self, config: dict) -> None:
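Note on the batch sampler change above: the V2 pipeline now reads its batch size from the pipeline config instead of hard-coding 1. A minimal Python sketch of that lookup; only the "batch_size" key is taken from the diff, the surrounding config content is illustrative:

    # Hypothetical pipeline config dict; only "batch_size" is grounded in the diff.
    config = {"pipeline_name": "PP-StructureV3", "batch_size": 4}
    batch_size = config.get("batch_size", 1)  # falls back to 1 when the key is absent
    print(batch_size)  # 4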
@@ -83,13 +96,20 @@ class LayoutParsingPipelineV2(BasePipeline):
         """
 
         self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
-        self.use_general_ocr = config.get("use_general_ocr", True)
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
+        self.use_region_detection = config.get(
+            "use_region_detection",
+            True,
+        )
         self.use_formula_recognition = config.get(
             "use_formula_recognition",
             True,
         )
+        self.use_chart_recognition = config.get(
+            "use_chart_recognition",
+            False,
+        )
 
         if self.use_doc_preprocessor:
             doc_preprocessor_config = config.get("SubPipelines", {}).get(
@@ -101,6 +121,16 @@ class LayoutParsingPipelineV2(BasePipeline):
             self.doc_preprocessor_pipeline = self.create_pipeline(
                 doc_preprocessor_config,
             )
+        if self.use_region_detection:
+            region_detection_config = config.get("SubModules", {}).get(
+                "RegionDetection",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.region_detection_model = self.create_model(
+                region_detection_config,
+            )
 
         layout_det_config = config.get("SubModules", {}).get(
             "LayoutDetection",
@@ -123,14 +153,13 @@ class LayoutParsingPipelineV2(BasePipeline):
             layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
         self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
 
-
-
-
-
-
-
-
-            )
+        general_ocr_config = config.get("SubPipelines", {}).get(
+            "GeneralOCR",
+            {"pipeline_config_error": "config error for general_ocr_pipeline!"},
+        )
+        self.general_ocr_pipeline = self.create_pipeline(
+            general_ocr_config,
+        )
 
         if self.use_seal_recognition:
             seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -165,6 +194,17 @@ class LayoutParsingPipelineV2(BasePipeline):
                 formula_recognition_config,
             )
 
+        if self.use_chart_recognition:
+            chart_recognition_config = config.get("SubModules", {}).get(
+                "ChartRecognition",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.chart_recognition_model = self.create_model(
+                chart_recognition_config,
+            )
+
         return
 
     def get_text_paragraphs_ocr_res(
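The two new optional sub-models above (RegionDetection, ChartRecognition) are looked up under the pipeline config's SubModules section, next to the existing LayoutDetection entry. A hedged sketch of a config dict that would satisfy those lookups; the key names come from the hunks, while the model names and the "model_name" field are assumptions based on the new PP-DocBlockLayout and PP-Chart2Table module configs listed at the top of this diff:

    # Assumed config shape; only the SubModules/RegionDetection/ChartRecognition
    # key names and the use_* switches are taken from the hunks above.
    config = {
        "use_region_detection": True,
        "use_chart_recognition": False,
        "SubModules": {
            "LayoutDetection": {"model_name": "PP-DocLayout_plus-L"},   # existing entry
            "RegionDetection": {"model_name": "PP-DocBlockLayout"},     # assumed pairing
            "ChartRecognition": {"model_name": "PP-Chart2Table"},       # assumed pairing
        },
    }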
@@ -209,12 +249,6 @@ class LayoutParsingPipelineV2(BasePipeline):
             )
             return False
 
-        if input_params["use_general_ocr"] and not self.use_general_ocr:
-            logging.error(
-                "Set use_general_ocr, but the models for general OCR are not initialized.",
-            )
-            return False
-
         if input_params["use_seal_recognition"] and not self.use_seal_recognition:
             logging.error(
                 "Set use_seal_recognition, but the models for seal recognition are not initialized.",
@@ -229,159 +263,643 @@ class LayoutParsingPipelineV2(BasePipeline):
 
         return True
 
-    def
+    def standardized_data(
         self,
         image: list,
+        region_det_res: DetResult,
         layout_det_res: DetResult,
         overall_ocr_res: OCRResult,
-        table_res_list: list,
-        seal_res_list: list,
         formula_res_list: list,
-
-
-        text_det_limit_type: Optional[str] = None,
-        text_det_thresh: Optional[float] = None,
-        text_det_box_thresh: Optional[float] = None,
-        text_det_unclip_ratio: Optional[float] = None,
-        text_rec_score_thresh: Optional[float] = None,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
     ) -> list:
         """
         Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
         Args:
             image (list): The input image.
-
-
-
-
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
             formula_res_list (list): A list of formula recognition results.
-
-            text_det_limit_type (Optional[str], optional): The type of limit for the text detection region. Defaults to None.
-            text_det_thresh (Optional[float], optional): The confidence threshold for text detection. Defaults to None.
-            text_det_box_thresh (Optional[float], optional): The confidence threshold for text detection bounding boxes. Defaults to None
-            text_det_unclip_ratio (Optional[float], optional): The unclip ratio for text detection. Defaults to None.
+            text_rec_model (Any): The text recognition model.
             text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
         Returns:
             list: A list of dictionaries representing the layout parsing result.
         """
+
         matched_ocr_dict = {}
-
+        region_to_block_map = {}
+        block_to_ocr_map = {}
         object_boxes = []
         footnote_list = []
-
+        paragraph_title_list = []
+        bottom_text_y_max = 0
+        max_block_area = 0.0
+        doc_title_num = 0
+
+        base_region_bbox = [65535, 65535, 0, 0]
+        layout_det_res = remove_overlap_blocks(
+            layout_det_res,
+            threshold=0.5,
+            smaller=True,
+        )
 
-
+        # convert formula_res_list to OCRResult format
+        convert_formula_res_to_ocr_format(formula_res_list, overall_ocr_res)
+
+        # match layout boxes and ocr boxes and get some information for layout_order_config
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
            box = box_info["coordinate"]
            label = box_info["label"].lower()
            object_boxes.append(box)
+            _, _, _, y2 = box
+
+            # update the region box and max_block_area according to the layout boxes
+            base_region_bbox = update_region_box(box, base_region_bbox)
+            max_block_area = max(max_block_area, caculate_bbox_area(box))
+
+            # update_layout_order_config_block_index(layout_order_config, label, box_idx)
 
             # set the label of footnote to text, when it is above the text boxes
             if label == "footnote":
-                footnote_list.append(
-
-
+                footnote_list.append(box_idx)
+            elif label == "paragraph_title":
+                paragraph_title_list.append(box_idx)
+            if label == "text":
+                bottom_text_y_max = max(y2, bottom_text_y_max)
+            if label == "doc_title":
+                doc_title_num += 1
 
             if label not in ["formula", "table", "seal"]:
-                _,
+                _, matched_idxes = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
                 )
-
+                block_to_ocr_map[box_idx] = matched_idxes
+                for matched_idx in matched_idxes:
                     if matched_ocr_dict.get(matched_idx, None) is None:
-                        matched_ocr_dict[matched_idx] = [
+                        matched_ocr_dict[matched_idx] = [box_idx]
                     else:
-                        matched_ocr_dict[matched_idx].append(
+                        matched_ocr_dict[matched_idx].append(box_idx)
 
+        # fix the footnote label
         for footnote_idx in footnote_list:
             if (
                 layout_det_res["boxes"][footnote_idx]["coordinate"][3]
-                <
+                < bottom_text_y_max
             ):
                 layout_det_res["boxes"][footnote_idx]["label"] = "text"
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # check if there is only one paragraph title and without doc_title
+        only_one_paragraph_title = len(paragraph_title_list) == 1 and doc_title_num == 0
+        if only_one_paragraph_title:
+            paragraph_title_block_area = caculate_bbox_area(
+                layout_det_res["boxes"][paragraph_title_list[0]]["coordinate"]
+            )
+            title_area_max_block_threshold = BLOCK_SETTINGS.get(
+                "title_conversion_area_ratio_threshold", 0.3
+            )
+            if (
+                paragraph_title_block_area
+                > max_block_area * title_area_max_block_threshold
+            ):
+                layout_det_res["boxes"][paragraph_title_list[0]]["label"] = "doc_title"
+
+        # Replace the OCR information of the hurdles.
+        for overall_ocr_idx, layout_box_ids in matched_ocr_dict.items():
+            if len(layout_box_ids) > 1:
+                matched_no = 0
+                overall_ocr_box = copy.deepcopy(
+                    overall_ocr_res["rec_boxes"][overall_ocr_idx]
+                )
+                overall_ocr_dt_poly = copy.deepcopy(
+                    overall_ocr_res["dt_polys"][overall_ocr_idx]
+                )
+                for box_idx in layout_box_ids:
+                    layout_box = layout_det_res["boxes"][box_idx]["coordinate"]
+                    crop_box = get_bbox_intersection(overall_ocr_box, layout_box)
+                    for ocr_idx in block_to_ocr_map[box_idx]:
+                        ocr_box = overall_ocr_res["rec_boxes"][ocr_idx]
+                        iou = calculate_overlap_ratio(ocr_box, crop_box, "small")
+                        if iou > 0.8:
+                            overall_ocr_res["rec_texts"][ocr_idx] = ""
+                    x1, y1, x2, y2 = [int(i) for i in crop_box]
+                    crop_img = np.array(image)[y1:y2, x1:x2]
+                    crop_img_rec_res = list(text_rec_model([crop_img]))[0]
+                    crop_img_dt_poly = get_bbox_intersection(
+                        overall_ocr_dt_poly, layout_box, return_format="poly"
+                    )
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    text_rec_score_thresh = (
+                        text_rec_score_thresh
+                        if text_rec_score_thresh is not None
+                        else (self.general_ocr_pipeline.text_rec_score_thresh)
                     )
+                    if crop_img_rec_score >= text_rec_score_thresh:
+                        matched_no += 1
+                        if matched_no == 1:
+                            # the first matched ocr be replaced by the first matched layout box
+                            overall_ocr_res["dt_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_boxes"][overall_ocr_idx] = crop_box
+                            overall_ocr_res["rec_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_scores"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_score
+                            overall_ocr_res["rec_texts"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_text
+                        else:
+                            # the other matched ocr be appended to the overall ocr result
+                            overall_ocr_res["dt_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_boxes"] = np.vstack(
+                                (overall_ocr_res["rec_boxes"], crop_box)
+                            )
+                            overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                            overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                            overall_ocr_res["rec_labels"].append("text")
+                            block_to_ocr_map[box_idx].remove(overall_ocr_idx)
+                            block_to_ocr_map[box_idx].append(
+                                len(overall_ocr_res["rec_texts"]) - 1
+                            )
+
+        # use layout bbox to do ocr recognition when there is no matched ocr
+        for layout_box_idx, overall_ocr_idxes in block_to_ocr_map.items():
+            has_text = False
+            for idx in overall_ocr_idxes:
+                if overall_ocr_res["rec_texts"][idx] != "":
+                    has_text = True
+                    break
+            if not has_text and layout_det_res["boxes"][layout_box_idx][
+                "label"
+            ] not in BLOCK_LABEL_MAP.get("vision_labels", []):
+                crop_box = layout_det_res["boxes"][layout_box_idx]["coordinate"]
+                x1, y1, x2, y2 = [int(i) for i in crop_box]
+                crop_img = np.array(image)[y1:y2, x1:x2]
+                crop_img_rec_res = next(text_rec_model([crop_img]))
+                crop_img_dt_poly = get_bbox_intersection(
+                    crop_box, crop_box, return_format="poly"
                 )
-
-
+                crop_img_rec_score = crop_img_rec_res["rec_score"]
+                crop_img_rec_text = crop_img_rec_res["rec_text"]
+                text_rec_score_thresh = (
+                    text_rec_score_thresh
+                    if text_rec_score_thresh is not None
+                    else (self.general_ocr_pipeline.text_rec_score_thresh)
                 )
-
-
-
-
-
+                if crop_img_rec_score >= text_rec_score_thresh:
+                    overall_ocr_res["rec_boxes"] = np.vstack(
+                        (overall_ocr_res["rec_boxes"], crop_box)
+                    )
+                    overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                    overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                    overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                    overall_ocr_res["rec_labels"].append("text")
+                    block_to_ocr_map[layout_box_idx].append(
+                        len(overall_ocr_res["rec_texts"]) - 1
                     )
-            del overall_ocr_res["rec_polys"][matched_idx]
-            del overall_ocr_res["rec_scores"][matched_idx]
 
-
-
+        # when there is no layout detection result but there is ocr result, convert ocr detection result to layout detection result
+        if len(layout_det_res["boxes"]) == 0 and len(overall_ocr_res["rec_boxes"]) > 0:
+            for idx, ocr_rec_box in enumerate(overall_ocr_res["rec_boxes"]):
+                base_region_bbox = update_region_box(ocr_rec_box, base_region_bbox)
+                layout_det_res["boxes"].append(
+                    {
+                        "label": "text",
+                        "coordinate": ocr_rec_box,
+                        "score": overall_ocr_res["rec_scores"][idx],
+                    }
+                )
+                block_to_ocr_map[idx] = [idx]
 
-
-
-
-
+        block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
+        region_det_res["boxes"] = sorted(
+            region_det_res["boxes"],
+            key=lambda item: caculate_bbox_area(item["coordinate"]),
+        )
+        if len(region_det_res["boxes"]) == 0:
+            region_det_res["boxes"] = [
+                {
+                    "coordinate": base_region_bbox,
+                    "label": "SupplementaryRegion",
+                    "score": 1,
+                }
+            ]
+            region_to_block_map[0] = range(len(block_bboxes))
+        else:
+            block_idxes_set = set(range(len(block_bboxes)))
+            # match block to region
+            for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                matched_idxes = []
+                region_to_block_map[region_idx] = []
+                region_bbox = region_info["coordinate"]
+                for block_idx in block_idxes_set:
+                    overlap_ratio = calculate_overlap_ratio(
+                        region_bbox, block_bboxes[block_idx], mode="small"
                     )
-
-
-
-
-
-
-
-
-
-
-
+                    if overlap_ratio > REGION_SETTINGS.get(
+                        "match_block_overlap_ratio_threshold", 0.8
+                    ):
+                        region_to_block_map[region_idx].append(block_idx)
+                        matched_idxes.append(block_idx)
+                if len(matched_idxes) > 0:
+                    for block_idx in matched_idxes:
+                        block_idxes_set.remove(block_idx)
+                    matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
+                    new_region_bbox = calculate_minimum_enclosing_bbox(matched_bboxes)
+                    region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
+            # Supplement region when there is no matched block
+            if len(block_idxes_set) > 0:
+                while len(block_idxes_set) > 0:
+                    matched_idxes = []
+                    unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
+                    supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                        unmatched_bboxes
+                    )
+                    # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
+                    for region_info in region_det_res["boxes"]:
+                        region_bbox = region_info["coordinate"]
+                        overlap_ratio = calculate_overlap_ratio(
+                            supplement_region_bbox, region_bbox
+                        )
+                        if overlap_ratio > 0:
+                            supplement_region_bbox, matched_idxes = (
+                                shrink_supplement_region_bbox(
+                                    supplement_region_bbox,
+                                    region_bbox,
+                                    image.shape[1],
+                                    image.shape[0],
+                                    block_idxes_set,
+                                    block_bboxes,
+                                )
+                            )
+                    if len(matched_idxes) == 0:
+                        matched_idxes = list(block_idxes_set)
+                    region_idx = len(region_det_res["boxes"])
+                    region_to_block_map[region_idx] = list(matched_idxes)
+                    for block_idx in matched_idxes:
+                        block_idxes_set.remove(block_idx)
+                    region_det_res["boxes"].append(
+                        {
+                            "coordinate": supplement_region_bbox,
+                            "label": "SupplementaryRegion",
+                            "score": 1,
+                        }
+                    )
+
+        region_block_ocr_idx_map = dict(
+            region_to_block_map=region_to_block_map,
+            block_to_ocr_map=block_to_ocr_map,
+        )
+
+        return region_block_ocr_idx_map, region_det_res, layout_det_res
+
+    def sort_line_by_projection(
+        self,
+        line: List[List[Union[List[int], str]]],
+        input_img: np.ndarray,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+        direction: str = "vertical",
+    ) -> None:
+        """
+        Sort a line of text spans based on their vertical position within the layout bounding box.
+
+        Args:
+            line (list): A list of spans, where each span is a list containing a bounding box and text.
+            input_img (ndarray): The input image used for OCR.
+            general_ocr_pipeline (Any): The general OCR pipeline used for text recognition.
+
+        Returns:
+            list: The sorted line of text spans.
+        """
+        sort_index = 0 if direction == "horizontal" else 1
+        splited_boxes = split_boxes_by_projection(line, direction)
+        splited_lines = []
+        if len(line) != len(splited_boxes):
+            splited_boxes.sort(key=lambda span: span[0][sort_index])
+            for span in splited_boxes:
+                bbox, text, label = span
+                if label == "text":
+                    crop_img = input_img[
+                        int(bbox[1]) : int(bbox[3]),
+                        int(bbox[0]) : int(bbox[2]),
+                    ]
+                    crop_img_rec_res = list(text_rec_model([crop_img]))[0]
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    text = (
+                        crop_img_rec_text
+                        if crop_img_rec_score >= text_rec_score_thresh
+                        else ""
+                    )
+                    span[1] = text
+
+                splited_lines.append(span)
+        else:
+            splited_lines = line
+
+        return splited_lines
+
+    def get_block_rec_content(
+        self,
+        image: list,
+        ocr_rec_res: dict,
+        block: LayoutParsingBlock,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> str:
+
+        if len(ocr_rec_res["rec_texts"]) == 0:
+            block.content = ""
+            return block
+
+        lines, text_direction, text_line_height = group_boxes_into_lines(
+            ocr_rec_res,
+            LINE_SETTINGS.get("line_height_iou_threshold", 0.8),
+        )
+
+        # format line
+        text_lines = []
+        need_new_line_num = 0
+        # words start coordinate and stop coordinate in the line
+        words_start_index = 0 if text_direction == "horizontal" else 1
+        words_stop_index = words_start_index + 2
+        lines_start_index = 1 if text_direction == "horizontal" else 3
+        line_width_list = []
+
+        if block.label == "reference":
+            rec_boxes = ocr_rec_res["boxes"]
+            block_start_coordinate = min([box[words_start_index] for box in rec_boxes])
+            block_stop_coordinate = max([box[words_stop_index] for box in rec_boxes])
+        else:
+            block_start_coordinate = block.bbox[words_start_index]
+            block_stop_coordinate = block.bbox[words_stop_index]
+
+        for idx, line in enumerate(lines):
+            line.sort(
+                key=lambda span: (
+                    span[0][words_start_index] // 2,
+                    (
+                        span[0][lines_start_index]
+                        if text_direction == "horizontal"
+                        else -span[0][lines_start_index]
+                    ),
+                )
+            )
+
+            line_width = line[-1][0][words_stop_index] - line[0][0][words_start_index]
+            line_width_list.append(line_width)
+            # merge formula and text
+            ocr_labels = [span[2] for span in line]
+            if "formula" in ocr_labels:
+                line = self.sort_line_by_projection(
+                    line, image, text_rec_model, text_rec_score_thresh, text_direction
+                )
+
+            line_text, need_new_line = format_line(
+                line,
+                text_direction,
+                np.max(line_width_list),
+                block_start_coordinate,
+                block_stop_coordinate,
+                line_gap_limit=text_line_height * 1.5,
+                block_label=block.label,
+            )
+            if need_new_line:
+                need_new_line_num += 1
+            if idx == 0:
+                line_start_coordinate = line[0][0][0]
+                block.seg_start_coordinate = line_start_coordinate
+            elif idx == len(lines) - 1:
+                line_end_coordinate = line[-1][0][2]
+                block.seg_end_coordinate = line_end_coordinate
+            text_lines.append(line_text)
+
+        delim = LINE_SETTINGS["delimiter_map"].get(block.label, "")
+        if need_new_line_num > len(text_lines) * 0.5 and delim == "":
+            text_lines = [text.replace("\n", "") for text in text_lines]
+            delim = "\n"
+        content = delim.join(text_lines)
+        block.content = content
+        block.num_of_lines = len(text_lines)
+        block.direction = text_direction
+        block.text_line_height = text_line_height
+        block.text_line_width = np.mean(line_width_list)
+
+        return block
+
+    def get_layout_parsing_blocks(
+        self,
+        image: list,
+        region_block_ocr_idx_map: dict,
+        region_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        layout_det_res: DetResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Extract structured information from OCR and layout detection results.
+
+        Args:
+            image (list): The input image.
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
+            seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
+            text_rec_model (Any): A model for text recognition.
+            text_rec_score_thresh (Union[float, None]): The minimum score required for a recognized character to be considered valid. If None, use the default value specified during initialization. Default is None.
+
+        Returns:
+            list: A list of structured boxes where each item is a dictionary containing:
+                - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
+                - The label as a key with either table HTML or image data and text.
+                - "block_bbox": The coordinates of the layout box.
+        """
+
+        table_index = 0
+        seal_index = 0
+        chart_index = 0
+        layout_parsing_blocks: List[LayoutParsingBlock] = []
+
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
+
+            label = box_info["label"]
+            block_bbox = box_info["coordinate"]
+            rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
+
+            block = LayoutParsingBlock(label=label, bbox=block_bbox)
+
+            if label == "table" and len(table_res_list) > 0:
+                block.content = table_res_list[table_index]["pred_html"]
+                table_index += 1
+            elif label == "seal" and len(seal_res_list) > 0:
+                block.content = "\n".join(seal_res_list[seal_index]["rec_texts"])
+                seal_index += 1
+            elif label == "chart" and len(chart_res_list) > 0:
+                block.content = chart_res_list[chart_index]
+                chart_index += 1
+            else:
+                if label == "formula":
+                    _, ocr_idx_list = get_sub_regions_ocr_res(
+                        overall_ocr_res, [block_bbox], return_match_idx=True
+                    )
+                    region_block_ocr_idx_map["block_to_ocr_map"][box_idx] = ocr_idx_list
+                else:
+                    ocr_idx_list = region_block_ocr_idx_map["block_to_ocr_map"].get(
+                        box_idx, []
+                    )
+                for box_no in ocr_idx_list:
+                    rec_res["boxes"].append(overall_ocr_res["rec_boxes"][box_no])
+                    rec_res["rec_texts"].append(
+                        overall_ocr_res["rec_texts"][box_no],
+                    )
+                    rec_res["rec_labels"].append(
+                        overall_ocr_res["rec_labels"][box_no],
+                    )
+                block = self.get_block_rec_content(
+                    image=image,
+                    block=block,
+                    ocr_rec_res=rec_res,
+                    text_rec_model=text_rec_model,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )
+
+            if (
+                label
+                in ["seal", "table", "formula", "chart"]
+                + BLOCK_LABEL_MAP["image_labels"]
+            ):
+                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
+                img_path = (
+                    f"imgs/img_in_{block.label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
+                )
+                img = Image.fromarray(image[y_min:y_max, x_min:x_max, ::-1])
+                block.image = {"path": img_path, "img": img}
+
+            layout_parsing_blocks.append(block)
+
+        region_list: List[LayoutParsingRegion] = []
+        for region_idx, region_info in enumerate(region_det_res["boxes"]):
+            region_bbox = region_info["coordinate"]
+            region_blocks = [
+                layout_parsing_blocks[idx]
+                for idx in region_block_ocr_idx_map["region_to_block_map"][region_idx]
             ]
-
-
-
-
+            region = LayoutParsingRegion(
+                bbox=region_bbox,
+                blocks=region_blocks,
+                image_shape=image.shape[:2],
             )
-
-
-
+            region_list.append(region)
+
+        region_list = sorted(
+            region_list,
+            key=lambda r: (r.weighted_distance),
+        )
+
+        return region_list
 
-
-
+    def get_layout_parsing_res(
+        self,
+        image: list,
+        region_det_res: DetResult,
+        layout_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        formula_res_list: list,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
+        Args:
+            image (list): The input image.
+            layout_det_res (DetResult): The detection result containing the layout information of the document.
+            overall_ocr_res (OCRResult): The overall OCR result containing text information.
+            table_res_list (list): A list of table recognition results.
+            seal_res_list (list): A list of seal recognition results.
+            formula_res_list (list): A list of formula recognition results.
+            text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
+        Returns:
+            list: A list of dictionaries representing the layout parsing result.
+        """
+
+        # Standardize data
+        region_block_ocr_idx_map, region_det_res, layout_det_res = (
+            self.standardized_data(
+                image=image,
+                region_det_res=region_det_res,
+                layout_det_res=layout_det_res,
+                overall_ocr_res=overall_ocr_res,
+                formula_res_list=formula_res_list,
+                text_rec_model=self.general_ocr_pipeline.text_rec_model,
+                text_rec_score_thresh=text_rec_score_thresh,
+            )
+        )
+
+        # Format layout parsing block
+        region_list = self.get_layout_parsing_blocks(
+            image=image,
+            region_block_ocr_idx_map=region_block_ocr_idx_map,
+            region_det_res=region_det_res,
             overall_ocr_res=overall_ocr_res,
             layout_det_res=layout_det_res,
             table_res_list=table_res_list,
             seal_res_list=seal_res_list,
+            chart_res_list=chart_res_list,
+            text_rec_model=self.general_ocr_pipeline.text_rec_model,
+            text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
         )
 
+        parsing_res_list = []
+        for region in region_list:
+            parsing_res_list.extend(region.sort())
+
+        index = 1
+        for block in parsing_res_list:
+            if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]:
+                block.order_index = index
+                index += 1
+
         return parsing_res_list
 
     def get_model_settings(
         self,
         use_doc_orientation_classify: Union[bool, None],
         use_doc_unwarping: Union[bool, None],
-        use_general_ocr: Union[bool, None],
         use_seal_recognition: Union[bool, None],
         use_table_recognition: Union[bool, None],
         use_formula_recognition: Union[bool, None],
+        use_chart_recognition: Union[bool, None],
+        use_region_detection: Union[bool, None],
     ) -> dict:
         """
         Get the model settings based on the provided parameters or default values.
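For readers skimming the new standardized_data logic above: a layout block is attached to a detected region when their overlap ratio, normalized by the smaller box, clears a threshold (REGION_SETTINGS "match_block_overlap_ratio_threshold", 0.8 by default). A self-contained Python sketch of that ratio, under the assumption that calculate_overlap_ratio(..., mode="small") divides the intersection area by the smaller box's area; the real helper lives in layout_parsing/utils.py and is not shown in this diff:

    def overlap_ratio_small(box_a, box_b):
        # Assumed semantics of calculate_overlap_ratio(..., mode="small"):
        # intersection area divided by the smaller box's area; boxes are [x1, y1, x2, y2].
        ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
        ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
        inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
        area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
        area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
        smaller = min(area_a, area_b)
        return inter / smaller if smaller > 0 else 0.0

    # A block fully contained in a region scores 1.0 and is attached to it.
    assert overlap_ratio_small([0, 0, 10, 10], [1, 1, 5, 5]) == 1.0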
@@ -389,7 +907,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         Args:
             use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
             use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
-            use_general_ocr (Union[bool, None]): Enables general OCR if True. Defaults to system setting if None.
             use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
             use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
             use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
@@ -406,9 +923,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         else:
             use_doc_preprocessor = False
 
-        if use_general_ocr is None:
-            use_general_ocr = self.use_general_ocr
-
         if use_seal_recognition is None:
             use_seal_recognition = self.use_seal_recognition
 
@@ -418,24 +932,32 @@ class LayoutParsingPipelineV2(BasePipeline):
         if use_formula_recognition is None:
             use_formula_recognition = self.use_formula_recognition
 
+        if use_region_detection is None:
+            use_region_detection = self.use_region_detection
+
+        if use_chart_recognition is None:
+            use_chart_recognition = self.use_chart_recognition
+
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             use_formula_recognition=use_formula_recognition,
+            use_chart_recognition=use_chart_recognition,
+            use_region_detection=use_region_detection,
         )
 
     def predict(
         self,
         input: Union[str, list[str], np.ndarray, list[np.ndarray]],
-        use_doc_orientation_classify: Union[bool, None] =
-        use_doc_unwarping: Union[bool, None] =
+        use_doc_orientation_classify: Union[bool, None] = False,
+        use_doc_unwarping: Union[bool, None] = False,
         use_textline_orientation: Optional[bool] = None,
-        use_general_ocr: Union[bool, None] = None,
         use_seal_recognition: Union[bool, None] = None,
         use_table_recognition: Union[bool, None] = None,
         use_formula_recognition: Union[bool, None] = None,
+        use_chart_recognition: Union[bool, None] = False,
+        use_region_detection: Union[bool, None] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
         layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
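A hedged usage sketch of the new predict() switches, exercised through the public PP-StructureV3 pipeline; create_pipeline is the documented PaddleX entry point, and it is assumed here that the public wrapper forwards these keyword arguments and that results expose save_to_json:

    from paddlex import create_pipeline

    pipeline = create_pipeline(pipeline="PP-StructureV3")
    output = pipeline.predict(
        "document.png",
        use_region_detection=True,    # new switch in 3.0.1
        use_chart_recognition=False,  # new switch in 3.0.1, off by default
        use_table_recognition=True,
    )
    for res in output:
        res.save_to_json(save_path="./output")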
@@ -452,7 +974,10 @@ class LayoutParsingPipelineV2(BasePipeline):
         seal_det_box_thresh: Union[float, None] = None,
         seal_det_unclip_ratio: Union[float, None] = None,
         seal_rec_score_thresh: Union[float, None] = None,
-
+        use_wired_table_cells_trans_to_html: bool = False,
+        use_wireless_table_cells_trans_to_html: bool = False,
+        use_table_orientation_classify: bool = True,
+        use_ocr_results_with_table_cells: bool = True,
         use_e2e_wired_table_rec_model: bool = False,
         use_e2e_wireless_table_rec_model: bool = True,
         **kwargs,
@@ -464,10 +989,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
+            use_region_detection (Optional[bool]): Whether to use region detection.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
             layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
             layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -488,7 +1013,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
             seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
             seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
-
+            use_wired_table_cells_trans_to_html (bool): Whether to use wired table cells trans to HTML.
+            use_wireless_table_cells_trans_to_html (bool): Whether to use wireless table cells trans to HTML.
+            use_table_orientation_classify (bool): Whether to use table orientation classification.
+            use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells.
             use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
             use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
             **kwargs (Any): Additional settings to extend functionality.
@@ -500,150 +1028,204 @@ class LayoutParsingPipelineV2(BasePipeline):
         model_settings = self.get_model_settings(
             use_doc_orientation_classify,
             use_doc_unwarping,
-            use_general_ocr,
             use_seal_recognition,
             use_table_recognition,
             use_formula_recognition,
+            use_chart_recognition,
+            use_region_detection,
         )

         if not self.check_model_settings_valid(model_settings):
             yield {"error": "the input params for model settings are invalid!"}

         for batch_data in self.batch_sampler(input):
-
+            image_arrays = self.img_reader(batch_data.instances)

             if model_settings["use_doc_preprocessor"]:
-
+                doc_preprocessor_results = list(
                     self.doc_preprocessor_pipeline(
-
+                        image_arrays,
                         use_doc_orientation_classify=use_doc_orientation_classify,
                         use_doc_unwarping=use_doc_unwarping,
-                )
+                    )
                 )
             else:
-
+                doc_preprocessor_results = [{"output_img": arr} for arr in image_arrays]

-
+            doc_preprocessor_images = [
+                item["output_img"] for item in doc_preprocessor_results
+            ]

-
+            layout_det_results = list(
                 self.layout_det_model(
-
+                    doc_preprocessor_images,
                     threshold=layout_threshold,
                     layout_nms=layout_nms,
                     layout_unclip_ratio=layout_unclip_ratio,
                     layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                 )
             )
-            imgs_in_doc =
+            imgs_in_doc = [
+                gather_imgs(img, res["boxes"])
+                for img, res in zip(doc_preprocessor_images, layout_det_results)
+            ]
+
+            if model_settings["use_region_detection"]:
+                region_det_results = list(
+                    self.region_detection_model(
+                        doc_preprocessor_images,
+                        layout_nms=True,
+                        layout_merge_bboxes_mode="small",
+                    ),
+                )
+            else:
+                region_det_results = [{"boxes": []} for _ in doc_preprocessor_images]

             if model_settings["use_formula_recognition"]:
-                formula_res_all =
+                formula_res_all = list(
                     self.formula_recognition_pipeline(
-
+                        doc_preprocessor_images,
                         use_layout_detection=False,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
-                        layout_det_res=
+                        layout_det_res=layout_det_results,
                     ),
                 )
-
+                formula_res_lists = [
+                    item["formula_res_list"] for item in formula_res_all
+                ]
             else:
-
-
-            for formula_res in formula_res_list:
-                x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
-                doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
+                formula_res_lists = [[] for _ in doc_preprocessor_images]

-
-
-                or model_settings["use_table_recognition"]
+            for doc_preprocessor_image, formula_res_list in zip(
+                doc_preprocessor_images, formula_res_lists
             ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
+
+            overall_ocr_results = list(
+                self.general_ocr_pipeline(
+                    doc_preprocessor_images,
+                    use_textline_orientation=use_textline_orientation,
+                    text_det_limit_side_len=text_det_limit_side_len,
+                    text_det_limit_type=text_det_limit_type,
+                    text_det_thresh=text_det_thresh,
+                    text_det_box_thresh=text_det_box_thresh,
+                    text_det_unclip_ratio=text_det_unclip_ratio,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                ),
+            )

-            overall_ocr_res
+            for overall_ocr_res in overall_ocr_results:
+                overall_ocr_res["rec_labels"] = ["text"] * len(
+                    overall_ocr_res["rec_texts"]
+                )

             if model_settings["use_table_recognition"]:
-
-                for
-
-
-
-
-
-
-
-
-
-
-
-
-
+                table_res_lists = []
+                for (
+                    layout_det_res,
+                    doc_preprocessor_image,
+                    overall_ocr_res,
+                    formula_res_list,
+                    imgs_in_doc_for_img,
+                ) in zip(
+                    layout_det_results,
+                    doc_preprocessor_images,
+                    overall_ocr_results,
+                    formula_res_lists,
+                    imgs_in_doc,
+                ):
+                    table_contents_for_img = copy.deepcopy(overall_ocr_res)
+                    for formula_res in formula_res_list:
+                        x_min, y_min, x_max, y_max = list(
+                            map(int, formula_res["dt_polys"])
+                        )
+                        poly_points = [
+                            (x_min, y_min),
+                            (x_max, y_min),
+                            (x_max, y_max),
+                            (x_min, y_max),
+                        ]
+                        table_contents_for_img["dt_polys"].append(poly_points)
+                        rec_formula = formula_res["rec_formula"]
+                        if not rec_formula.startswith("$") or not rec_formula.endswith(
+                            "$"
+                        ):
+                            rec_formula = f"${rec_formula}$"
+                        table_contents_for_img["rec_texts"].append(f"{rec_formula}")
+                        if table_contents_for_img["rec_boxes"].size == 0:
+                            table_contents_for_img["rec_boxes"] = np.array(
+                                [formula_res["dt_polys"]]
+                            )
+                        else:
+                            table_contents_for_img["rec_boxes"] = np.vstack(
+                                (
+                                    table_contents_for_img["rec_boxes"],
+                                    [formula_res["dt_polys"]],
+                                )
+                            )
+                        table_contents_for_img["rec_polys"].append(poly_points)
+                        table_contents_for_img["rec_scores"].append(1)
+
+                    for img in imgs_in_doc_for_img:
+                        img_path = img["path"]
+                        x_min, y_min, x_max, y_max = img["coordinate"]
+                        poly_points = [
+                            (x_min, y_min),
+                            (x_max, y_min),
+                            (x_max, y_max),
+                            (x_min, y_max),
+                        ]
+                        table_contents_for_img["dt_polys"].append(poly_points)
+                        table_contents_for_img["rec_texts"].append(
+                            f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
+                        )
+                        if table_contents_for_img["rec_boxes"].size == 0:
+                            table_contents_for_img["rec_boxes"] = np.array(
+                                [img["coordinate"]]
+                            )
+                        else:
+                            table_contents_for_img["rec_boxes"] = np.vstack(
+                                (table_contents_for_img["rec_boxes"], img["coordinate"])
+                            )
+                        table_contents_for_img["rec_polys"].append(poly_points)
+                        table_contents_for_img["rec_scores"].append(img["score"])
+
+                    table_res_all = list(
+                        self.table_recognition_pipeline(
+                            doc_preprocessor_image,
+                            use_doc_orientation_classify=False,
+                            use_doc_unwarping=False,
+                            use_layout_detection=False,
+                            use_ocr_model=False,
+                            overall_ocr_res=table_contents_for_img,
+                            layout_det_res=layout_det_res,
+                            cell_sort_by_y_projection=True,
+                            use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
+                            use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
+                            use_table_orientation_classify=use_table_orientation_classify,
+                            use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
+                            use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
+                            use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
+                        ),
                     )
-
-
-
-                for img in imgs_in_doc:
-                    img_path = img["path"]
-                    x_min, y_min, x_max, y_max = img["coordinate"]
-                    poly_points = [
-                        (x_min, y_min),
-                        (x_max, y_min),
-                        (x_max, y_max),
-                        (x_min, y_max),
+                    single_table_res_lists = [
+                        item["table_res_list"] for item in table_res_all
                     ]
-
-                    table_contents["rec_texts"].append(
-                        f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
-                    )
-                    if table_contents["rec_boxes"].size == 0:
-                        table_contents["rec_boxes"] = np.array([img["coordinate"]])
-                    else:
-                        table_contents["rec_boxes"] = np.vstack(
-                            (table_contents["rec_boxes"], img["coordinate"])
-                        )
-                    table_contents["rec_polys"].append(poly_points)
-                    table_contents["rec_scores"].append(img["score"])
-
-                table_res_all = next(
-                    self.table_recognition_pipeline(
-                        doc_preprocessor_image,
-                        use_doc_orientation_classify=False,
-                        use_doc_unwarping=False,
-                        use_layout_detection=False,
-                        use_ocr_model=False,
-                        overall_ocr_res=table_contents,
-                        layout_det_res=layout_det_res,
-                        cell_sort_by_y_projection=True,
-                        use_table_cells_ocr_results=use_table_cells_ocr_results,
-                        use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
-                        use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
-                    ),
-                )
-                table_res_list = table_res_all["table_res_list"]
+                    table_res_lists.extend(single_table_res_lists)
             else:
-
+                table_res_lists = [[] for _ in doc_preprocessor_images]

             if model_settings["use_seal_recognition"]:
-                seal_res_all =
+                seal_res_all = list(
                     self.seal_recognition_pipeline(
-
+                        doc_preprocessor_images,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
                         use_layout_detection=False,
-                        layout_det_res=
+                        layout_det_res=layout_det_results,
                         seal_det_limit_side_len=seal_det_limit_side_len,
                         seal_det_limit_type=seal_det_limit_type,
                         seal_det_thresh=seal_det_thresh,
@@ -652,46 +1234,85 @@ class LayoutParsingPipelineV2(BasePipeline):
                         seal_rec_score_thresh=seal_rec_score_thresh,
                     ),
                 )
-
+                seal_res_lists = [item["seal_res_list"] for item in seal_res_all]
             else:
-
+                seal_res_lists = [[] for _ in doc_preprocessor_images]

-
+            for (
+                input_path,
+                page_index,
                 doc_preprocessor_image,
-
-
-
-
-
-
-
-
-
-
-
-
-
+                doc_preprocessor_res,
+                layout_det_res,
+                region_det_res,
+                overall_ocr_res,
+                table_res_list,
+                seal_res_list,
+                formula_res_list,
+                imgs_in_doc_for_img,
+            ) in zip(
+                batch_data.input_paths,
+                batch_data.page_indexes,
+                doc_preprocessor_images,
+                doc_preprocessor_results,
+                layout_det_results,
+                region_det_results,
+                overall_ocr_results,
+                table_res_lists,
+                seal_res_lists,
+                formula_res_lists,
+                imgs_in_doc,
+            ):
+                chart_res_list = []
+                if model_settings["use_chart_recognition"]:
+                    chart_imgs_list = []
+                    for bbox in layout_det_res["boxes"]:
+                        if bbox["label"] == "chart":
+                            x_min, y_min, x_max, y_max = bbox["coordinate"]
+                            chart_img = doc_preprocessor_image[
+                                int(y_min) : int(y_max), int(x_min) : int(x_max), :
+                            ]
+                            chart_imgs_list.append({"image": chart_img})
+
+                    for chart_res_batch in self.chart_recognition_model(
+                        input=chart_imgs_list
+                    ):
+                        chart_res_list.append(chart_res_batch["result"])
+
+                parsing_res_list = self.get_layout_parsing_res(
+                    doc_preprocessor_image,
+                    region_det_res=region_det_res,
+                    layout_det_res=layout_det_res,
+                    overall_ocr_res=overall_ocr_res,
+                    table_res_list=table_res_list,
+                    seal_res_list=seal_res_list,
+                    chart_res_list=chart_res_list,
+                    formula_res_list=formula_res_list,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )

-
-
-
-
-
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
+                        "input_img"
+                    ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                single_img_res = {
+                    "input_path": input_path,
+                    "page_index": page_index,
+                    "doc_preprocessor_res": doc_preprocessor_res,
+                    "layout_det_res": layout_det_res,
+                    "region_det_res": region_det_res,
+                    "overall_ocr_res": overall_ocr_res,
+                    "table_res_list": table_res_list,
+                    "seal_res_list": seal_res_list,
+                    "chart_res_list": chart_res_list,
+                    "formula_res_list": formula_res_list,
+                    "parsing_res_list": parsing_res_list,
+                    "imgs_in_doc": imgs_in_doc_for_img,
+                    "model_settings": model_settings,
+                }
+                yield LayoutParsingResultV2(single_img_res)

     def concatenate_markdown_pages(self, markdown_list: list) -> tuple:
         """
@@ -747,3 +1368,15 @@ class LayoutParsingPipelineV2(BasePipeline):
         )

         return markdown_texts
+
+
+@pipeline_requires_extra("ocr")
+class LayoutParsingPipelineV2(AutoParallelImageSimpleInferencePipeline):
+    entities = ["PP-StructureV3"]
+
+    @property
+    def _pipeline_cls(self):
+        return _LayoutParsingPipelineV2
+
+    def _get_batch_size(self, config):
+        return config.get("batch_size", 1)