paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +29 -73
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/ts/funcs.py +19 -8
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +8 -2
- paddlex/inference/models/formula_recognition/processors.py +90 -77
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/models/text_recognition/result.py +1 -1
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +21 -18
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_app.py +46 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +30 -16
- paddlex/inference/utils/hpi_model_info_collection.json +666 -162
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/misc.py +20 -0
- paddlex/inference/utils/mkldnn_blocklist.py +59 -0
- paddlex/inference/utils/official_models.py +140 -5
- paddlex/inference/utils/pp_option.py +74 -9
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +8 -5
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +3 -3
- paddlex/utils/device.py +5 -13
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +11 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- paddlex/utils/subclass_register.py +2 -2
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
- {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
@@ -15,9 +15,10 @@ from __future__ import annotations

 import copy
 import re
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import numpy as np
+from PIL import Image

 from ....utils import logging
 from ....utils.deps import pipeline_requires_extra
@@ -26,18 +27,30 @@ from ...common.reader import ReadImage
 from ...models.object_detection.result import DetResult
 from ...utils.hpi import HPIConfig
 from ...utils.pp_option import PaddlePredictorOption
+from .._parallel import AutoParallelImageSimpleInferencePipeline
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
+from .layout_objects import LayoutBlock, LayoutRegion
 from .result_v2 import LayoutParsingResultV2
-from .
-
-
-
-
+from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, REGION_SETTINGS
+from .utils import (
+    caculate_bbox_area,
+    calculate_minimum_enclosing_bbox,
+    calculate_overlap_ratio,
+    convert_formula_res_to_ocr_format,
+    gather_imgs,
+    get_bbox_intersection,
+    get_sub_regions_ocr_res,
+    remove_overlap_blocks,
+    shrink_supplement_region_bbox,
+    update_region_box,
+)
+from .xycut_enhanced import xycut_enhanced
+
+
+class _LayoutParsingPipelineV2(BasePipeline):
     """Layout Parsing Pipeline V2"""

-    entities = ["PP-StructureV3"]
-
     def __init__(
         self,
         config: dict,
@@ -53,9 +66,9 @@ class LayoutParsingPipelineV2(BasePipeline):
             device (str, optional): Device to run the predictions on. Defaults to None.
             pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
             use_hpip (bool, optional): Whether to use the high-performance
-                inference plugin (HPIP). Defaults to False.
+                inference plugin (HPIP) by default. Defaults to False.
             hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
-                The high-performance inference configuration dictionary.
+                The default high-performance inference configuration dictionary.
                 Defaults to None.
         """

@@ -68,8 +81,7 @@ class LayoutParsingPipelineV2(BasePipeline):

         self.inintial_predictor(config)

-        self.batch_sampler = ImageBatchSampler(batch_size=1)
-
+        self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
         self.img_reader = ReadImage(format="BGR")

     def inintial_predictor(self, config: dict) -> None:
@@ -83,13 +95,20 @@ class LayoutParsingPipelineV2(BasePipeline):
         """

         self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
-        self.use_general_ocr = config.get("use_general_ocr", True)
         self.use_table_recognition = config.get("use_table_recognition", True)
         self.use_seal_recognition = config.get("use_seal_recognition", True)
+        self.use_region_detection = config.get(
+            "use_region_detection",
+            True,
+        )
         self.use_formula_recognition = config.get(
             "use_formula_recognition",
             True,
         )
+        self.use_chart_recognition = config.get(
+            "use_chart_recognition",
+            False,
+        )

         if self.use_doc_preprocessor:
             doc_preprocessor_config = config.get("SubPipelines", {}).get(
@@ -101,6 +120,16 @@ class LayoutParsingPipelineV2(BasePipeline):
             self.doc_preprocessor_pipeline = self.create_pipeline(
                 doc_preprocessor_config,
             )
+        if self.use_region_detection:
+            region_detection_config = config.get("SubModules", {}).get(
+                "RegionDetection",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.region_detection_model = self.create_model(
+                region_detection_config,
+            )

         layout_det_config = config.get("SubModules", {}).get(
             "LayoutDetection",
@@ -123,14 +152,13 @@ class LayoutParsingPipelineV2(BasePipeline):
             layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
         self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)

-
-
-
-
-
-
-
-            )
+        general_ocr_config = config.get("SubPipelines", {}).get(
+            "GeneralOCR",
+            {"pipeline_config_error": "config error for general_ocr_pipeline!"},
+        )
+        self.general_ocr_pipeline = self.create_pipeline(
+            general_ocr_config,
+        )

         if self.use_seal_recognition:
             seal_recognition_config = config.get("SubPipelines", {}).get(
@@ -165,6 +193,17 @@ class LayoutParsingPipelineV2(BasePipeline):
                 formula_recognition_config,
             )

+        if self.use_chart_recognition:
+            chart_recognition_config = config.get("SubModules", {}).get(
+                "ChartRecognition",
+                {
+                    "model_config_error": "config error for block_region_detection_model!"
+                },
+            )
+            self.chart_recognition_model = self.create_model(
+                chart_recognition_config,
+            )
+
         return

     def get_text_paragraphs_ocr_res(
@@ -209,12 +248,6 @@ class LayoutParsingPipelineV2(BasePipeline):
             )
             return False

-        if input_params["use_general_ocr"] and not self.use_general_ocr:
-            logging.error(
-                "Set use_general_ocr, but the models for general OCR are not initialized.",
-            )
-            return False
-
         if input_params["use_seal_recognition"] and not self.use_seal_recognition:
             logging.error(
                 "Set use_seal_recognition, but the models for seal recognition are not initialized.",
@@ -229,159 +262,584 @@ class LayoutParsingPipelineV2(BasePipeline):

         return True

-    def
+    def standardized_data(
         self,
         image: list,
+        region_det_res: DetResult,
         layout_det_res: DetResult,
         overall_ocr_res: OCRResult,
-        table_res_list: list,
-        seal_res_list: list,
         formula_res_list: list,
-
-
-        text_det_limit_type: Optional[str] = None,
-        text_det_thresh: Optional[float] = None,
-        text_det_box_thresh: Optional[float] = None,
-        text_det_unclip_ratio: Optional[float] = None,
-        text_rec_score_thresh: Optional[float] = None,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
     ) -> list:
         """
         Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
         Args:
             image (list): The input image.
-
-
-
-
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
             formula_res_list (list): A list of formula recognition results.
-
-            text_det_limit_type (Optional[str], optional): The type of limit for the text detection region. Defaults to None.
-            text_det_thresh (Optional[float], optional): The confidence threshold for text detection. Defaults to None.
-            text_det_box_thresh (Optional[float], optional): The confidence threshold for text detection bounding boxes. Defaults to None
-            text_det_unclip_ratio (Optional[float], optional): The unclip ratio for text detection. Defaults to None.
+            text_rec_model (Any): The text recognition model.
             text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
         Returns:
             list: A list of dictionaries representing the layout parsing result.
         """
+
         matched_ocr_dict = {}
-
+        region_to_block_map = {}
+        block_to_ocr_map = {}
         object_boxes = []
         footnote_list = []
-
+        paragraph_title_list = []
+        bottom_text_y_max = 0
+        max_block_area = 0.0
+        doc_title_num = 0
+
+        base_region_bbox = [65535, 65535, 0, 0]
+        layout_det_res = remove_overlap_blocks(
+            layout_det_res,
+            threshold=0.5,
+            smaller=True,
+        )

-
+        # convert formula_res_list to OCRResult format
+        convert_formula_res_to_ocr_format(formula_res_list, overall_ocr_res)
+
+        # match layout boxes and ocr boxes and get some information for layout_order_config
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
             box = box_info["coordinate"]
             label = box_info["label"].lower()
             object_boxes.append(box)
+            _, _, _, y2 = box
+
+            # update the region box and max_block_area according to the layout boxes
+            base_region_bbox = update_region_box(box, base_region_bbox)
+            max_block_area = max(max_block_area, caculate_bbox_area(box))
+
+            # update_layout_order_config_block_index(layout_order_config, label, box_idx)

             # set the label of footnote to text, when it is above the text boxes
             if label == "footnote":
-                footnote_list.append(
-
-
+                footnote_list.append(box_idx)
+            elif label == "paragraph_title":
+                paragraph_title_list.append(box_idx)
+            if label == "text":
+                bottom_text_y_max = max(y2, bottom_text_y_max)
+            if label == "doc_title":
+                doc_title_num += 1

             if label not in ["formula", "table", "seal"]:
-                _,
+                _, matched_idxes = get_sub_regions_ocr_res(
                     overall_ocr_res, [box], return_match_idx=True
                 )
-
+                block_to_ocr_map[box_idx] = matched_idxes
+                for matched_idx in matched_idxes:
                     if matched_ocr_dict.get(matched_idx, None) is None:
-                        matched_ocr_dict[matched_idx] = [
+                        matched_ocr_dict[matched_idx] = [box_idx]
                     else:
-                        matched_ocr_dict[matched_idx].append(
+                        matched_ocr_dict[matched_idx].append(box_idx)

+        # fix the footnote label
         for footnote_idx in footnote_list:
             if (
                 layout_det_res["boxes"][footnote_idx]["coordinate"][3]
-                <
+                < bottom_text_y_max
             ):
                 layout_det_res["boxes"][footnote_idx]["label"] = "text"

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # check if there is only one paragraph title and without doc_title
+        only_one_paragraph_title = len(paragraph_title_list) == 1 and doc_title_num == 0
+        if only_one_paragraph_title:
+            paragraph_title_block_area = caculate_bbox_area(
+                layout_det_res["boxes"][paragraph_title_list[0]]["coordinate"]
+            )
+            title_area_max_block_threshold = BLOCK_SETTINGS.get(
+                "title_conversion_area_ratio_threshold", 0.3
+            )
+            if (
+                paragraph_title_block_area
+                > max_block_area * title_area_max_block_threshold
+            ):
+                layout_det_res["boxes"][paragraph_title_list[0]]["label"] = "doc_title"
+
+        # Replace the OCR information of the hurdles.
+        for overall_ocr_idx, layout_box_ids in matched_ocr_dict.items():
+            if len(layout_box_ids) > 1:
+                matched_no = 0
+                overall_ocr_box = copy.deepcopy(
+                    overall_ocr_res["rec_boxes"][overall_ocr_idx]
+                )
+                overall_ocr_dt_poly = copy.deepcopy(
+                    overall_ocr_res["dt_polys"][overall_ocr_idx]
+                )
+                for box_idx in layout_box_ids:
+                    layout_box = layout_det_res["boxes"][box_idx]["coordinate"]
+                    crop_box = get_bbox_intersection(overall_ocr_box, layout_box)
+                    for ocr_idx in block_to_ocr_map[box_idx]:
+                        ocr_box = overall_ocr_res["rec_boxes"][ocr_idx]
+                        iou = calculate_overlap_ratio(ocr_box, crop_box, "small")
+                        if iou > 0.8:
+                            overall_ocr_res["rec_texts"][ocr_idx] = ""
+                    x1, y1, x2, y2 = [int(i) for i in crop_box]
+                    crop_img = np.array(image)[y1:y2, x1:x2]
+                    crop_img_rec_res = list(text_rec_model([crop_img]))[0]
+                    crop_img_dt_poly = get_bbox_intersection(
+                        overall_ocr_dt_poly, layout_box, return_format="poly"
                    )
+                    crop_img_rec_score = crop_img_rec_res["rec_score"]
+                    crop_img_rec_text = crop_img_rec_res["rec_text"]
+                    text_rec_score_thresh = (
+                        text_rec_score_thresh
+                        if text_rec_score_thresh is not None
+                        else (self.general_ocr_pipeline.text_rec_score_thresh)
+                    )
+                    if crop_img_rec_score >= text_rec_score_thresh:
+                        matched_no += 1
+                        if matched_no == 1:
+                            # the first matched ocr be replaced by the first matched layout box
+                            overall_ocr_res["dt_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_boxes"][overall_ocr_idx] = crop_box
+                            overall_ocr_res["rec_polys"][
+                                overall_ocr_idx
+                            ] = crop_img_dt_poly
+                            overall_ocr_res["rec_scores"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_score
+                            overall_ocr_res["rec_texts"][
+                                overall_ocr_idx
+                            ] = crop_img_rec_text
+                        else:
+                            # the other matched ocr be appended to the overall ocr result
+                            overall_ocr_res["dt_polys"].append(crop_img_dt_poly)
+                            if len(overall_ocr_res["rec_boxes"]) == 0:
+                                overall_ocr_res["rec_boxes"] = np.array([crop_box])
+                            else:
+                                overall_ocr_res["rec_boxes"] = np.vstack(
+                                    (overall_ocr_res["rec_boxes"], crop_box)
+                                )
+                            overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                            overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                            overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                            overall_ocr_res["rec_labels"].append("text")
+                            block_to_ocr_map[box_idx].remove(overall_ocr_idx)
+                            block_to_ocr_map[box_idx].append(
+                                len(overall_ocr_res["rec_texts"]) - 1
+                            )
+
+        # use layout bbox to do ocr recognition when there is no matched ocr
+        for layout_box_idx, overall_ocr_idxes in block_to_ocr_map.items():
+            has_text = False
+            for idx in overall_ocr_idxes:
+                if overall_ocr_res["rec_texts"][idx] != "":
+                    has_text = True
+                    break
+            if not has_text and layout_det_res["boxes"][layout_box_idx][
+                "label"
+            ] not in BLOCK_LABEL_MAP.get("vision_labels", []):
+                crop_box = layout_det_res["boxes"][layout_box_idx]["coordinate"]
+                x1, y1, x2, y2 = [int(i) for i in crop_box]
+                crop_img = np.array(image)[y1:y2, x1:x2]
+                crop_img_rec_res = next(text_rec_model([crop_img]))
+                crop_img_dt_poly = get_bbox_intersection(
+                    crop_box, crop_box, return_format="poly"
                )
-
-
+                crop_img_rec_score = crop_img_rec_res["rec_score"]
+                crop_img_rec_text = crop_img_rec_res["rec_text"]
+                text_rec_score_thresh = (
+                    text_rec_score_thresh
+                    if text_rec_score_thresh is not None
+                    else (self.general_ocr_pipeline.text_rec_score_thresh)
                )
-
-
-
-
-                overall_ocr_res["rec_boxes"]
+                if crop_img_rec_score >= text_rec_score_thresh:
+                    if len(overall_ocr_res["rec_boxes"]) == 0:
+                        overall_ocr_res["rec_boxes"] = np.array([crop_box])
+                    else:
+                        overall_ocr_res["rec_boxes"] = np.vstack(
+                            (overall_ocr_res["rec_boxes"], crop_box)
+                        )
+                    overall_ocr_res["rec_polys"].append(crop_img_dt_poly)
+                    overall_ocr_res["rec_scores"].append(crop_img_rec_score)
+                    overall_ocr_res["rec_texts"].append(crop_img_rec_text)
+                    overall_ocr_res["rec_labels"].append("text")
+                    block_to_ocr_map[layout_box_idx].append(
+                        len(overall_ocr_res["rec_texts"]) - 1
                    )
-                del overall_ocr_res["rec_polys"][matched_idx]
-                del overall_ocr_res["rec_scores"][matched_idx]

-
-
+        # when there is no layout detection result but there is ocr result, convert ocr detection result to layout detection result
+        if len(layout_det_res["boxes"]) == 0 and len(overall_ocr_res["rec_boxes"]) > 0:
+            for idx, ocr_rec_box in enumerate(overall_ocr_res["rec_boxes"]):
+                base_region_bbox = update_region_box(ocr_rec_box, base_region_bbox)
+                layout_det_res["boxes"].append(
+                    {
+                        "label": "text",
+                        "coordinate": ocr_rec_box,
+                        "score": overall_ocr_res["rec_scores"][idx],
+                    }
+                )
+                block_to_ocr_map[idx] = [idx]

-
-
-
-
+        mask_labels = (
+            BLOCK_LABEL_MAP.get("unordered_labels", [])
+            + BLOCK_LABEL_MAP.get("header_labels", [])
+            + BLOCK_LABEL_MAP.get("footer_labels", [])
+        )
+        block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
+        region_det_res["boxes"] = sorted(
+            region_det_res["boxes"],
+            key=lambda item: caculate_bbox_area(item["coordinate"]),
+        )
+        if len(region_det_res["boxes"]) == 0:
+            region_det_res["boxes"] = [
+                {
+                    "coordinate": base_region_bbox,
+                    "label": "SupplementaryRegion",
+                    "score": 1,
+                }
+            ]
+            region_to_block_map[0] = range(len(block_bboxes))
+        else:
+            block_idxes_set = set(range(len(block_bboxes)))
+            # match block to region
+            for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                matched_idxes = []
+                region_to_block_map[region_idx] = []
+                region_bbox = region_info["coordinate"]
+                for block_idx in block_idxes_set:
+                    if layout_det_res["boxes"][block_idx]["label"] in mask_labels:
+                        continue
+                    overlap_ratio = calculate_overlap_ratio(
+                        region_bbox, block_bboxes[block_idx], mode="small"
                    )
-
-
-
-
-
-
-
-
-
-
-
+                    if overlap_ratio > REGION_SETTINGS.get(
+                        "match_block_overlap_ratio_threshold", 0.8
+                    ):
+                        matched_idxes.append(block_idx)
+                old_region_bbox_matched_idxes = []
+                if len(matched_idxes) > 0:
+                    while len(old_region_bbox_matched_idxes) != len(matched_idxes):
+                        old_region_bbox_matched_idxes = copy.deepcopy(matched_idxes)
+                        matched_idxes = []
+                        matched_bboxes = [
+                            block_bboxes[idx] for idx in old_region_bbox_matched_idxes
+                        ]
+                        new_region_bbox = calculate_minimum_enclosing_bbox(
+                            matched_bboxes
+                        )
+                        for block_idx in block_idxes_set:
+                            if (
+                                layout_det_res["boxes"][block_idx]["label"]
+                                in mask_labels
+                            ):
+                                continue
+                            overlap_ratio = calculate_overlap_ratio(
+                                new_region_bbox, block_bboxes[block_idx], mode="small"
+                            )
+                            if overlap_ratio > REGION_SETTINGS.get(
+                                "match_block_overlap_ratio_threshold", 0.8
+                            ):
+                                matched_idxes.append(block_idx)
+                    for block_idx in matched_idxes:
+                        block_idxes_set.remove(block_idx)
+                    region_to_block_map[region_idx] = matched_idxes
+                    region_det_res["boxes"][region_idx]["coordinate"] = new_region_bbox
+            # Supplement region when there is no matched block
+            while len(block_idxes_set) > 0:
+                unmatched_bboxes = [block_bboxes[idx] for idx in block_idxes_set]
+                if len(unmatched_bboxes) == 0:
+                    break
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    unmatched_bboxes
+                )
+                matched_idxes = []
+                # check if the new region bbox is overlapped with other region bbox, if have, then shrink the new region bbox
+                for region_idx, region_info in enumerate(region_det_res["boxes"]):
+                    if len(region_to_block_map[region_idx]) == 0:
+                        continue
+                    region_bbox = region_info["coordinate"]
+                    overlap_ratio = calculate_overlap_ratio(
+                        supplement_region_bbox, region_bbox
+                    )
+                    if overlap_ratio > 0:
+                        supplement_region_bbox, matched_idxes = (
+                            shrink_supplement_region_bbox(
+                                supplement_region_bbox,
+                                region_bbox,
+                                image.shape[1],
+                                image.shape[0],
+                                block_idxes_set,
+                                block_bboxes,
+                            )
+                        )
+
+                matched_idxes = [
+                    idx
+                    for idx in matched_idxes
+                    if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                ]
+                if len(matched_idxes) == 0:
+                    matched_idxes = [
+                        idx
+                        for idx in block_idxes_set
+                        if layout_det_res["boxes"][idx]["label"] not in mask_labels
+                    ]
+                if len(matched_idxes) == 0:
+                    break
+                matched_bboxes = [block_bboxes[idx] for idx in matched_idxes]
+                supplement_region_bbox = calculate_minimum_enclosing_bbox(
+                    matched_bboxes
+                )
+                region_idx = len(region_det_res["boxes"])
+                region_to_block_map[region_idx] = list(matched_idxes)
+                for block_idx in matched_idxes:
+                    block_idxes_set.remove(block_idx)
+                region_det_res["boxes"].append(
+                    {
+                        "coordinate": supplement_region_bbox,
+                        "label": "SupplementaryRegion",
+                        "score": 1,
+                    }
+                )
+
+        mask_idxes = [
+            idx
+            for idx in range(len(layout_det_res["boxes"]))
+            if layout_det_res["boxes"][idx]["label"] in mask_labels
        ]
-
-
-
-
+        for idx in mask_idxes:
+            bbox = layout_det_res["boxes"][idx]["coordinate"]
+            region_idx = len(region_det_res["boxes"])
+            region_to_block_map[region_idx] = [idx]
+            region_det_res["boxes"].append(
+                {
+                    "coordinate": bbox,
+                    "label": "SupplementaryRegion",
+                    "score": 1,
+                }
+            )
+
+        region_block_ocr_idx_map = dict(
+            region_to_block_map=region_to_block_map,
+            block_to_ocr_map=block_to_ocr_map,
+        )
+
+        return region_block_ocr_idx_map, region_det_res, layout_det_res
+
+    def get_layout_parsing_objects(
+        self,
+        image: list,
+        region_block_ocr_idx_map: dict,
+        region_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        layout_det_res: DetResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        text_rec_model: Any,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Extract structured information from OCR and layout detection results.
+
+        Args:
+            image (list): The input image.
+            overall_ocr_res (OCRResult): An object containing the overall OCR results, including detected text boxes and recognized text. The structure is expected to have:
+                - "input_img": The image on which OCR was performed.
+                - "dt_boxes": A list of detected text box coordinates.
+                - "rec_texts": A list of recognized text corresponding to the detected boxes.
+
+            layout_det_res (DetResult): An object containing the layout detection results, including detected layout boxes and their labels. The structure is expected to have:
+                - "boxes": A list of dictionaries with keys "coordinate" for box coordinates and "block_label" for the type of content.
+
+            table_res_list (list): A list of table detection results, where each item is a dictionary containing:
+                - "block_bbox": The bounding box of the table layout.
+                - "pred_html": The predicted HTML representation of the table.
+
+            seal_res_list (List): A list of seal detection results. The details of each item depend on the specific application context.
+            text_rec_model (Any): A model for text recognition.
+            text_rec_score_thresh (Union[float, None]): The minimum score required for a recognized character to be considered valid. If None, use the default value specified during initialization. Default is None.
+
+        Returns:
+            list: A list of structured boxes where each item is a dictionary containing:
+                - "block_label": The label of the content (e.g., 'table', 'chart', 'image').
+                - The label as a key with either table HTML or image data and text.
+                - "block_bbox": The coordinates of the layout box.
+        """
+
+        table_index = 0
+        seal_index = 0
+        chart_index = 0
+        layout_parsing_blocks: List[LayoutBlock] = []
+
+        for box_idx, box_info in enumerate(layout_det_res["boxes"]):
+
+            label = box_info["label"]
+            block_bbox = box_info["coordinate"]
+            rec_res = {"boxes": [], "rec_texts": [], "rec_labels": []}
+
+            block = LayoutBlock(label=label, bbox=block_bbox)
+
+            if label == "table" and len(table_res_list) > 0:
+                block.content = table_res_list[table_index]["pred_html"]
+                table_index += 1
+            elif label == "seal" and len(seal_res_list) > 0:
+                block.content = "\n".join(seal_res_list[seal_index]["rec_texts"])
+                seal_index += 1
+            elif label == "chart" and len(chart_res_list) > 0:
+                block.content = chart_res_list[chart_index]
+                chart_index += 1
+            else:
+                if label == "formula":
+                    _, ocr_idx_list = get_sub_regions_ocr_res(
+                        overall_ocr_res, [block_bbox], return_match_idx=True
+                    )
+                    region_block_ocr_idx_map["block_to_ocr_map"][box_idx] = ocr_idx_list
+                else:
+                    ocr_idx_list = region_block_ocr_idx_map["block_to_ocr_map"].get(
+                        box_idx, []
+                    )
+                for box_no in ocr_idx_list:
+                    rec_res["boxes"].append(overall_ocr_res["rec_boxes"][box_no])
+                    rec_res["rec_texts"].append(
+                        overall_ocr_res["rec_texts"][box_no],
+                    )
+                    rec_res["rec_labels"].append(
+                        overall_ocr_res["rec_labels"][box_no],
+                    )
+                block.update_text_content(
+                    image=image,
+                    ocr_rec_res=rec_res,
+                    text_rec_model=text_rec_model,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )
+
+            if (
+                label
+                in ["seal", "table", "formula", "chart"]
+                + BLOCK_LABEL_MAP["image_labels"]
+            ):
+                x_min, y_min, x_max, y_max = list(map(int, block_bbox))
+                img_path = (
+                    f"imgs/img_in_{block.label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
+                )
+                img = Image.fromarray(image[y_min:y_max, x_min:x_max, ::-1])
+                block.image = {"path": img_path, "img": img}
+
+            layout_parsing_blocks.append(block)
+
+        page_region_bbox = [65535, 65535, 0, 0]
+        layout_parsing_regions: List[LayoutRegion] = []
+        for region_idx, region_info in enumerate(region_det_res["boxes"]):
+            region_bbox = np.array(region_info["coordinate"]).astype("int")
+            region_blocks = [
+                layout_parsing_blocks[idx]
+                for idx in region_block_ocr_idx_map["region_to_block_map"][region_idx]
+            ]
+            if region_blocks:
+                page_region_bbox = update_region_box(region_bbox, page_region_bbox)
+                region = LayoutRegion(bbox=region_bbox, blocks=region_blocks)
+                layout_parsing_regions.append(region)
+
+        layout_parsing_page = LayoutRegion(
+            bbox=np.array(page_region_bbox).astype("int"), blocks=layout_parsing_regions
+        )
+
+        return layout_parsing_page
+
+    def sort_layout_parsing_blocks(
+        self, layout_parsing_page: LayoutRegion
+    ) -> List[LayoutBlock]:
+        layout_parsing_regions = xycut_enhanced(layout_parsing_page)
+        parsing_res_list = []
+        for region in layout_parsing_regions:
+            layout_parsing_blocks = xycut_enhanced(region)
+            parsing_res_list.extend(layout_parsing_blocks)
+
+        return parsing_res_list
+
+    def get_layout_parsing_res(
+        self,
+        image: list,
+        region_det_res: DetResult,
+        layout_det_res: DetResult,
+        overall_ocr_res: OCRResult,
+        table_res_list: list,
+        seal_res_list: list,
+        chart_res_list: list,
+        formula_res_list: list,
+        text_rec_score_thresh: Union[float, None] = None,
+    ) -> list:
+        """
+        Retrieves the layout parsing result based on the layout detection result, OCR result, and other recognition results.
+        Args:
+            image (list): The input image.
+            layout_det_res (DetResult): The detection result containing the layout information of the document.
+            overall_ocr_res (OCRResult): The overall OCR result containing text information.
+            table_res_list (list): A list of table recognition results.
+            seal_res_list (list): A list of seal recognition results.
+            formula_res_list (list): A list of formula recognition results.
+            text_rec_score_thresh (Optional[float], optional): The score threshold for text recognition. Defaults to None.
+        Returns:
+            list: A list of dictionaries representing the layout parsing result.
+        """
+
+        # Standardize data
+        region_block_ocr_idx_map, region_det_res, layout_det_res = (
+            self.standardized_data(
+                image=image,
+                region_det_res=region_det_res,
+                layout_det_res=layout_det_res,
+                overall_ocr_res=overall_ocr_res,
+                formula_res_list=formula_res_list,
+                text_rec_model=self.general_ocr_pipeline.text_rec_model,
+                text_rec_score_thresh=text_rec_score_thresh,
            )
-
-        overall_ocr_res["rec_polys"].append(poly_points)
-        overall_ocr_res["rec_scores"].append(1)
+        )

-
-
+        # Format layout parsing block
+        layout_parsing_page = self.get_layout_parsing_objects(
+            image=image,
+            region_block_ocr_idx_map=region_block_ocr_idx_map,
+            region_det_res=region_det_res,
             overall_ocr_res=overall_ocr_res,
             layout_det_res=layout_det_res,
             table_res_list=table_res_list,
             seal_res_list=seal_res_list,
+            chart_res_list=chart_res_list,
+            text_rec_model=self.general_ocr_pipeline.text_rec_model,
+            text_rec_score_thresh=self.general_ocr_pipeline.text_rec_score_thresh,
         )

+        parsing_res_list = self.sort_layout_parsing_blocks(layout_parsing_page)
+
+        index = 1
+        for block in parsing_res_list:
+            if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]:
+                block.order_index = index
+                index += 1
+
         return parsing_res_list

     def get_model_settings(
         self,
         use_doc_orientation_classify: Union[bool, None],
         use_doc_unwarping: Union[bool, None],
-        use_general_ocr: Union[bool, None],
         use_seal_recognition: Union[bool, None],
         use_table_recognition: Union[bool, None],
         use_formula_recognition: Union[bool, None],
+        use_chart_recognition: Union[bool, None],
+        use_region_detection: Union[bool, None],
     ) -> dict:
         """
         Get the model settings based on the provided parameters or default values.
@@ -389,7 +847,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         Args:
             use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
             use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
-            use_general_ocr (Union[bool, None]): Enables general OCR if True. Defaults to system setting if None.
            use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
            use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
            use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
@@ -406,9 +863,6 @@ class LayoutParsingPipelineV2(BasePipeline):
         else:
             use_doc_preprocessor = False

-        if use_general_ocr is None:
-            use_general_ocr = self.use_general_ocr
-
         if use_seal_recognition is None:
             use_seal_recognition = self.use_seal_recognition

@@ -418,24 +872,32 @@ class LayoutParsingPipelineV2(BasePipeline):
         if use_formula_recognition is None:
             use_formula_recognition = self.use_formula_recognition

+        if use_region_detection is None:
+            use_region_detection = self.use_region_detection
+
+        if use_chart_recognition is None:
+            use_chart_recognition = self.use_chart_recognition
+
         return dict(
             use_doc_preprocessor=use_doc_preprocessor,
-            use_general_ocr=use_general_ocr,
             use_seal_recognition=use_seal_recognition,
             use_table_recognition=use_table_recognition,
             use_formula_recognition=use_formula_recognition,
+            use_chart_recognition=use_chart_recognition,
+            use_region_detection=use_region_detection,
         )

     def predict(
         self,
         input: Union[str, list[str], np.ndarray, list[np.ndarray]],
-        use_doc_orientation_classify: Union[bool, None] =
-        use_doc_unwarping: Union[bool, None] =
+        use_doc_orientation_classify: Union[bool, None] = False,
+        use_doc_unwarping: Union[bool, None] = False,
         use_textline_orientation: Optional[bool] = None,
-        use_general_ocr: Union[bool, None] = None,
         use_seal_recognition: Union[bool, None] = None,
         use_table_recognition: Union[bool, None] = None,
         use_formula_recognition: Union[bool, None] = None,
+        use_chart_recognition: Union[bool, None] = False,
+        use_region_detection: Union[bool, None] = None,
         layout_threshold: Optional[Union[float, dict]] = None,
         layout_nms: Optional[bool] = None,
         layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
@@ -452,7 +914,10 @@ class LayoutParsingPipelineV2(BasePipeline):
         seal_det_box_thresh: Union[float, None] = None,
         seal_det_unclip_ratio: Union[float, None] = None,
         seal_rec_score_thresh: Union[float, None] = None,
-
+        use_wired_table_cells_trans_to_html: bool = False,
+        use_wireless_table_cells_trans_to_html: bool = False,
+        use_table_orientation_classify: bool = True,
+        use_ocr_results_with_table_cells: bool = True,
         use_e2e_wired_table_rec_model: bool = False,
         use_e2e_wireless_table_rec_model: bool = True,
         **kwargs,
@@ -464,10 +929,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
             use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
             use_textline_orientation (Optional[bool]): Whether to use textline orientation prediction.
-            use_general_ocr (Optional[bool]): Whether to use general OCR.
             use_seal_recognition (Optional[bool]): Whether to use seal recognition.
             use_table_recognition (Optional[bool]): Whether to use table recognition.
             use_formula_recognition (Optional[bool]): Whether to use formula recognition.
+            use_region_detection (Optional[bool]): Whether to use region detection.
             layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
             layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
             layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -488,7 +953,10 @@ class LayoutParsingPipelineV2(BasePipeline):
             seal_det_box_thresh (Optional[float]): Threshold for seal detection boxes.
             seal_det_unclip_ratio (Optional[float]): Ratio for unclipping seal detection boxes.
             seal_rec_score_thresh (Optional[float]): Score threshold for seal recognition.
-
+            use_wired_table_cells_trans_to_html (bool): Whether to use wired table cells trans to HTML.
+            use_wireless_table_cells_trans_to_html (bool): Whether to use wireless table cells trans to HTML.
+            use_table_orientation_classify (bool): Whether to use table orientation classification.
+            use_ocr_results_with_table_cells (bool): Whether to use OCR results processed by table cells.
             use_e2e_wired_table_rec_model (bool): Whether to use end-to-end wired table recognition model.
             use_e2e_wireless_table_rec_model (bool): Whether to use end-to-end wireless table recognition model.
             **kwargs (Any): Additional settings to extend functionality.
@@ -496,154 +964,207 @@ class LayoutParsingPipelineV2(BasePipeline):
         Returns:
             LayoutParsingResultV2: The predicted layout parsing result.
         """
-
         model_settings = self.get_model_settings(
             use_doc_orientation_classify,
             use_doc_unwarping,
-            use_general_ocr,
             use_seal_recognition,
             use_table_recognition,
             use_formula_recognition,
+            use_chart_recognition,
+            use_region_detection,
         )

         if not self.check_model_settings_valid(model_settings):
             yield {"error": "the input params for model settings are invalid!"}

         for batch_data in self.batch_sampler(input):
-
+            image_arrays = self.img_reader(batch_data.instances)

             if model_settings["use_doc_preprocessor"]:
-
+                doc_preprocessor_results = list(
                     self.doc_preprocessor_pipeline(
-
+                        image_arrays,
                         use_doc_orientation_classify=use_doc_orientation_classify,
                         use_doc_unwarping=use_doc_unwarping,
-                )
+                    )
                 )
             else:
-
+                doc_preprocessor_results = [{"output_img": arr} for arr in image_arrays]

-
+            doc_preprocessor_images = [
+                item["output_img"] for item in doc_preprocessor_results
+            ]

-
+            layout_det_results = list(
                 self.layout_det_model(
-
+                    doc_preprocessor_images,
                     threshold=layout_threshold,
                     layout_nms=layout_nms,
                     layout_unclip_ratio=layout_unclip_ratio,
                     layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                 )
             )
-            imgs_in_doc =
+            imgs_in_doc = [
+                gather_imgs(img, res["boxes"])
+                for img, res in zip(doc_preprocessor_images, layout_det_results)
+            ]
+
+            if model_settings["use_region_detection"]:
+                region_det_results = list(
+                    self.region_detection_model(
+                        doc_preprocessor_images,
+                        layout_nms=True,
+                        layout_merge_bboxes_mode="small",
+                    ),
+                )
+            else:
+                region_det_results = [{"boxes": []} for _ in doc_preprocessor_images]

             if model_settings["use_formula_recognition"]:
-                formula_res_all =
+                formula_res_all = list(
                     self.formula_recognition_pipeline(
-
+                        doc_preprocessor_images,
                         use_layout_detection=False,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
-                        layout_det_res=
+                        layout_det_res=layout_det_results,
                     ),
                 )
-
+                formula_res_lists = [
+                    item["formula_res_list"] for item in formula_res_all
+                ]
             else:
-
+                formula_res_lists = [[] for _ in doc_preprocessor_images]

-            for
-
-                doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
-
-            if (
-                model_settings["use_general_ocr"]
-                or model_settings["use_table_recognition"]
+            for doc_preprocessor_image, formula_res_list in zip(
+                doc_preprocessor_images, formula_res_lists
             ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = 255.0
+
+            overall_ocr_results = list(
+                self.general_ocr_pipeline(
+                    doc_preprocessor_images,
+                    use_textline_orientation=use_textline_orientation,
+                    text_det_limit_side_len=text_det_limit_side_len,
+                    text_det_limit_type=text_det_limit_type,
+                    text_det_thresh=text_det_thresh,
+                    text_det_box_thresh=text_det_box_thresh,
+                    text_det_unclip_ratio=text_det_unclip_ratio,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                ),
+            )

-            overall_ocr_res
+            for overall_ocr_res in overall_ocr_results:
+                overall_ocr_res["rec_labels"] = ["text"] * len(
+                    overall_ocr_res["rec_texts"]
+                )

             if model_settings["use_table_recognition"]:
-
-                for
-
-
-
-
-
-
-
-
-
-
-
-
-
+                table_res_lists = []
+                for (
+                    layout_det_res,
+                    doc_preprocessor_image,
+                    overall_ocr_res,
+                    formula_res_list,
+                    imgs_in_doc_for_img,
+                ) in zip(
+                    layout_det_results,
+                    doc_preprocessor_images,
+                    overall_ocr_results,
+                    formula_res_lists,
+                    imgs_in_doc,
+                ):
+                    table_contents_for_img = copy.deepcopy(overall_ocr_res)
+                    for formula_res in formula_res_list:
+                        x_min, y_min, x_max, y_max = list(
+                            map(int, formula_res["dt_polys"])
+                        )
+                        poly_points = [
+                            (x_min, y_min),
+                            (x_max, y_min),
+                            (x_max, y_max),
+                            (x_min, y_max),
+                        ]
+                        table_contents_for_img["dt_polys"].append(poly_points)
+                        rec_formula = formula_res["rec_formula"]
+                        if not rec_formula.startswith("$") or not rec_formula.endswith(
+                            "$"
+                        ):
+                            rec_formula = f"${rec_formula}$"
+                        table_contents_for_img["rec_texts"].append(f"{rec_formula}")
+                        if table_contents_for_img["rec_boxes"].size == 0:
if table_contents_for_img["rec_boxes"].size == 0:
|
1098
|
+
table_contents_for_img["rec_boxes"] = np.array(
|
1099
|
+
[formula_res["dt_polys"]]
|
1100
|
+
)
|
1101
|
+
else:
|
1102
|
+
table_contents_for_img["rec_boxes"] = np.vstack(
|
1103
|
+
(
|
1104
|
+
table_contents_for_img["rec_boxes"],
|
1105
|
+
[formula_res["dt_polys"]],
|
1106
|
+
)
|
1107
|
+
)
|
1108
|
+
table_contents_for_img["rec_polys"].append(poly_points)
|
1109
|
+
table_contents_for_img["rec_scores"].append(1)
|
1110
|
+
|
1111
|
+
for img in imgs_in_doc_for_img:
|
1112
|
+
img_path = img["path"]
|
1113
|
+
x_min, y_min, x_max, y_max = img["coordinate"]
|
1114
|
+
poly_points = [
|
1115
|
+
(x_min, y_min),
|
1116
|
+
(x_max, y_min),
|
1117
|
+
(x_max, y_max),
|
1118
|
+
(x_min, y_max),
|
1119
|
+
]
|
1120
|
+
table_contents_for_img["dt_polys"].append(poly_points)
|
1121
|
+
table_contents_for_img["rec_texts"].append(
|
1122
|
+
f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
|
1123
|
+
)
|
1124
|
+
if table_contents_for_img["rec_boxes"].size == 0:
|
1125
|
+
table_contents_for_img["rec_boxes"] = np.array(
|
1126
|
+
[img["coordinate"]]
|
1127
|
+
)
|
1128
|
+
else:
|
1129
|
+
table_contents_for_img["rec_boxes"] = np.vstack(
|
1130
|
+
(table_contents_for_img["rec_boxes"], img["coordinate"])
|
1131
|
+
)
|
1132
|
+
table_contents_for_img["rec_polys"].append(poly_points)
|
1133
|
+
table_contents_for_img["rec_scores"].append(img["score"])
|
1134
|
+
|
1135
|
+
table_res_all = list(
|
1136
|
+
self.table_recognition_pipeline(
|
1137
|
+
doc_preprocessor_image,
|
1138
|
+
use_doc_orientation_classify=False,
|
1139
|
+
use_doc_unwarping=False,
|
1140
|
+
use_layout_detection=False,
|
1141
|
+
use_ocr_model=False,
|
1142
|
+
overall_ocr_res=table_contents_for_img,
|
1143
|
+
layout_det_res=layout_det_res,
|
1144
|
+
cell_sort_by_y_projection=True,
|
1145
|
+
use_wired_table_cells_trans_to_html=use_wired_table_cells_trans_to_html,
|
1146
|
+
use_wireless_table_cells_trans_to_html=use_wireless_table_cells_trans_to_html,
|
1147
|
+
use_table_orientation_classify=use_table_orientation_classify,
|
1148
|
+
use_ocr_results_with_table_cells=use_ocr_results_with_table_cells,
|
1149
|
+
use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
|
1150
|
+
use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
|
1151
|
+
),
|
594
1152
|
)
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
for img in imgs_in_doc:
|
599
|
-
img_path = img["path"]
|
600
|
-
x_min, y_min, x_max, y_max = img["coordinate"]
|
601
|
-
poly_points = [
|
602
|
-
(x_min, y_min),
|
603
|
-
(x_max, y_min),
|
604
|
-
(x_max, y_max),
|
605
|
-
(x_min, y_max),
|
1153
|
+
single_table_res_lists = [
|
1154
|
+
item["table_res_list"] for item in table_res_all
|
606
1155
|
]
|
607
|
-
|
608
|
-
table_contents["rec_texts"].append(
|
609
|
-
f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
|
610
|
-
)
|
611
|
-
if table_contents["rec_boxes"].size == 0:
|
612
|
-
table_contents["rec_boxes"] = np.array([img["coordinate"]])
|
613
|
-
else:
|
614
|
-
table_contents["rec_boxes"] = np.vstack(
|
615
|
-
(table_contents["rec_boxes"], img["coordinate"])
|
616
|
-
)
|
617
|
-
table_contents["rec_polys"].append(poly_points)
|
618
|
-
table_contents["rec_scores"].append(img["score"])
|
619
|
-
|
620
|
-
table_res_all = next(
|
621
|
-
self.table_recognition_pipeline(
|
622
|
-
doc_preprocessor_image,
|
623
|
-
use_doc_orientation_classify=False,
|
624
|
-
use_doc_unwarping=False,
|
625
|
-
use_layout_detection=False,
|
626
|
-
use_ocr_model=False,
|
627
|
-
overall_ocr_res=table_contents,
|
628
|
-
layout_det_res=layout_det_res,
|
629
|
-
cell_sort_by_y_projection=True,
|
630
|
-
use_table_cells_ocr_results=use_table_cells_ocr_results,
|
631
|
-
use_e2e_wired_table_rec_model=use_e2e_wired_table_rec_model,
|
632
|
-
use_e2e_wireless_table_rec_model=use_e2e_wireless_table_rec_model,
|
633
|
-
),
|
634
|
-
)
|
635
|
-
table_res_list = table_res_all["table_res_list"]
|
1156
|
+
table_res_lists.extend(single_table_res_lists)
|
636
1157
|
else:
|
637
|
-
|
1158
|
+
table_res_lists = [[] for _ in doc_preprocessor_images]
|
638
1159
|
|
639
1160
|
if model_settings["use_seal_recognition"]:
|
640
|
-
seal_res_all =
|
1161
|
+
seal_res_all = list(
|
641
1162
|
self.seal_recognition_pipeline(
|
642
|
-
|
1163
|
+
doc_preprocessor_images,
|
643
1164
|
use_doc_orientation_classify=False,
|
644
1165
|
use_doc_unwarping=False,
|
645
1166
|
use_layout_detection=False,
|
646
|
-
layout_det_res=
|
1167
|
+
layout_det_res=layout_det_results,
|
647
1168
|
seal_det_limit_side_len=seal_det_limit_side_len,
|
648
1169
|
seal_det_limit_type=seal_det_limit_type,
|
649
1170
|
seal_det_thresh=seal_det_thresh,
|
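The rewritten predict() body above moves from one page per iteration to whole batches: each sub-model or sub-pipeline is materialized with list() over the batch of page images, and per-page results are re-associated with zip(). A schematic of that pattern with hypothetical stand-in callables, not the pipeline code itself:

def run_batched(page_images, layout_model, ocr_pipeline):
    # One call per component over the whole batch, mirroring the list(...) calls above.
    layout_results = list(layout_model(page_images))
    ocr_results = list(ocr_pipeline(page_images))
    # Re-associate results page by page, as the zip(...) loops above do.
    for image, layout_res, ocr_res in zip(page_images, layout_results, ocr_results):
        yield {"image": image, "layout": layout_res, "ocr": ocr_res}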
@@ -652,46 +1173,85 @@ class LayoutParsingPipelineV2(BasePipeline):
                         seal_rec_score_thresh=seal_rec_score_thresh,
                     ),
                 )
-
+                seal_res_lists = [item["seal_res_list"] for item in seal_res_all]
             else:
-
+                seal_res_lists = [[] for _ in doc_preprocessor_images]

-
+            for (
+                input_path,
+                page_index,
                 doc_preprocessor_image,
-
-
-
-
-
-
-
-
-
-
-
-
-
+                doc_preprocessor_res,
+                layout_det_res,
+                region_det_res,
+                overall_ocr_res,
+                table_res_list,
+                seal_res_list,
+                formula_res_list,
+                imgs_in_doc_for_img,
+            ) in zip(
+                batch_data.input_paths,
+                batch_data.page_indexes,
+                doc_preprocessor_images,
+                doc_preprocessor_results,
+                layout_det_results,
+                region_det_results,
+                overall_ocr_results,
+                table_res_lists,
+                seal_res_lists,
+                formula_res_lists,
+                imgs_in_doc,
+            ):
+                chart_res_list = []
+                if model_settings["use_chart_recognition"]:
+                    chart_imgs_list = []
+                    for bbox in layout_det_res["boxes"]:
+                        if bbox["label"] == "chart":
+                            x_min, y_min, x_max, y_max = bbox["coordinate"]
+                            chart_img = doc_preprocessor_image[
+                                int(y_min) : int(y_max), int(x_min) : int(x_max), :
+                            ]
+                            chart_imgs_list.append({"image": chart_img})
+
+                    for chart_res_batch in self.chart_recognition_model(
+                        input=chart_imgs_list
+                    ):
+                        chart_res_list.append(chart_res_batch["result"])
+
+                parsing_res_list = self.get_layout_parsing_res(
+                    doc_preprocessor_image,
+                    region_det_res=region_det_res,
+                    layout_det_res=layout_det_res,
+                    overall_ocr_res=overall_ocr_res,
+                    table_res_list=table_res_list,
+                    seal_res_list=seal_res_list,
+                    chart_res_list=chart_res_list,
+                    formula_res_list=formula_res_list,
+                    text_rec_score_thresh=text_rec_score_thresh,
+                )

-
-
-
-
-
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    doc_preprocessor_image[y_min:y_max, x_min:x_max, :] = formula_res[
+                        "input_img"
+                    ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                single_img_res = {
+                    "input_path": input_path,
+                    "page_index": page_index,
+                    "doc_preprocessor_res": doc_preprocessor_res,
+                    "layout_det_res": layout_det_res,
+                    "region_det_res": region_det_res,
+                    "overall_ocr_res": overall_ocr_res,
+                    "table_res_list": table_res_list,
+                    "seal_res_list": seal_res_list,
+                    "chart_res_list": chart_res_list,
+                    "formula_res_list": formula_res_list,
+                    "parsing_res_list": parsing_res_list,
+                    "imgs_in_doc": imgs_in_doc_for_img,
+                    "model_settings": model_settings,
+                }
+                yield LayoutParsingResultV2(single_img_res)

     def concatenate_markdown_pages(self, markdown_list: list) -> tuple:
         """
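One detail in the hunk above: formula regions are whited out before text OCR earlier in the method, and the original pixels are written back from formula_res["input_img"] before the per-page result is yielded, so the returned image is intact. A small NumPy sketch of that mask-and-restore idea, with illustrative shapes and coordinates:

import numpy as np

page = np.full((64, 64, 3), 127, dtype=np.uint8)   # stand-in page image
x_min, y_min, x_max, y_max = 10, 10, 40, 20        # a detected formula box
patch = page[y_min:y_max, x_min:x_max, :].copy()   # keep the original pixels
page[y_min:y_max, x_min:x_max, :] = 255            # mask so OCR ignores the formula
# ... text detection and recognition would run on the masked page here ...
page[y_min:y_max, x_min:x_max, :] = patch          # restore before assembling the result
assert (page == 127).all()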
@@ -747,3 +1307,15 @@ class LayoutParsingPipelineV2(BasePipeline):
         )

         return markdown_texts
+
+
+@pipeline_requires_extra("ocr")
+class LayoutParsingPipelineV2(AutoParallelImageSimpleInferencePipeline):
+    entities = ["PP-StructureV3"]
+
+    @property
+    def _pipeline_cls(self):
+        return _LayoutParsingPipelineV2
+
+    def _get_batch_size(self, config):
+        return config.get("batch_size", 1)