paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +29 -73
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/ts/funcs.py +19 -8
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +8 -2
- paddlex/inference/models/formula_recognition/processors.py +90 -77
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/models/text_recognition/result.py +1 -1
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +21 -18
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_app.py +46 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +30 -16
- paddlex/inference/utils/hpi_model_info_collection.json +666 -162
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/misc.py +20 -0
- paddlex/inference/utils/mkldnn_blocklist.py +59 -0
- paddlex/inference/utils/official_models.py +140 -5
- paddlex/inference/utils/pp_option.py +74 -9
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +8 -5
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +3 -3
- paddlex/utils/device.py +5 -13
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +11 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- paddlex/utils/subclass_register.py +2 -2
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
- {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/pipelines/layout_parsing/result.py

@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import copy
-from pathlib import Path
 from typing import Dict
 
 import numpy as np
@@ -31,15 +30,6 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         HtmlMixin.__init__(self)
         XlsxMixin.__init__(self)
 
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
-
     def _to_img(self) -> Dict[str, np.ndarray]:
         res_img_dict = {}
         model_settings = self["model_settings"]
@@ -47,12 +37,11 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         res_img_dict.update(**self["doc_preprocessor_res"].img)
         res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"]
 
-
-        res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
+        res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
 
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             table_cell_img = Image.fromarray(
-                copy.deepcopy(self["doc_preprocessor_res"]["output_img"])
+                copy.deepcopy(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
             )
             table_draw = ImageDraw.Draw(table_cell_img)
             rectangle_color = (255, 0, 0)
@@ -106,8 +95,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
         data["layout_det_res"] = self["layout_det_res"].str["res"]
-
-        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
@@ -149,8 +137,7 @@ class LayoutParsingResult(BaseCVResult, HtmlMixin, XlsxMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["layout_det_res"] = self["layout_det_res"].json["res"]
-
-        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
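The `[:, :, ::-1]` added above matters because the preprocessor output appears to follow OpenCV's BGR channel order, while `PIL.Image.fromarray` interprets the array as RGB; reversing the last axis keeps the table-cell overlay colors correct. A minimal standalone sketch of the effect (the array contents are made up for illustration):

```python
import numpy as np
from PIL import Image

# A 4x4 array that is pure blue in OpenCV's BGR convention.
bgr = np.zeros((4, 4, 3), dtype=np.uint8)
bgr[..., 0] = 255  # channel 0 is blue in BGR order

rgb = bgr[:, :, ::-1]        # same fix as in the patch: BGR -> RGB
img = Image.fromarray(rgb)   # PIL now sees the intended colors
print(img.getpixel((0, 0)))  # (0, 0, 255) -> blue, not red
```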
paddlex/inference/pipelines/layout_parsing/result_v2.py

@@ -15,11 +15,13 @@ from __future__ import annotations
 
 import copy
 import re
-from
+from functools import partial
+from typing import List
 
 import numpy as np
-from PIL import Image, ImageDraw
+from PIL import Image, ImageDraw, ImageFont
 
+from ....utils.fonts import PINGFANG_FONT_FILE_PATH
 from ...common.result import (
     BaseCVResult,
     HtmlMixin,
@@ -27,7 +29,115 @@ from ...common.result import (
     MarkdownMixin,
     XlsxMixin,
 )
-from .
+from .layout_objects import LayoutBlock
+from .utils import get_seg_flag
+
+
+def compile_title_pattern():
+    # Precompiled regex pattern for matching numbering at the beginning of the title
+    numbering_pattern = (
+        r"(?:" + r"[1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?|" + r"[\(\(](?:[1-9][0-9]*|["
+        r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+)[\)\)]|" + r"["
+        r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+"
+        r"[、\.]?|" + r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)\.?" + r")"
+    )
+    return re.compile(r"^\s*(" + numbering_pattern + r")(\s*)(.*)$")
+
+
+TITLE_RE_PATTERN = compile_title_pattern()
+
+
+def format_title_func(block):
+    """
+    Normalize chapter title.
+    Add the '#' to indicate the level of the title.
+    If numbering exists, ensure there's exactly one space between it and the title content.
+    If numbering does not exist, return the original title unchanged.
+
+    :param title: Original chapter title string.
+    :return: Normalized chapter title string.
+    """
+    title = block.content
+    match = TITLE_RE_PATTERN.match(title)
+    if match:
+        numbering = match.group(1).strip()
+        title_content = match.group(3).lstrip()
+        # Return numbering and title content separated by one space
+        title = numbering + " " + title_content
+
+    title = title.rstrip(".")
+    level = (
+        title.count(
+            ".",
+        )
+        + 1
+        if "." in title
+        else 1
+    )
+    return f"#{'#' * level} {title}".replace("-\n", "").replace(
+        "\n",
+        " ",
+    )
+
+
+def format_centered_by_html(string):
+    return (
+        f'<div style="text-align: center;">{string}</div>'.replace(
+            "-\n",
+            "",
+        ).replace("\n", " ")
+        + "\n"
+    )
+
+
+def format_text_plain_func(block):
+    return block.content
+
+
+def format_image_scaled_by_html_func(block, original_image_width):
+    img_tags = []
+    image_path = block.image["path"]
+    image_width = block.image["img"].width
+    scale = int(image_width / original_image_width * 100)
+    img_tags.append(
+        '<img src="{}" alt="Image" width="{}%" />'.format(
+            image_path.replace("-\n", "").replace("\n", " "), scale
+        ),
+    )
+    return "\n".join(img_tags)
+
+
+def format_image_plain_func(block):
+    img_tags = []
+    image_path = block.image["path"]
+    img_tags.append("![]({})".format(image_path.replace("-\n", "").replace("\n", " ")))
+    return "\n".join(img_tags)
+
+
+def format_chart2table_func(block):
+    lines_list = block.content.split("\n")
+    column_num = len(lines_list[0].split("|"))
+    lines_list.insert(1, "|".join(["---"] * column_num))
+    lines_list = [f"|{line}|" for line in lines_list]
+    return "\n".join(lines_list)
+
+
+def simplify_table_func(table_code):
+    return "\n" + table_code.replace("<html>", "").replace("</html>", "").replace(
+        "<body>", ""
+    ).replace("</body>", "")
+
+
+def format_first_line_func(block, templates, format_func, spliter):
+    lines = block.content.split(spliter)
+    for idx in range(len(lines)):
+        line = lines[idx]
+        if line.strip() == "":
+            continue
+        if line.lower() in templates:
+            lines[idx] = format_func(line)
+            break
+    return spliter.join(lines)
 
 
 class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
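The module-level `format_title_func` added above replaces the old per-instance `_build_title_pattern`/title handling and turns a numbered heading into a Markdown heading whose level tracks the numbering depth. A rough standalone sketch of that behavior, reproducing only the plain digit-numbering branch of the real pattern (which also accepts Chinese numerals, parenthesised numbers, and Roman numerals); `normalize_title` is an illustrative name, not part of the package:

```python
import re

# Simplified: only the "1.2.3"-style numbering branch of the real pattern.
SIMPLE_TITLE_RE = re.compile(r"^\s*([1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?)(\s*)(.*)$")

def normalize_title(title: str) -> str:
    m = SIMPLE_TITLE_RE.match(title)
    if m:
        # Rebuild as "numbering + one space + title content".
        title = m.group(1).strip() + " " + m.group(3).lstrip()
    title = title.rstrip(".")
    level = title.count(".") + 1 if "." in title else 1
    return f"#{'#' * level} {title}"

print(normalize_title("2.1   Related Work"))  # ### 2.1 Related Work
print(normalize_title("Introduction"))        # ## Introduction
```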
@@ -40,30 +150,10 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         XlsxMixin.__init__(self)
         MarkdownMixin.__init__(self)
         JsonMixin.__init__(self)
-        self.title_pattern = self._build_title_pattern()
-
-    def _build_title_pattern(self):
-        # Precompiled regex pattern for matching numbering at the beginning of the title
-        numbering_pattern = (
-            r"(?:"
-            + r"[1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]?|"
-            + r"[\(\(](?:[1-9][0-9]*|["
-            r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+)[\)\)]|" + r"["
-            r"一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+"
-            r"[、\.]?|" + r"(?:I|II|III|IV|V|VI|VII|VIII|IX|X)\.?" + r")"
-        )
-        return re.compile(r"^\s*(" + numbering_pattern + r")(\s*)(.*)$")
-
-    def _get_input_fn(self):
-        fn = super()._get_input_fn()
-        if (page_idx := self["page_index"]) is not None:
-            fp = Path(fn)
-            stem, suffix = fp.stem, fp.suffix
-            return f"{stem}_{page_idx}{suffix}"
-        else:
-            return fn
 
     def _to_img(self) -> dict[str, np.ndarray]:
+        from .utils import get_show_color
+
         res_img_dict = {}
         model_settings = self["model_settings"]
         if model_settings["use_doc_preprocessor"]:
@@ -71,12 +161,14 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
                 res_img_dict[key] = value
         res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"]
 
-        if model_settings["
-            res_img_dict["
+        if model_settings["use_region_detection"]:
+            res_img_dict["region_det_res"] = self["region_det_res"].img["res"]
+
+        res_img_dict["overall_ocr_res"] = self["overall_ocr_res"].img["ocr_res_img"]
 
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             table_cell_img = Image.fromarray(
-                copy.deepcopy(self["doc_preprocessor_res"]["output_img"])
+                copy.deepcopy(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
             )
             table_draw = ImageDraw.Draw(table_cell_img)
             rectangle_color = (255, 0, 0)
@@ -101,16 +193,23 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         # for layout ordering image
         image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
         draw = ImageDraw.Draw(image, "RGBA")
-
+        font_size = int(0.018 * int(image.width)) + 2
+        font = ImageFont.truetype(PINGFANG_FONT_FILE_PATH, font_size, encoding="utf-8")
+        parsing_result: List[LayoutBlock] = self["parsing_res_list"]
         for block in parsing_result:
-            bbox = block
-            index = block.
-            label = block
-            fill_color = get_show_color(label)
+            bbox = block.bbox
+            index = block.order_index
+            label = block.label
+            fill_color = get_show_color(label, False)
             draw.rectangle(bbox, fill=fill_color)
             if index is not None:
-                text_position = (bbox[2] + 2, bbox[1] -
-
+                text_position = (bbox[2] + 2, bbox[1] - font_size // 2)
+                if int(image.width) - bbox[2] < font_size:
+                    text_position = (
+                        int(bbox[2] - font_size * 1.1),
+                        bbox[1] - font_size // 2,
+                    )
+                draw.text(text_position, str(index), font=font, fill="red")
 
         res_img_dict["layout_order_res"] = image
 
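For the reworked ordering overlay above, the reading-order index is drawn just outside each block's right edge with a font sized relative to the image width, and shifted inward when it would overflow. A rough standalone sketch of the same placement logic; the boxes, image size, and PIL's default font are made-up stand-ins (the pipeline itself uses the bundled PingFang font via `PINGFANG_FONT_FILE_PATH`):

```python
from PIL import Image, ImageDraw, ImageFont

image = Image.new("RGB", (400, 300), "white")
draw = ImageDraw.Draw(image, "RGBA")            # RGBA mode so fills can be translucent
font_size = int(0.018 * image.width) + 2        # same size heuristic as the patch
font = ImageFont.load_default()                 # stand-in for the bundled font

blocks = [((20, 20, 380, 60), 1), ((20, 80, 200, 140), 2)]  # (bbox, order index)
for bbox, index in blocks:
    draw.rectangle(bbox, fill=(0, 100, 255, 60))            # translucent block fill
    text_position = (bbox[2] + 2, bbox[1] - font_size // 2)
    if image.width - bbox[2] < font_size:                   # not enough room on the right
        text_position = (int(bbox[2] - font_size * 1.1), bbox[1] - font_size // 2)
    draw.text(text_position, str(index), font=font, fill="red")
```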
@@ -134,8 +233,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
         data["layout_det_res"] = self["layout_det_res"].str["res"]
-
-        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].str["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
@@ -176,9 +274,9 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         parsing_res_list = self["parsing_res_list"]
         parsing_res_list = [
             {
-                "block_label": parsing_res
-                "block_content": parsing_res
-                "block_bbox": parsing_res
+                "block_label": parsing_res.label,
+                "block_content": parsing_res.content,
+                "block_bbox": parsing_res.bbox,
             }
             for parsing_res in parsing_res_list
         ]
@@ -186,8 +284,7 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["layout_det_res"] = self["layout_det_res"].json["res"]
-
-        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
+        data["overall_ocr_res"] = self["overall_ocr_res"].json["res"]
         if model_settings["use_table_recognition"] and len(self["table_res_list"]) > 0:
             data["table_res_list"] = []
             for sno in range(len(self["table_res_list"])):
@@ -240,227 +337,144 @@ class LayoutParsingResultV2(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
             res_xlsx_dict[key] = table_res.xlsx["pred"]
         return res_xlsx_dict
 
-    def _to_markdown(self) -> dict:
+    def _to_markdown(self, pretty=True) -> dict:
         """
         Save the parsing result to a Markdown file.
 
+        Args:
+            pretty (Optional[bool]): whether to pretty markdown by HTML, default by True.
+
         Returns:
             Dict
         """
+        original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
 
-        [... 9 removed lines not captured ...]
-            :param title: Original chapter title string.
-            :return: Normalized chapter title string.
-            """
-            match = self.title_pattern.match(title)
-            if match:
-                numbering = match.group(1).strip()
-                title_content = match.group(3).lstrip()
-                # Return numbering and title content separated by one space
-                title = numbering + " " + title_content
-
-            title = title.rstrip(".")
-            level = (
-                title.count(
-                    ".",
-                )
-                + 1
-                if "." in title
-                else 1
-            )
-            return f"#{'#' * level} {title}".replace("-\n", "").replace(
-                "\n",
-                " ",
-            )
-
-        def format_centered_text(key):
-            return (
-                f'<div style="text-align: center;">{block[key]}</div>'.replace(
-                    "-\n",
-                    "",
-                ).replace("\n", " ")
-                + "\n"
+        if pretty:
+            format_text_func = lambda block: format_centered_by_html(
+                format_text_plain_func(block)
+            )
+            format_image_func = lambda block: format_centered_by_html(
+                format_image_scaled_by_html_func(
+                    block,
+                    original_image_width=original_image_width,
                 )
+            )
+        else:
+            format_text_func = lambda block: block.content
+            format_image_func = format_image_plain_func
 
-        [... 4 removed lines not captured ...]
-                '<div style="text-align: center;"><img src="{}" alt="Image" /></div>'.format(
-                    image_path.replace("-\n", "").replace("\n", " "),
-                ),
-            )
-            return "\n".join(img_tags)
-
-        def format_first_line(templates, format_func, spliter):
-            lines = block["block_content"].split(spliter)
-            for idx in range(len(lines)):
-                line = lines[idx]
-                if line.strip() == "":
-                    continue
-                if line.lower() in templates:
-                    lines[idx] = format_func(line)
-                    break
-            return spliter.join(lines)
-
-        def format_table():
-            return "\n" + block["block_content"]
-
-        def get_seg_flag(block, prev_block):
-
-            seg_start_flag = True
-            seg_end_flag = True
-
-            block_box = block["block_bbox"]
-            context_left_coordinate = block_box[0]
-            context_right_coordinate = block_box[2]
-            seg_start_coordinate = block.get("seg_start_coordinate")
-            seg_end_coordinate = block.get("seg_end_coordinate")
-
-            if prev_block is not None:
-                prev_block_bbox = prev_block["block_bbox"]
-                num_of_prev_lines = prev_block.get("num_of_lines")
-                pre_block_seg_end_coordinate = prev_block.get("seg_end_coordinate")
-                prev_end_space_small = (
-                    context_right_coordinate - pre_block_seg_end_coordinate < 10
-                )
-                prev_lines_more_than_one = num_of_prev_lines > 1
-
-                overlap_blocks = context_left_coordinate < prev_block_bbox[2]
-
-                # update context_left_coordinate and context_right_coordinate
-                if overlap_blocks:
-                    context_left_coordinate = min(
-                        prev_block_bbox[0], context_left_coordinate
-                    )
-                    context_right_coordinate = max(
-                        prev_block_bbox[2], context_right_coordinate
-                    )
-                    prev_end_space_small = (
-                        prev_block_bbox[2] - pre_block_seg_end_coordinate < 10
-                    )
-
-                current_start_space_small = (
-                    seg_start_coordinate - context_left_coordinate < 10
-                )
+        if self["model_settings"].get("use_chart_recognition", False):
+            format_chart_func = format_chart2table_func
+        else:
+            format_chart_func = format_image_func
 
-        [... 3 removed lines not captured ...]
-                    and prev_lines_more_than_one
-                ):
-                    seg_start_flag = False
-            else:
-                if seg_start_coordinate - context_left_coordinate < 10:
-                    seg_start_flag = False
-
-            if context_right_coordinate - seg_end_coordinate < 10:
-                seg_end_flag = False
-
-            return seg_start_flag, seg_end_flag
-
-        handlers = {
-            "paragraph_title": lambda: format_title(block["block_content"]),
-            "doc_title": lambda: f"# {block['block_content']}".replace(
-                "-\n",
-                "",
-            ).replace("\n", " "),
-            "table_title": lambda: format_centered_text("block_content"),
-            "figure_title": lambda: format_centered_text("block_content"),
-            "chart_title": lambda: format_centered_text("block_content"),
-            "text": lambda: block["block_content"]
-            .replace("-\n", " ")
-            .replace("\n", " "),
-            "abstract": lambda: format_first_line(
-                ["摘要", "abstract"], lambda l: f"## {l}\n", " "
-            ),
-            "content": lambda: block["block_content"]
-            .replace("-\n", " \n")
-            .replace("\n", " \n"),
-            "image": lambda: format_image("block_image"),
-            "chart": lambda: format_image("block_image"),
-            "formula": lambda: f"$${block['block_content']}$$",
-            "table": format_table,
-            "reference": lambda: format_first_line(
-                ["参考文献", "references"], lambda l: f"## {l}", "\n"
-            ),
-            "algorithm": lambda: block["block_content"].strip("\n"),
-            "seal": lambda: f"Words of Seals:\n{block['block_content']}",
-        }
-        parsing_res_list = obj["parsing_res_list"]
-        markdown_content = ""
-        last_label = None
-        seg_start_flag = None
-        seg_end_flag = None
-        prev_block = None
-        page_first_element_seg_start_flag = None
-        page_last_element_seg_end_flag = None
-        parsing_res_list = sorted(
-            parsing_res_list,
-            key=lambda x: x.get("sub_index", 999),
+        if self["model_settings"].get("use_seal_recognition", False):
+            format_seal_func = lambda block: "\n".join(
+                [format_image_func(block), format_text_func(block)]
             )
-        [... 8 removed lines not captured ...]
+        else:
+            format_seal_func = format_image_func
+
+        if self["model_settings"].get("use_table_recognition", False):
+            if pretty:
+                format_table_func = lambda block: "\n" + format_text_func(
+                    block
+                ).replace("<table>", '<table border="1">')
+            else:
+                format_table_func = lambda block: simplify_table_func(
+                    "\n" + block.content
                 )
-        [... 32 removed lines not captured ...]
+        else:
+            format_table_func = format_image_func
+
+        if self["model_settings"].get("use_formula_recognition", False):
+            format_formula_func = lambda block: f"$${block.content}$$"
+        else:
+            format_formula_func = format_image_func
+
+        handle_funcs_dict = {
+            "paragraph_title": format_title_func,
+            "abstract_title": format_title_func,
+            "reference_title": format_title_func,
+            "content_title": format_title_func,
+            "doc_title": lambda block: f"# {block.content}".replace(
+                "-\n",
+                "",
+            ).replace("\n", " "),
+            "table_title": format_text_func,
+            "figure_title": format_text_func,
+            "chart_title": format_text_func,
+            "vision_footnote": lambda block: block.content.replace(
+                "\n\n", "\n"
+            ).replace("\n", "\n\n"),
+            "text": lambda block: block.content.replace("\n\n", "\n").replace(
+                "\n", "\n\n"
+            ),
+            "abstract": partial(
+                format_first_line_func,
+                templates=["摘要", "abstract"],
+                format_func=lambda l: f"## {l}\n",
+                spliter=" ",
+            ),
+            "content": lambda block: block.content.replace("-\n", " \n").replace(
+                "\n", " \n"
+            ),
+            "image": format_image_func,
+            "chart": format_chart_func,
+            "formula": format_formula_func,
+            "table": format_table_func,
+            "reference": partial(
+                format_first_line_func,
+                templates=["参考文献", "references"],
+                format_func=lambda l: f"## {l}",
+                spliter="\n",
+            ),
+            "algorithm": lambda block: block.content.strip("\n"),
+            "seal": format_seal_func,
+        }
+
+        markdown_content = ""
+        last_label = None
+        seg_start_flag = None
+        seg_end_flag = None
+        prev_block = None
+        page_first_element_seg_start_flag = None
+        page_last_element_seg_end_flag = None
+        markdown_info = {}
+        markdown_info["markdown_images"] = {}
+        for block in self["parsing_res_list"]:
+            seg_start_flag, seg_end_flag = get_seg_flag(block, prev_block)
+
+            label = block.label
+            if block.image is not None:
+                markdown_info["markdown_images"][block.image["path"]] = block.image[
+                    "img"
+                ]
+            page_first_element_seg_start_flag = (
+                seg_start_flag
+                if (page_first_element_seg_start_flag is None)
+                else page_first_element_seg_start_flag
             )
 
-        [... 5 removed lines not captured ...]
+            handle_func = handle_funcs_dict.get(label, None)
+            if handle_func:
+                prev_block = block
+                if label == last_label == "text" and seg_start_flag == False:
+                    markdown_content += handle_func(block)
+                else:
+                    markdown_content += (
+                        "\n\n" + handle_func(block)
+                        if markdown_content
+                        else handle_func(block)
+                    )
+                last_label = label
+                page_last_element_seg_end_flag = seg_end_flag
+
+        markdown_info["markdown_texts"] = markdown_content
         markdown_info["page_continuation_flags"] = (
             page_first_element_seg_start_flag,
             page_last_element_seg_end_flag,
         )
-
-        markdown_info["markdown_images"] = {}
         for img in self["imgs_in_doc"]:
             markdown_info["markdown_images"][img["path"]] = img["img"]
 
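In the reworked `_to_markdown` above, chart blocks are routed to `format_chart2table_func` when `use_chart_recognition` is enabled: the helper takes the pipe-separated text a chart-to-table model emits, inserts a `---` separator row after the header, and wraps every row in leading and trailing pipes so it renders as a Markdown table. A self-contained sketch of that transformation (the function name and the sample rows below are illustrative, not part of the package):

```python
def chart_text_to_markdown_table(content: str) -> str:
    lines = content.split("\n")
    column_num = len(lines[0].split("|"))            # columns in the header row
    lines.insert(1, "|".join(["---"] * column_num))  # Markdown separator row
    return "\n".join(f"|{line}|" for line in lines)  # wrap each row in pipes

print(chart_text_to_markdown_table("Year | Sales\n2023 | 10\n2024 | 12"))
# |Year | Sales|
# |---|---|
# |2023 | 10|
# |2024 | 12|
```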