magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
import re
|
2
|
+
|
3
|
+
|
4
|
+
def escape_special_markdown_char(pymu_blocks):
    """
    Escape characters in body text that have a special meaning in Markdown.

    Each of the characters ``*``, backtick, ``~`` and ``$`` found in a span's
    ``text`` is prefixed with a backslash. Spans whose ``_type`` is
    ``'inline-equation'`` or ``'interline-equation'`` are skipped so that
    formula text is not altered. Spans are modified in place.

    Args:
        pymu_blocks: iterable of block dicts; each block has a ``'lines'``
            list, each line a ``'spans'`` list of span dicts.

    Returns:
        The same ``pymu_blocks`` object that was passed in.
    """
    special_chars = ["*", "`", "~", "$"]
    equation_types = ('inline-equation', 'interline-equation')
    for blk in pymu_blocks:
        for line in blk['lines']:
            for span in line['spans']:
                # Decide once per span (not once per special character)
                # whether it has to be left alone.
                if span.get("_type", None) in equation_types:
                    continue
                text = span.get('text')
                if not text:
                    # Missing or empty text: nothing to escape.
                    continue
                for char in special_chars:
                    text = text.replace(char, "\\" + char)
                span['text'] = text

    return pymu_blocks
|
21
|
+
|
22
|
+
|
23
|
+
def ocr_escape_special_markdown_char(content):
    """
    Escape characters in body text that have a special meaning in Markdown.

    Args:
        content: the text to escape.

    Returns:
        A copy of ``content`` in which every ``*``, backtick, ``~`` and ``$``
        is preceded by a backslash.
    """
    special_chars = ["*", "`", "~", "$"]
    # One translate pass replaces all characters at once; the result equals
    # chained replace() calls because no replacement introduces another
    # special character.
    escape_table = str.maketrans({char: "\\" + char for char in special_chars})
    return content.translate(escape_table)
|
magic_pdf/libs/math.py
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
import re
|
2
|
+
from os import path
|
3
|
+
|
4
|
+
from collections import Counter
|
5
|
+
|
6
|
+
from loguru import logger
|
7
|
+
|
8
|
+
# from langdetect import detect
|
9
|
+
import spacy
|
10
|
+
import en_core_web_sm
|
11
|
+
import zh_core_web_sm
|
12
|
+
|
13
|
+
from magic_pdf.libs.language import detect_lang
|
14
|
+
|
15
|
+
|
16
|
+
class NLPModels:
    r"""
    Holds the small English and Chinese spaCy pipelines and offers language
    detection and entity-category detection for short strings.

    How to upload local models to s3:
        - config aws cli:
            doc\SETUP-CLI.md
            doc\setup_cli.sh
            app\config\__init__.py
        - $ cd {local_dir_storing_models}
        - $ ls models
            en_core_web_sm-3.7.1/
            zh_core_web_sm-3.7.0/
        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
        - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
            PRE en_core_web_sm-3.7.1/
            PRE zh_core_web_sm-3.7.0/
    """

    def __init__(self):
        """Record the default model locations and load both small pipelines."""
        # if OS is windows, set "TMP_DIR" to "D:/tmp"

        home_dir = path.expanduser("~")
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"
        # Registry consulted by load_model(): model name -> type and version.
        self.nlp_models = {
            "en_core_web_sm": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_md": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_lg": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "zh_core_web_sm": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_md": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_lg": {
                "type": "spacy",
                "version": "3.7.0",
            },
        }
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """
        Load a registered spaCy model by name.

        Args:
            model_name: key into ``self.nlp_models``.
            model_type: must equal the registered ``"type"``.
            model_version: must equal the registered ``"version"``.

        Returns:
            The loaded pipeline, or ``None`` when the name/type/version does
            not match the registry (an error is logged) or when spaCy does
            not report the model as an installed package.
        """
        spec = self.nlp_models.get(model_name)
        if spec is None or spec["type"] != model_type or spec["version"] != model_version:
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None

        return spacy.load(model_name) if spacy.util.is_package(model_name) else None

    def detect_language(self, text, use_langdetect=False):
        """
        Classify ``text`` as English or Chinese.

        Args:
            text: the string to classify.
            use_langdetect: when true, delegate to ``detect_lang``; anything
                it does not label ``"zh"`` is reported as ``"en"``. When
                false, compare the number of ASCII letters with the number
                of characters in the U+4E00..U+9FFF range.

        Returns:
            ``"en"``, ``"zh"``, or ``None`` for empty input or (character
            counting only) when both counts are equal.
        """
        if not text:
            return None

        if use_langdetect:
            return "zh" if detect_lang(text) == "zh" else "en"

        en_count = len(re.findall(r"[a-zA-Z]", text))
        cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))

        if en_count > cn_count:
            return "en"
        if cn_count > en_count:
            return "zh"
        # Tie (including "no letters at all"): undecided.
        return None

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float
            Minimum share of entity words that the dominant entity label
            must hold for it to be returned.

        Returns
        -------
        str or None or dict
            The dominant entity label. ``None`` when no entity words are
            found, when entity words make up less than half of the
            non-punctuation tokens, when the dominant label's share is below
            ``threshold``, or when the model raises (the error is logged).
            An empty dict when the language could not be determined; that
            value is kept as-is for existing callers.
        """
        # Entity words must cover at least this fraction of all words.
        min_entity_coverage = 0.5

        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            return {}

        # Splitting text into smaller parts
        text_parts = re.split(r"[,;,;、\s & |]+", text)

        # Drop parts that start with digits or non-word characters.
        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)

            # Count whitespace-separated words per entity label.
            word_counts_in_entities = Counter()
            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = len([token for token in doc if not token.is_punct])

            if total_words_in_entities == 0 or total_words == 0:
                return None

            if total_words_in_entities / total_words < min_entity_coverage:
                return None

            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            dominant_share = word_count / total_words_in_entities

            return most_common_entity if dominant_share >= threshold else None
        except Exception as e:
            # Best effort: a failing model must not break the caller.
            logger.error(f"Error in entity detection: {e}")
            return None
|
162
|
+
|
163
|
+
|
164
|
+
def __main__():
    """Manual smoke test: print the detected entity category for a set of sample strings."""
    detector = NLPModels()

    samples = [
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills & William Barry & Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    ]

    for sample in samples:
        # Blank line between cases keeps the console output readable.
        print()
        print(f"Original String: {sample}")

        detected = detector.detect_entity_catgr_using_nlp(sample)
        print(f"Detected entities: {detected}")


if __name__ == "__main__":
    __main__()
|
@@ -0,0 +1,21 @@
|
|
1
|
+
class ContentType:
    """String constants naming the kinds of content a span can hold."""

    # Non-text content.
    Image = "image"
    Table = "table"

    # Plain text.
    Text = "text"

    # Formulas: embedded in a text line vs. standing on their own.
    InlineEquation = "inline_equation"
    InterlineEquation = "interline_equation"
|
7
|
+
|
8
|
+
class BlockType:
    """String constants naming the kinds of layout block."""

    # Image block and its parts.
    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"

    # Table block and its parts.
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"

    # Textual blocks.
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"

    # Blocks that were dropped.
    Discarded = "discarded"
|
21
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
from s3pathlib import S3Path
|
4
|
+
|
5
|
+
def remove_non_official_s3_args(s3path):
    """
    Strip everything from the first ``?`` onwards.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    base, _sep, _query = s3path.partition("?")
    return base
|
11
|
+
|
12
|
+
def parse_s3path(s3path: str):
    """
    Split an S3 path into its bucket and key.

    Any ``?...`` suffix is removed first via ``remove_non_official_s3_args``.

    Returns:
        A ``(bucket, key)`` tuple taken from the parsed ``S3Path``.
    """
    cleaned = remove_non_official_s3_args(s3path)
    location = S3Path(cleaned)
    return location.bucket, location.key
|
15
|
+
|
16
|
+
def parse_s3_range_params(s3path: str):
    """
    Extract the byte-range arguments that follow ``?bytes=``.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ["0", "81350"]

    Returns:
        The comma-separated parts as a list of strings (they are not
        converted to numbers), or ``None`` when the path has no
        ``?bytes=`` marker.
    """
    pieces = s3path.split("?bytes=")
    return pieces[1].split(",") if len(pieces) != 1 else None
|
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
3
|
+
from magic_pdf.libs.commons import fitz
|
4
|
+
from magic_pdf.libs.commons import join_path
|
5
|
+
from magic_pdf.libs.hash_utils import compute_sha256
|
6
|
+
|
7
|
+
|
8
|
+
def cut_image(bbox: tuple, page_num: int, page: fitz.Page, return_path, imageWriter: AbsReaderWriter):
    """
    Crop the region ``bbox`` out of ``page`` as a JPEG and store it through ``imageWriter``.

    A logical name ``{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}``
    (bbox values truncated to int) is joined onto ``return_path``; the
    SHA-256 of that string plus ``.jpg`` is the name actually written.

    Returns:
        The hash-based file name passed to ``imageWriter.write``.
    """
    # Build the logical file name from the page number and integer bbox.
    x0, y0, x1, y1 = (int(bbox[i]) for i in range(4))
    filename = f"{page_num}_{x0}_{y0}_{x1}_{y1}"

    # Older versions returned this bucket-less path.
    if return_path is not None:
        img_path = join_path(return_path, filename)
    else:
        # NOTE(review): compute_sha256 then receives None - confirm that the
        # helper accepts it.
        img_path = None

    # Newer versions use a flat, hash-based file name.
    img_hash256_path = f"{compute_sha256(img_path)}.jpg"

    # Render the clipped region at 3x zoom.
    clip_rect = fitz.Rect(*bbox)
    scale = fitz.Matrix(3, 3)
    pixmap = page.get_pixmap(clip=clip_rect, matrix=scale)

    jpeg_bytes = pixmap.tobytes(output='jpeg', jpg_quality=95)

    imageWriter.write(jpeg_bytes, img_hash256_path, AbsReaderWriter.MODE_BIN)

    return img_hash256_path
|
@@ -0,0 +1,33 @@
|
|
1
|
+
import math
|
2
|
+
|
3
|
+
|
4
|
+
def __inc_dict_val(mp, key, val_inc:int):
    """Add ``val_inc`` to ``mp[key]``; a missing or falsy entry is set to ``val_inc``."""
    current = mp.get(key)
    mp[key] = current + val_inc if current else val_inc
|
9
|
+
|
10
|
+
|
11
|
+
|
12
|
+
def get_text_block_base_info(block):
    """
    Return the dominant ``(color, size, font)`` of a text block.

    Every span under ``block['lines'][i]['spans']`` contributes its text
    length to the tally of its ``(color, size rounded to 2 decimals, font)``
    combination; the combination with the highest total wins.

    Raises:
        ValueError: if the block contains no spans (``max`` of an empty tally).
    """
    tally = {}

    spans = (span for line in block['lines'] for span in line['spans'])
    for span in spans:
        style = (span['color'], round(span['size'], 2), span['font'])
        tally[style] = tally.get(style, 0) + len(span['text'])

    color, size, font = max(tally, key=tally.get)
    return color, size, font
|
33
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "0.5.4"
|
@@ -0,0 +1,308 @@
|
|
1
|
+
from magic_pdf.libs.commons import fitz
|
2
|
+
import os
|
3
|
+
|
4
|
+
|
5
|
+
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
    """
    Draw the bounding boxes of every page entry onto new pages and save them to ``save_path``.

    If ``save_path`` already exists it is opened and saved incrementally;
    otherwise a new PDF is created. For each value in ``paras_dict`` one
    page is appended, sized like ``raw_pdf_doc[page_idx]``, and these boxes
    are drawn on it:

    * ``preproc_blocks``    - blue fill, 0.2 opacity
    * ``images``            - yellow fill
    * ``image_backup``      - yellow outline only
    * ``droped_text_block`` - black fill, 0.4 opacity
    * ``tables``            - green fill, 0.2 opacity

    Args:
        raw_pdf_doc: source document, used only for page dimensions.
        paras_dict: mapping whose values carry ``page_idx`` and the box
            lists named above (each item has a ``'bbox'``).
        save_path: path of the PDF to create or extend.
    """
    def draw_rect(page, bbox, **finish_kwargs):
        # One shape per rectangle, committed immediately.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(bbox))
        shape.finish(**finish_kwargs)
        shape.commit()

    # Open the existing PDF if there is one, otherwise start an empty one.
    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    try:
        color_map = {
            'image': fitz.pdfcolor["yellow"],
            'text': fitz.pdfcolor['blue'],
            "table": fitz.pdfcolor['green']
        }
        # (key in the page entry, keyword arguments for Shape.finish)
        box_styles = [
            ('preproc_blocks', dict(color=None, fill=color_map['text'], fill_opacity=0.2)),
            ('images', dict(color=None, fill=color_map['image'])),
            ('image_backup', dict(color=color_map['image'], fill=None)),
            ('droped_text_block', dict(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)),
            # TODO table
            ('tables', dict(color=None, fill=color_map['table'], fill_opacity=0.2)),
        ]

        for v in paras_dict.values():
            page_idx = v['page_idx']
            source_rect = raw_pdf_doc[page_idx].rect
            new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

            for key, style in box_styles:
                for item in v[key]:
                    draw_rect(new_page, item['bbox'], **style)

        # dirname is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(save_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
86
|
+
|
87
|
+
|
88
|
+
def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list, expect_drop_bboxes:list, save_path: str, expected_page_id:int):
    """
    Write a temporary one-page PDF for debugging, overwriting ``save_path``.

    Nothing happens unless ``page_idx == expected_page_id``. The page is
    sized like ``raw_pdf_doc[page_idx]`` and shows:

    * ``bboxes``             - red outline, blue fill, 0.2 opacity
    * ``droped_bboxes``      - yellow fill, 0.2 opacity
    * ``expect_drop_bboxes`` - red outline only

    Only the first four values of each bbox are used.
    """
    if page_idx!=expected_page_id:
        return

    def draw_rect(page, bbox, **finish_kwargs):
        # One shape per rectangle, committed immediately.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(*bbox[0:4]))
        shape.finish(**finish_kwargs)
        shape.commit()

    if os.path.exists(save_path):
        # Remove the file left by a previous run.
        os.remove(save_path)
    # Start from an empty PDF.
    doc = fitz.open('')

    try:
        source_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=source_rect.width, height=source_rect.height)

        for bbox in bboxes:
            draw_rect(new_page, bbox, color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)

        for bbox in droped_bboxes:
            draw_rect(new_page, bbox, color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)

        for bbox in expect_drop_bboxes:
            draw_rect(new_page, bbox, color=fitz.pdfcolor['red'], fill=None)

        # dirname is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
144
|
+
|
145
|
+
|
146
|
+
def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """
    Write a one-page debug PDF to ``./tmp/debug.pdf``, replacing any existing file.

    The page is sized like ``page`` and shows:

    * ``bboxes1`` - red outline, blue fill, 0.2 opacity
    * ``bboxes2`` - yellow fill, 0.2 opacity
    * ``bboxes3`` - red outline only

    Only the first four values of each bbox are used.
    """
    def draw_rect(target_page, bbox, **finish_kwargs):
        # One shape per rectangle, committed immediately.
        shape = target_page.new_shape()
        shape.draw_rect(fitz.Rect(*bbox[0:4]))
        shape.finish(**finish_kwargs)
        shape.commit()

    save_path = "./tmp/debug.pdf"
    if os.path.exists(save_path):
        # Remove the file left by a previous run.
        os.remove(save_path)
    # Start from an empty PDF.
    doc = fitz.open('')

    try:
        new_page = doc.new_page(width=page.rect.width, height=page.rect.height)

        for bbox in bboxes1:
            draw_rect(new_page, bbox, color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)

        for bbox in bboxes2:
            draw_rect(new_page, bbox, color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)

        for bbox in bboxes3:
            draw_rect(new_page, bbox, color=fitz.pdfcolor['red'], fill=None)

        # exist_ok avoids failing if the directory appears between check and create.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
192
|
+
|
193
|
+
|
194
|
+
|
195
|
+
|
196
|
+
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
    """
    Draw layout boxes, their order numbers and the header/footer areas onto the pages of ``pdf_path``.

    For every value in ``paras_dict`` the page ``doc[page_idx]`` receives a
    red outline for each entry of ``layout_bboxes`` (pink fill when its
    ``layout_label`` is ``'U'``) with the entry's index printed in its
    top-left corner. ``header`` and ``footer``, when truthy, are drawn as
    black rectangles with 0.2 opacity on each of those pages.

    Args:
        raw_pdf_doc: not used by this function.
        paras_dict: mapping whose values carry ``page_idx`` and
            ``layout_bboxes`` (items with ``layout_bbox`` and ``layout_label``).
        header: rectangle of the header area, or a falsy value.
        footer: rectangle of the footer area, or a falsy value.
        pdf_path: PDF to draw on; saved incrementally when it already exists.

    NOTE(review): when ``pdf_path`` does not exist an empty document is
    opened and no pages are added before ``doc[page_idx]`` is read - confirm
    whether that path is ever expected to work.
    """
    # Open the existing PDF if there is one, otherwise start an empty one.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    try:
        border_offset = 1
        font_size = 10

        for v in paras_dict.values():
            page_idx = v['page_idx']
            layouts = v['layout_bboxes']
            page = doc[page_idx]
            shape = page.new_shape()
            for order, layout in enumerate(layouts):
                rect_box = layout['layout_bbox']
                layout_label = layout['layout_label']
                fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
                # Nudge the outline: 1pt inwards horizontally, border_offset outwards vertically.
                rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
                shape.draw_rect(fitz.Rect(*rect_box))
                shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
                # Draw the order number on the layout box.
                shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))

            # Draw footer and header.
            if header:
                shape.draw_rect(fitz.Rect(header))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
            if footer:
                shape.draw_rect(fitz.Rect(footer))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

            shape.commit()

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
245
|
+
|
246
|
+
|
247
|
+
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Draw the layout boxes with a red outline on page ``page_idx`` of ``pdf_path``.

    Deprecated. Previously the function was decorated with
    ``@DeprecationWarning``, which replaced it with a non-callable exception
    instance; it now stays callable and emits a ``DeprecationWarning``.

    Only leaf layouts (empty ``sub_layout``) are outlined; a leaf labelled
    ``'U'`` is filled (yellow for top-level entries, pink for nested ones).

    Args:
        raw_pdf_doc: not used by this function.
        page_idx: index of the page to draw on.
        page_layout: list of layout dicts with ``layout_bbox``,
            ``layout_label`` and ``sub_layout``.
        pdf_path: PDF to draw on; saved incrementally when it already exists.

    NOTE(review): when ``pdf_path`` does not exist an empty document is
    opened and no page is added before ``doc[page_idx]`` is read - confirm
    whether that path is ever expected to work.
    """
    import warnings

    warnings.warn("draw_layout_on_page is deprecated", DeprecationWarning, stacklevel=2)

    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        border_offset = 1
        rect_box = layout['layout_bbox']
        layout_label = layout['layout_label']
        sub_layouts = layout['sub_layout']
        if len(sub_layouts)==0:
            fill_color = fill_color if layout_label=='U' else None
            # Nudge the outline: 1pt inwards horizontally, border_offset outwards vertically.
            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
            rect = fitz.Rect(*rect_box)
            shape.draw_rect(rect)
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)

        # Nested layouts always use the default fill colour.
        for child in sub_layouts:
            draw(shape, child)
        shape.commit()

    # Open the existing PDF if there is one, otherwise start an empty one.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    try:
        page = doc[page_idx]
        shape = page.new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor['yellow'])

        # dirname is '' for a bare file name; makedirs('') would raise.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
308
|
+
|
File without changes
|