magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
magic_pdf/libs/nlp_utils.py
DELETED
@@ -1,203 +0,0 @@
|
|
1
|
-
import re
|
2
|
-
from os import path
|
3
|
-
|
4
|
-
from collections import Counter
|
5
|
-
|
6
|
-
from loguru import logger
|
7
|
-
|
8
|
-
# from langdetect import detect
|
9
|
-
import spacy
|
10
|
-
import en_core_web_sm
|
11
|
-
import zh_core_web_sm
|
12
|
-
|
13
|
-
from magic_pdf.libs.language import detect_lang
|
14
|
-
|
15
|
-
|
16
|
-
class NLPModels:
    r"""
    Holds the spaCy English and Chinese small pipelines and uses them to
    classify short text snippets by their dominant named-entity type.

    How to upload local models to s3:
        - config aws cli:
            doc\SETUP-CLI.md
            doc\setup_cli.sh
            app\config\__init__.py
        - $ cd {local_dir_storing_models}
        - $ ls models
            en_core_web_sm-3.7.1/
            zh_core_web_sm-3.7.0/
        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
        - $ aws s3 --profile=p_project_norm ls s3://llm-infra/models/
            PRE en_core_web_sm-3.7.1/
            PRE zh_core_web_sm-3.7.0/
    """

    def __init__(self):
        """Record the default model locations, the supported model table, and load both small pipelines."""
        home_dir = path.expanduser("~")
        self.default_local_path = path.join(home_dir, ".nlp_models")
        self.default_shared_path = "/share/pdf_processor/nlp_models"
        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
        self.default_s3_path = "s3://llm-infra/models"
        # Supported model name -> expected type and version; consulted by load_model().
        self.nlp_models = {
            "en_core_web_sm": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_md": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "en_core_web_lg": {
                "type": "spacy",
                "version": "3.7.1",
            },
            "zh_core_web_sm": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_md": {
                "type": "spacy",
                "version": "3.7.0",
            },
            "zh_core_web_lg": {
                "type": "spacy",
                "version": "3.7.0",
            },
        }
        self.en_core_web_sm_model = en_core_web_sm.load()
        self.zh_core_web_sm_model = zh_core_web_sm.load()

    def load_model(self, model_name, model_type, model_version):
        """
        Load a supported spaCy model by name.

        Parameters
        ----------
        model_name : str
            Key into ``self.nlp_models``.
        model_type : str
            Must equal the registered ``"type"`` of the model.
        model_version : str
            Must equal the registered ``"version"`` of the model.

        Returns
        -------
        object or None
            The result of ``spacy.load(model_name)`` when the model is registered
            with the given type/version and is an installed package; otherwise None
            (an error is logged when the name/type/version is not supported).
        """
        spec = self.nlp_models.get(model_name)
        if spec is None or spec["type"] != model_type or spec["version"] != model_version:
            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
            return None
        return spacy.load(model_name) if spacy.util.is_package(model_name) else None

    def detect_language(self, text, use_langdetect=False):
        """
        Guess whether ``text`` is English or Chinese.

        Parameters
        ----------
        text : str
            Text to inspect.
        use_langdetect : bool
            When true, delegate to ``detect_lang`` and map every result other
            than ``"zh"`` to ``"en"``. When false, compare the number of ASCII
            letters with the number of characters in U+4E00..U+9FFF.

        Returns
        -------
        str or None
            ``"en"`` or ``"zh"``; None for empty text, or (character-count mode
            only) when both counts are equal.
        """
        if not text:
            return None

        if use_langdetect:
            return "zh" if detect_lang(text) == "zh" else "en"

        en_count = len(re.findall(r"[a-zA-Z]", text))
        cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
        if en_count > cn_count:
            return "en"
        if cn_count > en_count:
            return "zh"
        # Tie (including text with neither kind of character): undecided.
        return None

    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
        """
        Detect entity categories using NLP models and return the most frequent entity type.

        Parameters
        ----------
        text : str
            Text to be processed.
        threshold : float
            Minimum share of entity words that the most common entity label
            must account for.

        Returns
        -------
        str or None
            The most frequent entity label, or None when fewer than half of the
            non-punctuation tokens belong to entities, when the top label's share
            is below ``threshold``, or when the model raises. An empty dict is
            returned when no language could be detected (kept from the original
            behaviour for existing callers).
        """
        lang = self.detect_language(text, use_langdetect=True)

        if lang == "en":
            nlp_model = self.en_core_web_sm_model
        elif lang == "zh":
            nlp_model = self.zh_core_web_sm_model
        else:
            return {}

        # Split on separators, then drop parts that start with a digit or a
        # non-word character (re.match only anchors at the start of the part).
        text_parts = re.split(r"[,;,;、\s & |]+", text)
        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]
        text_combined = " ".join(text_parts)

        try:
            doc = nlp_model(text_combined)

            # Whitespace-separated word count per entity label.
            word_counts_in_entities = Counter()
            for ent in doc.ents:
                word_counts_in_entities[ent.label_] += len(ent.text.split())

            total_words_in_entities = sum(word_counts_in_entities.values())
            total_words = len([token for token in doc if not token.is_punct])

            if total_words_in_entities == 0 or total_words == 0:
                return None

            # Entities must cover at least half of the words; this cut-off is
            # fixed and independent of ``threshold``.
            if total_words_in_entities / total_words < 0.5:
                return None

            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
            if word_count / total_words_in_entities >= threshold:
                return most_common_entity
            return None
        except Exception as e:
            logger.error(f"Error in entity detection: {e}")
            return None
|
162
|
-
|
163
|
-
|
164
|
-
def __main__():
    """Run entity-category detection over a fixed set of sample strings and print each result."""
    samples = (
        "张三",
        "张三, 李四,王五; 赵六",
        "John Doe",
        "Jane Smith",
        "Lee, John",
        "John Doe, Jane Smith; Alice Johnson,Bob Lee",
        "孙七, Michael Jordan;赵八",
        "David Smith Michael O'Connor; Kevin ßáçøñ",
        "李雷·韩梅梅, 张三·李四",
        "Charles Robert Darwin, Isaac Newton",
        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
        "John Doe, Jane Smith; Alice Johnson",
        "张三, 李四,王五; 赵六",
        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
        "Rachel Mills & William Barry & Susanne B. Haga",
        "Claire Chabut* and Jean-François Bussières",
        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
        "Changchun",
        "china",
        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
        "Synergistic Effect of Supported Nickel Catalyst with",
        "Intumescent Flame-Retardants on Flame Retardancy",
        "and Thermal Stability of Polypropylene",
    )

    detector = NLPModels()
    for sample in samples:
        print()
        print(f"Original String: {sample}")

        detected = detector.detect_entity_catgr_using_nlp(sample)
        print(f"Detected entities: {detected}")


if __name__ == "__main__":
    __main__()
|
magic_pdf/libs/textbase.py
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
import math
|
2
|
-
|
3
|
-
|
4
|
-
def __inc_dict_val(mp, key, val_inc:int):
    """Add ``val_inc`` to ``mp[key]``; a missing or falsy entry is simply set to ``val_inc``."""
    existing = mp.get(key)
    mp[key] = existing + val_inc if existing else val_inc
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def get_text_block_base_info(block):
    """
    Return the (color, size, font) combination that covers the most text in a block.

    Every span in ``block['lines'][*]['spans']`` contributes ``len(span['text'])``
    characters to its (color, size rounded to 2 decimals, font) combination; the
    combination with the largest total is returned as a 3-tuple. On a tie the
    combination seen first wins.
    """
    char_totals = {}

    for line in block['lines']:
        for span in line['spans']:
            style = (span['color'], round(span['size'], 2), span['font'])
            char_totals[style] = char_totals.get(style, 0) + len(span['text'])

    color, size, font = max(char_totals, key=char_totals.get)
    return color, size, font
|
33
|
-
|
magic_pdf/libs/vis_utils.py
DELETED
@@ -1,308 +0,0 @@
|
|
1
|
-
from magic_pdf.libs.commons import fitz
|
2
|
-
import os
|
3
|
-
|
4
|
-
|
5
|
-
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
    """
    Draw the bboxes recorded in ``paras_dict`` onto new pages and save them to ``save_path``.

    If ``save_path`` already exists it is opened and saved incrementally;
    otherwise a new empty document is created and written to ``save_path``.
    For every value in ``paras_dict`` a new page with the size of
    ``raw_pdf_doc[page_idx]`` is appended, and rectangles are drawn for the
    ``'bbox'`` of each item in ``'preproc_blocks'`` (blue fill), ``'images'``
    (yellow fill), ``'image_backup'`` (yellow outline), ``'droped_text_block'``
    (black fill) and ``'tables'`` (green fill).

    Parameters
    ----------
    raw_pdf_doc : fitz.Document
        Source document; only the page rectangles are read.
    paras_dict : dict
        Values are per-page dicts with the keys listed above plus ``'page_idx'``.
    save_path : str
        Output PDF path. Its parent directory is created when missing.
    """
    def _draw_rects(page, items, **finish_kwargs):
        # One shape per rectangle, committed immediately.
        for item in items:
            shape = page.new_shape()
            shape.draw_rect(fitz.Rect(item['bbox']))
            shape.finish(**finish_kwargs)
            shape.finish()
            shape.commit()

    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    try:
        for page_info in paras_dict.values():
            page_idx = page_info['page_idx']
            src_rect = raw_pdf_doc[page_idx].rect
            new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

            _draw_rects(new_page, page_info['preproc_blocks'],
                        color=None, fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
            _draw_rects(new_page, page_info['images'],
                        color=None, fill=fitz.pdfcolor['yellow'])
            _draw_rects(new_page, page_info['image_backup'],
                        color=fitz.pdfcolor['yellow'], fill=None)
            _draw_rects(new_page, page_info['droped_text_block'],
                        color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
            # TODO table
            _draw_rects(new_page, page_info['tables'],
                        color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)

        # dirname() is "" for a bare file name; os.makedirs("") would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(save_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
86
|
-
|
87
|
-
|
88
|
-
def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list, expect_drop_bboxes:list, save_path: str, expected_page_id:int):
    """
    Write a temporary single-page PDF for debugging, overwriting any existing file.

    Nothing happens unless ``page_idx == expected_page_id``. Otherwise a new
    page with the size of ``raw_pdf_doc[page_idx]`` is created and the first
    four values of every bbox are drawn as a rectangle: ``bboxes`` with a red
    outline and blue fill, ``droped_bboxes`` with a yellow fill, and
    ``expect_drop_bboxes`` with a red outline only.

    Parameters
    ----------
    raw_pdf_doc : fitz.Document
        Source document; only the page rectangle is read.
    page_idx : int
        Index of the page being inspected.
    bboxes, droped_bboxes, expect_drop_bboxes : list
        Sequences whose first four items are the rectangle coordinates.
    save_path : str
        Output PDF path. Its parent directory is created when missing.
    expected_page_id : int
        The only page index for which output is produced.
    """
    if page_idx != expected_page_id:
        return

    def _draw_rects(page, boxes, **finish_kwargs):
        # One shape per rectangle, committed immediately.
        for bbox in boxes:
            shape = page.new_shape()
            shape.draw_rect(fitz.Rect(*bbox[0:4]))
            shape.finish(**finish_kwargs)
            shape.finish()
            shape.commit()

    # Remove the file if it already exists, then start from a new blank PDF.
    if os.path.exists(save_path):
        os.remove(save_path)
    doc = fitz.open('')

    try:
        src_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

        _draw_rects(new_page, bboxes,
                    color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
        _draw_rects(new_page, droped_bboxes,
                    color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
        _draw_rects(new_page, expect_drop_bboxes,
                    color=fitz.pdfcolor['red'], fill=None)

        # dirname() is "" for a bare file name; os.makedirs("") would raise.
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        doc.save(save_path)
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
144
|
-
|
145
|
-
|
146
|
-
def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
    """
    Write ``./tmp/debug.pdf`` with three groups of bboxes drawn on one blank page.

    The page has the size of ``page.rect``. The first four values of every
    bbox form the rectangle: ``bboxes1`` gets a red outline and blue fill,
    ``bboxes2`` a yellow fill, ``bboxes3`` a red outline only. An existing
    output file is deleted first.
    """
    save_path = "./tmp/debug.pdf"

    # Remove the file if it already exists, then start from a new blank PDF.
    if os.path.exists(save_path):
        os.remove(save_path)
    doc = fitz.open('')

    width = page.rect.width
    height = page.rect.height
    new_page = doc.new_page(width=width, height=height)

    # Created but never drawn on; kept to mirror the original call sequence.
    _unused_shape = new_page.new_shape()

    layers = (
        (bboxes1, {'color': fitz.pdfcolor['red'], 'fill': fitz.pdfcolor['blue'], 'fill_opacity': 0.2}),
        (bboxes2, {'color': None, 'fill': fitz.pdfcolor['yellow'], 'fill_opacity': 0.2}),
        (bboxes3, {'color': fitz.pdfcolor['red'], 'fill': None}),
    )
    for boxes, finish_style in layers:
        for box in boxes:
            # Draw the original box.
            outline = new_page.new_shape()
            outline.draw_rect(fitz.Rect(*box[0:4]))
            outline.finish(**finish_style)
            outline.finish()
            outline.commit()

    out_dir = os.path.dirname(save_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    doc.save(save_path)
    doc.close()
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
    """
    Outline the layout boxes of every page in ``paras_dict`` on the PDF at ``pdf_path``.

    For each value in ``paras_dict`` the page ``doc[page_idx]`` receives one
    red rectangle per entry of ``'layout_bboxes'`` (pink fill when the entry's
    ``'layout_label'`` is ``'U'``), labelled with its order index, plus
    translucent black rectangles for ``header`` and ``footer`` when they are
    truthy. An existing file is saved incrementally; otherwise the document is
    written to ``pdf_path``.
    """
    # Check whether the file exists: open it, or start a new blank PDF.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)
    # NOTE(review): pages are fetched with doc[page_idx]; a newly created empty
    # document has no pages - confirm callers always pass an existing PDF.

    for page_info in paras_dict.values():
        page_idx = page_info['page_idx']
        layouts = page_info['layout_bboxes']
        page = doc[page_idx]
        shape = page.new_shape()

        for order, layout in enumerate(layouts):
            border_offset = 1
            raw_box = layout['layout_bbox']
            fill_color = fitz.pdfcolor['pink'] if layout['layout_label'] == 'U' else None
            padded = [
                raw_box[0] + 1,
                raw_box[1] - border_offset,
                raw_box[2] - 1,
                raw_box[3] + border_offset,
            ]
            shape.draw_rect(fitz.Rect(*padded))
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)

            # Draw the order text on the layout box.
            font_size = 10
            shape.insert_text((padded[0] + 1, padded[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))

        # Draw the header and footer areas.
        for band in (header, footer):
            if band:
                shape.draw_rect(fitz.Rect(band))
                shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

        shape.commit()

    if is_new_pdf:
        doc.save(pdf_path)
    else:
        doc.saveIncr()
    doc.close()
|
245
|
-
|
246
|
-
|
247
|
-
def draw_layout_on_page(raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str):
    """
    Deprecated. Draw the layout boxes with red borders on page ``page_idx`` of ``pdf_path``.

    Each layout dict provides ``'layout_bbox'``, ``'layout_label'`` and
    ``'sub_layout'``. Layouts without sub-layouts are outlined in red and, when
    labelled ``'U'``, filled (yellow for top-level entries, pink for nested
    ones); sub-layouts are drawn recursively. An existing file is saved
    incrementally; otherwise the document is written to ``pdf_path``.

    A ``DeprecationWarning`` is emitted on every call. The previous
    ``@DeprecationWarning`` decorator replaced this function with an exception
    instance, which made it impossible to call.
    """
    import warnings

    warnings.warn("draw_layout_on_page is deprecated", DeprecationWarning, stacklevel=2)

    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
        border_offset = 1
        rect_box = layout['layout_bbox']
        layout_label = layout['layout_label']
        sub_layouts = layout['sub_layout']
        if len(sub_layouts) == 0:
            # Only leaf layouts are outlined; the fill is applied to 'U' labels only.
            fill_color = fill_color if layout_label == 'U' else None
            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
            shape.draw_rect(fitz.Rect(*rect_box))
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)

        # Nested layouts use the default (pink) fill.
        for child in sub_layouts:
            draw(shape, child)
        shape.commit()

    # Check whether the file exists: open it, or start a new blank PDF.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)
    # NOTE(review): doc[page_idx] is read below; a newly created empty document
    # has no pages - confirm callers always pass an existing PDF.

    try:
        page = doc[page_idx]
        shape = page.new_shape()
        for layout in page_layout:
            draw(shape, layout, fitz.pdfcolor['yellow'])

        # dirname() is "" for a bare file name; os.makedirs("") would raise.
        parent_dir = os.path.dirname(pdf_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        if is_new_pdf:
            doc.save(pdf_path)
        else:
            doc.saveIncr()
    finally:
        # Release the document even when drawing or saving fails.
        doc.close()
|
308
|
-
|