magic-pdf 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- magic_pdf/cli/magicpdf.py +19 -4
- magic_pdf/filter/pdf_classify_by_type.py +25 -17
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +61 -0
- magic_pdf/model/model_list.py +2 -0
- magic_pdf/model/pp_structure_v2.py +75 -0
- magic_pdf/pipe/OCRPipe.py +1 -1
- magic_pdf/pipe/TXTPipe.py +1 -1
- magic_pdf/pipe/UNIPipe.py +1 -1
- magic_pdf/user_api.py +7 -3
- {magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/METADATA +5 -2
- {magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/RECORD +16 -14
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +0 -125
- /magic_pdf/model/{doc_analyze_by_360layout.py → 360_layout_analysis.py} +0 -0
- {magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py
CHANGED
@@ -31,7 +31,6 @@ from magic_pdf.libs.version import __version__
 
 from magic_pdf.libs.MakeContentConfig import DropMode
 from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
-from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -101,18 +100,34 @@ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer,
     # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
 
     md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+    '''写markdown'''
     md_writer.write(
         content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
     )
+    '''写middle_json'''
     md_writer.write(
         content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-        path=f"{pdf_file_name}.json",
+        path=f"{pdf_file_name}_middle.json",
         mode=AbsReaderWriter.MODE_TXT,
     )
-
+    '''写model_json'''
+    md_writer.write(
+        content=json_parse.dumps(pipe.model_list, ensure_ascii=False, indent=4),
+        path=f"{pdf_file_name}_model.json",
+        mode=AbsReaderWriter.MODE_TXT,
+    )
+    '''写源pdf'''
+    md_writer.write(
+        content=pdf_bytes,
+        path=f"{pdf_file_name}_origin.pdf",
+        mode=AbsReaderWriter.MODE_BIN,
+    )
     content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
+    '''写content_list'''
     md_writer.write(
-
+        content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
+        path=f"{pdf_file_name}_content_list.json",
+        mode=AbsReaderWriter.MODE_TXT
     )
 
 
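Taken together, _do_parse now persists five artifacts per input PDF instead of two. A sketch of the resulting output names, read off the path= arguments above (the prefix is the input file name):

    {pdf_file_name}.md                   markdown render
    {pdf_file_name}_middle.json          intermediate parse data (pipe.pdf_mid_data)
    {pdf_file_name}_model.json           model output (pipe.model_list)
    {pdf_file_name}_origin.pdf           the source pdf bytes
    {pdf_file_name}_content_list.json    unified content list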
magic_pdf/filter/pdf_classify_by_type.py
CHANGED
@@ -21,7 +21,7 @@ from magic_pdf.libs.commons import mymax, get_top_percent_list
 from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min
 
 TEXT_LEN_THRESHOLD = 100
-AVG_TEXT_LEN_THRESHOLD =
+AVG_TEXT_LEN_THRESHOLD = 100
 TEXT_LEN_SAMPLE_RATIO = 0.1  # 抽取0.1的页面进行文字长度统计
 
 
@@ -65,12 +65,14 @@ def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2):
     # 如果宽达标,检测是否能竖着拼
     if full_width:
         # 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap
-        close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
+        close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= (
+                last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap)
 
     # 如果高达标,检测是否可以横着拼
     if full_height:
         # 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap
-        close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
+        close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= (
+                last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap)
 
     # Check if the image can be merged with the last image
     if (full_width and close1) or (full_height and close2):
@@ -109,10 +111,9 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
     # 先对每个id出现的次数做个统计
     objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz])
     # 再去掉出现次数大于10的
-    if total_page >= scan_max_page:
+    if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page
         total_page = scan_max_page
 
-
     repeat_threshold = 2  # 把bad_image的阈值设为2
     # repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img
     bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold])
@@ -129,26 +130,26 @@ def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text
     # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值
     #     return True
 
-    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
-
+    img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in
+                   img_sz_list]  # 过滤掉重复出现的图片
 
     # 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算
     img_sz_list = merge_images(img_sz_list, page_width, page_height)
 
     # 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例
-    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in img_sz_list]
+    max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
+                               img_sz_list]
     page_area = page_width * page_height
     max_image_area_per_page = [area / page_area for area in max_image_area_per_page]
     max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5]
 
-    if len(max_image_area_per_page) >= 0.5 * total_page:
+    if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况
         # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样
         return False
     else:
         return True
 
 
-
 def classify_by_text_len(text_len_list: list, total_page: int):
     """
     随机抽取10%的页面,如果少于5个页面,那么就取全部页面。
@@ -173,6 +174,7 @@ def classify_by_text_len(text_len_list: list, total_page: int):
     is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst])
     return is_text_pdf
 
+
 def classify_by_avg_words(text_len_list: list):
     """
     补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf
@@ -193,6 +195,7 @@ def classify_by_avg_words(text_len_list: list):
 
     return is_text_pdf
 
+
 def classify_by_img_num(img_sz_list: list, img_num_list: list):
     """
     补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重,
@@ -208,11 +211,11 @@ def classify_by_img_num(img_sz_list: list, img_num_list: list):
     # img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min
     if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min:
 
-
-
-
-
-
+        #拿max和min的值,用来判断list内的值是否全都相等
+        # min_imgs = min(img_num_list)
+        # max_imgs = max(img_num_list)
+        #
+        # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min:
         return False  # 如果满足这个条件,一定不是文字版pdf
     else:
         return True  # 不满足这三个条件,可能是文字版pdf,通过其他规则判断
@@ -244,6 +247,7 @@ def classify_by_text_layout(text_layout_per_page: list):
     else:
         return False  # 文本布局未知,默认认为不是文字版pdf
 
+
 def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     """
     判断一页是否由细长条组成,有两个条件:
@@ -258,6 +262,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     Returns:
         bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False
     """
+
     def is_narrow_strip(img):
         x0, y0, x1, y1, _ = img
         width, height = x1 - x0, y1 - y0
@@ -299,7 +304,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
     return narrow_strip_pages_ratio < 0.5
 
 
-def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list, text_layout_list: list):
+def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
+             text_layout_list: list):
     """
     这里的图片和页面长度单位是pts
     :param total_page:
@@ -324,7 +330,9 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
     elif not any(results.values()):
         return False, results
     else:
-        logger.warning(
+        logger.warning(
+            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}, by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}, by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}",
+            file=sys.stderr)  # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
         return False, results
 
 
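The final hunk shows only the elif/else arms of classify's decision. A condensed sketch of the full three-way vote (the leading all(...) branch is inferred from the surrounding code, not shown in this diff):

if all(results.values()):          # inferred: every heuristic agrees it is a text pdf
    return True, results
elif not any(results.values()):    # every heuristic agrees it is not
    return False, results
else:                              # mixed votes: log the per-heuristic results for triage,
    logger.warning(...)            # then conservatively treat the pdf as non-text
    return False, results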
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.5.4"
+__version__ = "0.5.5"
magic_pdf/model/doc_analyze_by_custom_model.py
ADDED
@@ -0,0 +1,61 @@
+import fitz
+import cv2
+from PIL import Image
+import numpy as np
+
+from magic_pdf.model.model_list import MODEL
+from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
+
+
+def dict_compare(d1, d2):
+    return d1.items() == d2.items()
+
+
+def remove_duplicates_dicts(lst):
+    unique_dicts = []
+    for dict_item in lst:
+        if not any(
+            dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
+        ):
+            unique_dicts.append(dict_item)
+    return unique_dicts
+
+
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
+    images = []
+    with fitz.open("pdf", pdf_bytes) as doc:
+        for index in range(0, doc.page_count):
+            page = doc[index]
+            mat = fitz.Matrix(dpi / 72, dpi / 72)
+            pm = page.get_pixmap(matrix=mat, alpha=False)
+
+            # if width or height > 2000 pixels, don't enlarge the image
+            # if pm.width > 2000 or pm.height > 2000:
+            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+            img_dict = {"img": img, "width": pm.width, "height": pm.height}
+            images.append(img_dict)
+    return images
+
+
+def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
+    images = load_images_from_pdf(pdf_bytes)
+    custom_model = None
+    if model == MODEL.Paddle:
+        custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
+    else:
+        pass
+    model_json = []
+    for index, img_dict in enumerate(images):
+        img = img_dict["img"]
+        page_width = img_dict["width"]
+        page_height = img_dict["height"]
+        result = custom_model(img)
+        page_info = {"page_no": index, "height": page_height, "width": page_width}
+        page_dict = {"layout_dets": result, "page_info": page_info}
+
+        model_json.append(page_dict)
+
+    return model_json
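A minimal usage sketch for the new entry point (the input path here is hypothetical; doc_analyze renders each page at 200 dpi by default and returns one dict per page):

from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

with open("example.pdf", "rb") as f:  # hypothetical input file
    pdf_bytes = f.read()

model_json = doc_analyze(pdf_bytes, ocr=False, show_log=False)
for page in model_json:
    # each entry pairs the layout detections with the page geometry
    print(page["page_info"])  # e.g. {"page_no": 0, "height": ..., "width": ...}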
magic_pdf/model/pp_structure_v2.py
ADDED
@@ -0,0 +1,75 @@
+import random
+
+from loguru import logger
+from paddleocr import PPStructure
+
+
+def region_to_bbox(region):
+    x0 = region[0][0]
+    y0 = region[0][1]
+    x1 = region[2][0]
+    y1 = region[2][1]
+    return [x0, y0, x1, y1]
+
+
+class CustomPaddleModel:
+    def __init__(self, ocr: bool = False, show_log: bool = False):
+        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+
+    def __call__(self, img):
+        result = self.model(img)
+        spans = []
+        for line in result:
+            line.pop("img")
+            """
+            为paddle输出适配type no.
+            title: 0 # 标题
+            text: 1 # 文本
+            header: 2 # abandon
+            footer: 2 # abandon
+            reference: 1 # 文本 or abandon
+            equation: 8 # 行间公式 block
+            equation: 14 # 行间公式 text
+            figure: 3 # 图片
+            figure_caption: 4 # 图片描述
+            table: 5 # 表格
+            table_caption: 6 # 表格描述
+            """
+            if line["type"] == "title":
+                line["category_id"] = 0
+            elif line["type"] in ["text", "reference"]:
+                line["category_id"] = 1
+            elif line["type"] == "figure":
+                line["category_id"] = 3
+            elif line["type"] == "figure_caption":
+                line["category_id"] = 4
+            elif line["type"] == "table":
+                line["category_id"] = 5
+            elif line["type"] == "table_caption":
+                line["category_id"] = 6
+            elif line["type"] == "equation":
+                line["category_id"] = 8
+            elif line["type"] in ["header", "footer"]:
+                line["category_id"] = 2
+            else:
+                logger.warning(f"unknown type: {line['type']}")
+
+            # 兼容不输出score的paddleocr版本
+            if line.get("score") is None:
+                line["score"] = 0.5 + random.random() * 0.5
+
+            res = line.pop("res", None)
+            if res is not None and len(res) > 0:
+                for span in res:
+                    new_span = {
+                        "category_id": 15,
+                        "bbox": region_to_bbox(span["text_region"]),
+                        "score": span["confidence"],
+                        "text": span["text"],
+                    }
+                    spans.append(new_span)
+
+        if len(spans) > 0:
+            result.extend(spans)
+
+        return result
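The wrapper can also be driven directly with a page image. A sketch (the image path is hypothetical; PPStructure expects a BGR ndarray, which is also what load_images_from_pdf produces):

import cv2
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel

model = CustomPaddleModel(ocr=True, show_log=False)
img = cv2.imread("page.png")  # hypothetical rendered page, BGR ndarray
layout_dets = model(img)
# every block now carries a numeric category_id per the mapping above; with
# ocr=True, recognized text lines are appended as spans with category_id 15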
magic_pdf/pipe/OCRPipe.py
CHANGED
@@ -1,5 +1,5 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
-from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.pipe.AbsPipe import AbsPipe
 from magic_pdf.user_api import parse_ocr_pdf
magic_pdf/pipe/TXTPipe.py
CHANGED
@@ -1,5 +1,5 @@
 from magic_pdf.libs.MakeContentConfig import DropMode
-from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.pipe.AbsPipe import AbsPipe
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -3,7 +3,7 @@ import json
 from loguru import logger
 
 from magic_pdf.libs.MakeContentConfig import DropMode
-from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.commons import join_path
magic_pdf/user_api.py
CHANGED
@@ -16,7 +16,7 @@ import re
 from loguru import logger
 
 from magic_pdf.libs.version import __version__
-from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.rw import AbsReaderWriter
 from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
 from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
         return garbage_count / total
 
     def calculate_not_printable_rate(text):
-
+        printable_text = ""
+        for c in text:
+            if c.isprintable():
+                printable_text += c
+        printable_total = len(printable_text)
         total = len(text)
         if total == 0:
             return 0  # 避免除以零的错误
-        return (total -
+        return (total - printable_total) / total
 
     not_common_character_rate = calculate_not_common_character_rate(text_all)
     not_printable_rate = calculate_not_printable_rate(text_all)
{magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.5.4
+Version: 0.5.5
 Requires-Python: >=3.9
 License-File: LICENSE.md
 Requires-Dist: boto3 >=1.28.43
@@ -19,6 +19,9 @@ Requires-Dist: wordninja >=2.0.0
 Requires-Dist: scikit-learn >=1.0.2
 Requires-Dist: nltk ==3.8.1
 Requires-Dist: s3pathlib >=2.1.1
-Requires-Dist: paddlepaddle
 Requires-Dist: paddleocr
+Provides-Extra: cpu
+Requires-Dist: paddlepaddle ; extra == 'cpu'
+Provides-Extra: gpu
+Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
 
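Moving paddlepaddle behind extras makes the compute backend opt-in at install time: pip install magic-pdf[cpu] pulls paddlepaddle, pip install magic-pdf[gpu] pulls paddlepaddle-gpu, and paddleocr itself remains an unconditional dependency.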
{magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/RECORD
CHANGED
@@ -5,14 +5,14 @@ magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y
 magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
 magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
 magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
-magic_pdf/user_api.py,sha256=
+magic_pdf/user_api.py,sha256=VWle9vV5DSHOV1pgFkIVRnKGcKibj2-OWsoDvVzoiaE,4803
 magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/cli/magicpdf.py,sha256=
+magic_pdf/cli/magicpdf.py,sha256=IoyuWsnJp5lLDS4G9brtCqNdIWKb57Ini4uftkCl2Mg,11357
 magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
 magic_pdf/dict2md/ocr_mkcontent.py,sha256=Y0nFbCX7zSVLq-vQqJvR8azumd0003ixrk5wy0vIJxU,15068
 magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/filter/pdf_classify_by_type.py,sha256=
+magic_pdf/filter/pdf_classify_by_type.py,sha256=FG2ItqYErRQ8sSaA2xhENMX4vTrOsA2FcDX_LdnMu9c,42158
 magic_pdf/filter/pdf_meta_scan.py,sha256=KLih7jfVqABhdeZ9tAu9-WZm0W0wX-PKCws4mFBGtYk,17001
 magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
@@ -45,12 +45,14 @@ magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,
 magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
 magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
 magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
-magic_pdf/libs/version.py,sha256=
+magic_pdf/libs/version.py,sha256=78mfpLewKVki6c9UONSUdlVme_JsN9ZwIfp4Hf4jmG0,22
 magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
+magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
 magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/model/
-magic_pdf/model/doc_analyze_by_pp_structurev2.py,sha256=ry2sLGt10ShgvHZvhpf_QA0QGG9kXRdoAsYmxLNcPWE,4082
+magic_pdf/model/doc_analyze_by_custom_model.py,sha256=QD4NWEIz8UXdIG4V_3P8EaYesxk6PvC1SOtTWEy2GEY,2007
 magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
+magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
+magic_pdf/model/pp_structure_v2.py,sha256=qsyt9vFDGaVizBMiSaeFVHTDsJTrIHx46Ec2J8SOj1A,2469
 magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
 magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -66,9 +68,9 @@ magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb
 magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
 magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
 magic_pdf/pipe/AbsPipe.py,sha256=jUngTfYeVeltp03QwTcZvmBYghTgA5Gd7SdZSsFUr0o,3932
-magic_pdf/pipe/OCRPipe.py,sha256=
-magic_pdf/pipe/TXTPipe.py,sha256=
-magic_pdf/pipe/UNIPipe.py,sha256=
+magic_pdf/pipe/OCRPipe.py,sha256=iKnNveVfsrBGl_2Xtd4hAAS5HntYyjwfBeVIKGc8V5U,1196
+magic_pdf/pipe/TXTPipe.py,sha256=R0UzMZ7Z_59Vh7cPdBAO4gvHtgA5wLoODnCPnpEjbPM,1255
+magic_pdf/pipe/UNIPipe.py,sha256=47a9jx1a_zO4m3sVnhcOnrmNc_QT-TI-9mv2x7L6SrQ,3507
 magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
@@ -114,8 +116,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
 magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
 magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
 magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
-magic_pdf-0.5.
-magic_pdf-0.5.
-magic_pdf-0.5.
-magic_pdf-0.5.
-magic_pdf-0.5.
+magic_pdf-0.5.5.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+magic_pdf-0.5.5.dist-info/METADATA,sha256=AUcJHTtNxJBaHZOd-m2OTcIUBEescc3KbXXK2f6_kTE,775
+magic_pdf-0.5.5.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+magic_pdf-0.5.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
+magic_pdf-0.5.5.dist-info/RECORD,,
magic_pdf/model/doc_analyze_by_pp_structurev2.py
DELETED
@@ -1,125 +0,0 @@
-import random
-
-import fitz
-import cv2
-from paddleocr import PPStructure
-from PIL import Image
-from loguru import logger
-import numpy as np
-
-def region_to_bbox(region):
-    x0 = region[0][0]
-    y0 = region[0][1]
-    x1 = region[2][0]
-    y1 = region[2][1]
-    return [x0, y0, x1, y1]
-
-
-def dict_compare(d1, d2):
-    return d1.items() == d2.items()
-
-
-def remove_duplicates_dicts(lst):
-    unique_dicts = []
-    for dict_item in lst:
-        if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
-            unique_dicts.append(dict_item)
-    return unique_dicts
-def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
-    ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)
-
-    imgs = []
-    with fitz.open("pdf", pdf_bytes) as doc:
-        for index in range(0, doc.page_count):
-            page = doc[index]
-            dpi = 200
-            mat = fitz.Matrix(dpi / 72, dpi / 72)
-            pm = page.get_pixmap(matrix=mat, alpha=False)
-
-            # if width or height > 2000 pixels, don't enlarge the image
-            # if pm.width > 2000 or pm.height > 2000:
-            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
-
-            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
-            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-            img_dict = {
-                "img": img,
-                "width": pm.width,
-                "height": pm.height
-            }
-            imgs.append(img_dict)
-
-    model_json = []
-    for index, img_dict in enumerate(imgs):
-        img = img_dict['img']
-        page_width = img_dict['width']
-        page_height = img_dict['height']
-        result = ocr_engine(img)
-        spans = []
-        for line in result:
-            line.pop('img')
-            '''
-            为paddle输出适配type no.
-            title: 0 # 标题
-            text: 1 # 文本
-            header: 2 # abandon
-            footer: 2 # abandon
-            reference: 1 # 文本 or abandon
-            equation: 8 # 行间公式 block
-            equation: 14 # 行间公式 text
-            figure: 3 # 图片
-            figure_caption: 4 # 图片描述
-            table: 5 # 表格
-            table_caption: 6 # 表格描述
-            '''
-            if line['type'] == 'title':
-                line['category_id'] = 0
-            elif line['type'] in ['text', 'reference']:
-                line['category_id'] = 1
-            elif line['type'] == 'figure':
-                line['category_id'] = 3
-            elif line['type'] == 'figure_caption':
-                line['category_id'] = 4
-            elif line['type'] == 'table':
-                line['category_id'] = 5
-            elif line['type'] == 'table_caption':
-                line['category_id'] = 6
-            elif line['type'] == 'equation':
-                line['category_id'] = 8
-            elif line['type'] in ['header', 'footer']:
-                line['category_id'] = 2
-            else:
-                logger.warning(f"unknown type: {line['type']}")
-
-            # 兼容不输出score的paddleocr版本
-            if line.get("score") is None:
-                line['score'] = 0.5 + random.random() * 0.5
-
-            res = line.pop('res', None)
-            if res is not None and len(res) > 0:
-                for span in res:
-                    new_span = {'category_id': 15,
-                                'bbox': region_to_bbox(span['text_region']),
-                                'score': span['confidence'],
-                                'text': span['text']
-                                }
-                    spans.append(new_span)
-
-        if len(spans) > 0:
-            result.extend(spans)
-
-        result = remove_duplicates_dicts(result)
-
-        page_info = {
-            "page_no": index,
-            "height": page_height,
-            "width": page_width
-        }
-        page_dict = {
-            "layout_dets": result,
-            "page_info": page_info
-        }
-
-        model_json.append(page_dict)
-
-    return model_json
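The deleted module's responsibilities are split rather than dropped: page rendering moves to load_images_from_pdf in doc_analyze_by_custom_model.py, the PPStructure wrapper and category-id mapping move to CustomPaddleModel in pp_structure_v2.py, and doc_analyze now selects a backend via the new MODEL constant from model_list.py. One visible behavioral difference: the old per-page remove_duplicates_dicts(result) call does not reappear in the new doc_analyze, although the helper itself is retained.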
/magic_pdf/model/{doc_analyze_by_360layout.py → 360_layout_analysis.py}
File without changes
{magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/LICENSE.md
File without changes
{magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/WHEEL
File without changes
{magic_pdf-0.5.4.dist-info → magic_pdf-0.5.5.dist-info}/top_level.txt
File without changes