magic-pdf 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +9 -2
- magic_pdf/filter/pdf_classify_by_type.py +7 -3
- magic_pdf/filter/pdf_meta_scan.py +34 -14
- magic_pdf/libs/pdf_check.py +59 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/pipe/AbsPipe.py +1 -0
- magic_pdf/user_api.py +37 -36
- {magic_pdf-0.5.5.dist-info → magic_pdf-0.5.6.dist-info}/METADATA +2 -1
- {magic_pdf-0.5.5.dist-info → magic_pdf-0.5.6.dist-info}/RECORD +12 -11
- {magic_pdf-0.5.5.dist-info → magic_pdf-0.5.6.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.5.dist-info → magic_pdf-0.5.6.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.5.dist-info → magic_pdf-0.5.6.dist-info}/top_level.txt +0 -0
@@ -144,10 +144,17 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
|
|
144
144
|
def merge_para_with_text(para_block):
|
145
145
|
para_text = ''
|
146
146
|
for line in para_block['lines']:
|
147
|
+
line_text = ""
|
148
|
+
line_lang = ""
|
149
|
+
for span in line['spans']:
|
150
|
+
span_type = span['type']
|
151
|
+
if span_type == ContentType.Text:
|
152
|
+
line_text += span['content'].strip()
|
153
|
+
if line_text != "":
|
154
|
+
line_lang = detect_lang(line_text)
|
147
155
|
for span in line['spans']:
|
148
156
|
span_type = span['type']
|
149
157
|
content = ''
|
150
|
-
language = ''
|
151
158
|
if span_type == ContentType.Text:
|
152
159
|
content = span['content']
|
153
160
|
language = detect_lang(content)
|
@@ -161,7 +168,7 @@ def merge_para_with_text(para_block):
|
|
161
168
|
content = f"\n$$\n{span['content']}\n$$\n"
|
162
169
|
|
163
170
|
if content != '':
|
164
|
-
if 'zh' in language:
|
171
|
+
if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断
|
165
172
|
para_text += content # 中文语境下,content间不需要空格分隔
|
166
173
|
else:
|
167
174
|
para_text += content + ' ' # 英文语境下 content间需要空格分隔
|
@@ -305,7 +305,7 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
|
|
305
305
|
|
306
306
|
|
307
307
|
def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
|
308
|
-
text_layout_list: list):
|
308
|
+
text_layout_list: list, invalid_chars: bool):
|
309
309
|
"""
|
310
310
|
这里的图片和页面长度单位是pts
|
311
311
|
:param total_page:
|
@@ -322,7 +322,8 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
|
322
322
|
'by_avg_words': classify_by_avg_words(text_len_list),
|
323
323
|
'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
|
324
324
|
'by_text_layout': classify_by_text_layout(text_layout_list),
|
325
|
-
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list)
|
325
|
+
'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
|
326
|
+
'by_invalid_chars': invalid_chars,
|
326
327
|
}
|
327
328
|
|
328
329
|
if all(results.values()):
|
@@ -331,7 +332,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
|
|
331
332
|
return False, results
|
332
333
|
else:
|
333
334
|
logger.warning(
|
334
|
-
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},
|
335
|
+
f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
|
336
|
+
f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
|
337
|
+
f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
|
338
|
+
f" by_invalid_chars: {results['by_invalid_chars']}",
|
335
339
|
file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法
|
336
340
|
return False, results
|
337
341
|
|
@@ -12,12 +12,13 @@ from collections import Counter
|
|
12
12
|
|
13
13
|
from magic_pdf.libs.drop_reason import DropReason
|
14
14
|
from magic_pdf.libs.language import detect_lang
|
15
|
+
from magic_pdf.libs.pdf_check import detect_invalid_chars
|
15
16
|
|
16
17
|
scan_max_page = 50
|
17
18
|
junk_limit_min = 10
|
18
19
|
|
19
20
|
|
20
|
-
def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_pts):
|
21
|
+
def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts):
|
21
22
|
max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in
|
22
23
|
result]
|
23
24
|
page_area = int(page_width_pts) * int(page_height_pts)
|
@@ -25,14 +26,15 @@ def calculate_max_image_area_per_page(result:list, page_width_pts, page_height_p
|
|
25
26
|
max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.6]
|
26
27
|
return max_image_area_per_page
|
27
28
|
|
29
|
+
|
28
30
|
def process_image(page, junk_img_bojids=[]):
|
29
|
-
page_result = []# 存每个页面里的多张图四元组信息
|
31
|
+
page_result = [] # 存每个页面里的多张图四元组信息
|
30
32
|
items = page.get_images()
|
31
33
|
dedup = set()
|
32
34
|
for img in items:
|
33
35
|
# 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是
|
34
|
-
img_bojid = img[0]# 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
35
|
-
if img_bojid in junk_img_bojids
|
36
|
+
img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等
|
37
|
+
if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过
|
36
38
|
continue
|
37
39
|
recs = page.get_image_rects(img, transform=True)
|
38
40
|
if recs:
|
@@ -47,6 +49,8 @@ def process_image(page, junk_img_bojids=[]):
|
|
47
49
|
dedup.add((x0, y0, x1, y1, img_bojid))
|
48
50
|
page_result.append([x0, y0, x1, y1, img_bojid])
|
49
51
|
return page_result
|
52
|
+
|
53
|
+
|
50
54
|
def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
51
55
|
"""
|
52
56
|
返回每个页面里的图片的四元组,每个页面多个图片。
|
@@ -57,7 +61,7 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
57
61
|
img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images())
|
58
62
|
# 找出出现次数超过 len(doc) 半数的 img_bojid
|
59
63
|
|
60
|
-
junk_limit = max(len(doc)*0.5, junk_limit_min)# 对一些页数比较少的进行豁免
|
64
|
+
junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免
|
61
65
|
|
62
66
|
junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit]
|
63
67
|
|
@@ -82,9 +86,10 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
82
86
|
result.append(page_result)
|
83
87
|
for item in result:
|
84
88
|
if not any(item): # 如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版
|
85
|
-
if max(imgs_len_list) == min(imgs_len_list) and max(
|
89
|
+
if max(imgs_len_list) == min(imgs_len_list) and max(
|
90
|
+
imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break
|
86
91
|
junk_img_bojids = []
|
87
|
-
else
|
92
|
+
else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist
|
88
93
|
pass
|
89
94
|
break_loop = True
|
90
95
|
break
|
@@ -94,16 +99,16 @@ def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list:
|
|
94
99
|
# 检查前80%的元素是否都相等
|
95
100
|
if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min:
|
96
101
|
|
97
|
-
|
98
|
-
|
102
|
+
# # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist
|
103
|
+
# if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min:
|
99
104
|
|
100
105
|
#前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist
|
101
106
|
max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts)
|
102
107
|
if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空
|
103
108
|
junk_img_bojids = []
|
104
|
-
else
|
109
|
+
else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist
|
105
110
|
pass
|
106
|
-
else
|
111
|
+
else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片
|
107
112
|
junk_img_bojids = []
|
108
113
|
|
109
114
|
#正式进入取前50页图片的信息流程
|
@@ -136,7 +141,6 @@ def get_pdf_page_size_pts(doc: fitz.Document):
|
|
136
141
|
median_width = page_width_list[len(page_width_list) // 2]
|
137
142
|
median_height = page_height_list[len(page_height_list) // 2]
|
138
143
|
|
139
|
-
|
140
144
|
return median_width, median_height
|
141
145
|
|
142
146
|
|
@@ -156,6 +160,7 @@ def get_pdf_textlen_per_page(doc: fitz.Document):
|
|
156
160
|
|
157
161
|
return text_len_lst
|
158
162
|
|
163
|
+
|
159
164
|
def get_pdf_text_layout_per_page(doc: fitz.Document):
|
160
165
|
"""
|
161
166
|
根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。
|
@@ -233,11 +238,16 @@ def get_pdf_text_layout_per_page(doc: fitz.Document):
|
|
233
238
|
# logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}")
|
234
239
|
return text_layout_list
|
235
240
|
|
241
|
+
|
236
242
|
'''定义一个自定义异常用来抛出单页svg太多的pdf'''
|
243
|
+
|
244
|
+
|
237
245
|
class PageSvgsTooManyError(Exception):
|
238
246
|
def __init__(self, message="Page SVGs are too many"):
|
239
247
|
self.message = message
|
240
248
|
super().__init__(self.message)
|
249
|
+
|
250
|
+
|
241
251
|
def get_svgs_per_page(doc: fitz.Document):
|
242
252
|
svgs_len_list = []
|
243
253
|
for page_id, page in enumerate(doc):
|
@@ -251,6 +261,7 @@ def get_svgs_per_page(doc: fitz.Document):
|
|
251
261
|
# logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}")
|
252
262
|
return svgs_len_list
|
253
263
|
|
264
|
+
|
254
265
|
def get_imgs_per_page(doc: fitz.Document):
|
255
266
|
imgs_len_list = []
|
256
267
|
for page_id, page in enumerate(doc):
|
@@ -287,6 +298,13 @@ def get_language(doc: fitz.Document):
|
|
287
298
|
return language
|
288
299
|
|
289
300
|
|
301
|
+
def check_invalid_chars(pdf_bytes):
|
302
|
+
"""
|
303
|
+
乱码检测
|
304
|
+
"""
|
305
|
+
return detect_invalid_chars(pdf_bytes)
|
306
|
+
|
307
|
+
|
290
308
|
def pdf_meta_scan(pdf_bytes: bytes):
|
291
309
|
"""
|
292
310
|
:param s3_pdf_path:
|
@@ -318,7 +336,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
318
336
|
# logger.info(f"text_layout_per_page: {text_layout_per_page}")
|
319
337
|
text_language = get_language(doc)
|
320
338
|
# logger.info(f"text_language: {text_language}")
|
321
|
-
|
339
|
+
invalid_chars = check_invalid_chars(pdf_bytes)
|
340
|
+
# logger.info(f"invalid_chars: {invalid_chars}")
|
322
341
|
|
323
342
|
# 最后输出一条json
|
324
343
|
res = {
|
@@ -334,6 +353,7 @@ def pdf_meta_scan(pdf_bytes: bytes):
|
|
334
353
|
# "svgs_per_page": svgs_per_page,
|
335
354
|
"imgs_per_page": imgs_per_page, # 增加每页img数量list
|
336
355
|
"junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list
|
356
|
+
"invalid_chars": invalid_chars,
|
337
357
|
"metadata": doc.metadata
|
338
358
|
}
|
339
359
|
# logger.info(json.dumps(res, ensure_ascii=False))
|
@@ -365,4 +385,4 @@ if __name__ == '__main__':
|
|
365
385
|
# file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","")
|
366
386
|
# doc = fitz.open("pdf", file_content)
|
367
387
|
# text_layout_lst = get_pdf_text_layout_per_page(doc)
|
368
|
-
# print(text_layout_lst)
|
388
|
+
# print(text_layout_lst)
|
@@ -0,0 +1,59 @@
|
|
1
|
+
from io import BytesIO
|
2
|
+
import re
|
3
|
+
import fitz
|
4
|
+
import numpy as np
|
5
|
+
from loguru import logger
|
6
|
+
from pdfminer.high_level import extract_text
|
7
|
+
|
8
|
+
|
9
|
+
def calculate_sample_count(total_page: int, sample_ratio=0.1):
|
10
|
+
"""
|
11
|
+
根据总页数和采样率计算采样页面的数量。
|
12
|
+
"""
|
13
|
+
select_page_cnt = int(total_page * sample_ratio)
|
14
|
+
if select_page_cnt < 5:
|
15
|
+
select_page_cnt = min(10, total_page)
|
16
|
+
elif select_page_cnt > 10:
|
17
|
+
select_page_cnt = 10
|
18
|
+
return select_page_cnt
|
19
|
+
|
20
|
+
|
21
|
+
def extract_pages(src_pdf_bytes: bytes):
|
22
|
+
pdf_docs = fitz.open("pdf", src_pdf_bytes)
|
23
|
+
total_page = len(pdf_docs)
|
24
|
+
if total_page == 0:
|
25
|
+
# 如果PDF没有页面,直接返回空文档
|
26
|
+
logger.warning("PDF is empty, return empty document")
|
27
|
+
return fitz.Document()
|
28
|
+
select_page_cnt = calculate_sample_count(total_page)
|
29
|
+
|
30
|
+
page_num = np.random.choice(total_page, select_page_cnt, replace=False)
|
31
|
+
sample_docs = fitz.Document()
|
32
|
+
try:
|
33
|
+
for index in page_num:
|
34
|
+
sample_docs.insert_pdf(pdf_docs, from_page=int(index), to_page=int(index))
|
35
|
+
except Exception as e:
|
36
|
+
logger.exception(e)
|
37
|
+
return sample_docs
|
38
|
+
|
39
|
+
|
40
|
+
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
|
41
|
+
""""
|
42
|
+
检测PDF中是否包含非法字符
|
43
|
+
"""
|
44
|
+
'''需要使用'''
|
45
|
+
sample_docs = extract_pages(src_pdf_bytes)
|
46
|
+
sample_pdf_bytes = sample_docs.tobytes()
|
47
|
+
sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
|
48
|
+
text = extract_text(sample_pdf_file_like_object)
|
49
|
+
# logger.info(text)
|
50
|
+
'''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
|
51
|
+
cid_pattern = re.compile(r'\(cid:\d+\)')
|
52
|
+
matches = cid_pattern.findall(text)
|
53
|
+
cid_count = len(matches)
|
54
|
+
text_len = len(text)
|
55
|
+
logger.info(f"cid_count: {cid_count}, text_len: {text_len}")
|
56
|
+
if cid_count > 10:
|
57
|
+
return False # 乱码文档
|
58
|
+
else:
|
59
|
+
return True # 正常文档
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.5.5"
|
1
|
+
__version__ = "0.5.6"
|
magic_pdf/pipe/AbsPipe.py
CHANGED
magic_pdf/user_api.py
CHANGED
@@ -86,45 +86,46 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
|
|
86
86
|
return None
|
87
87
|
|
88
88
|
pdf_info_dict = parse_pdf(parse_pdf_by_txt)
|
89
|
-
text_all = ""
|
90
|
-
for page_dict in pdf_info_dict['pdf_info']:
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
def calculate_not_common_character_rate(text):
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
def calculate_not_printable_rate(text):
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
not_common_character_rate = calculate_not_common_character_rate(text_all)
|
118
|
-
not_printable_rate = calculate_not_printable_rate(text_all)
|
119
|
-
pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
120
|
-
pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
121
|
-
logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
89
|
+
# text_all = ""
|
90
|
+
# for page_dict in pdf_info_dict['pdf_info']:
|
91
|
+
# for para_block in page_dict['para_blocks']:
|
92
|
+
# if para_block['type'] in ['title', 'text']:
|
93
|
+
# for line in para_block['lines']:
|
94
|
+
# for span in line['spans']:
|
95
|
+
# text_all += span['content']
|
96
|
+
|
97
|
+
# def calculate_not_common_character_rate(text):
|
98
|
+
# garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
|
99
|
+
# # 计算乱码字符的数量
|
100
|
+
# garbage_count = len(garbage_regex.findall(text))
|
101
|
+
# total = len(text)
|
102
|
+
# if total == 0:
|
103
|
+
# return 0 # 避免除以零的错误
|
104
|
+
# return garbage_count / total
|
105
|
+
#
|
106
|
+
# def calculate_not_printable_rate(text):
|
107
|
+
# printable_text = ""
|
108
|
+
# for c in text:
|
109
|
+
# if c.isprintable():
|
110
|
+
# printable_text += c
|
111
|
+
# printable_total = len(printable_text)
|
112
|
+
# total = len(text)
|
113
|
+
# if total == 0:
|
114
|
+
# return 0 # 避免除以零的错误
|
115
|
+
# return (total - printable_total) / total
|
116
|
+
#
|
117
|
+
# not_common_character_rate = calculate_not_common_character_rate(text_all)
|
118
|
+
# not_printable_rate = calculate_not_printable_rate(text_all)
|
119
|
+
# pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
|
120
|
+
# pdf_info_dict["_not_printable_rate"] = not_printable_rate
|
121
|
+
# logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")
|
122
|
+
'''新逻辑使用pdfminer识别乱码pdf,准确率高且不会误伤,已在解析流程之前进行处理'''
|
122
123
|
# not_common_character_rate对小语种可能会有误伤,not_printable_rate对小语种较为友好
|
123
124
|
if (pdf_info_dict is None
|
124
|
-
|
125
|
-
|
125
|
+
or pdf_info_dict.get("_need_drop", False)
|
126
|
+
# or not_printable_rate > 0.02 # 参考一些正常的pdf,这个值没有超过0.01的,阈值设为0.02
|
126
127
|
):
|
127
|
-
logger.warning(f"parse_pdf_by_txt drop or error
|
128
|
+
logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
|
128
129
|
if input_model_is_empty:
|
129
130
|
pdf_models = doc_analyze(pdf_bytes, ocr=True)
|
130
131
|
pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.5.5
|
3
|
+
Version: 0.5.6
|
4
4
|
Requires-Python: >=3.9
|
5
5
|
License-File: LICENSE.md
|
6
6
|
Requires-Dist: boto3 >=1.28.43
|
@@ -20,6 +20,7 @@ Requires-Dist: scikit-learn >=1.0.2
|
|
20
20
|
Requires-Dist: nltk ==3.8.1
|
21
21
|
Requires-Dist: s3pathlib >=2.1.1
|
22
22
|
Requires-Dist: paddleocr
|
23
|
+
Requires-Dist: pdfminer.six >=20231228
|
23
24
|
Provides-Extra: cpu
|
24
25
|
Requires-Dist: paddlepaddle ; extra == 'cpu'
|
25
26
|
Provides-Extra: gpu
|
@@ -5,15 +5,15 @@ magic_pdf/pdf_parse_by_txt.py,sha256=5_kdfvDkv_XwDove2AW7SopGysYLJ1-tsOQy2yuII1Y
|
|
5
5
|
magic_pdf/pdf_parse_by_txt_v2.py,sha256=mGadyYamoCNGNsKOQM1uXQR65zMUKyL24yURGHADmVs,1908
|
6
6
|
magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
|
7
7
|
magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
|
8
|
-
magic_pdf/user_api.py,sha256=
|
8
|
+
magic_pdf/user_api.py,sha256=R4onPBXlA8GARwlqBh5wmxUtTxwQb-PUeFK7eTAWzoU,4971
|
9
9
|
magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
magic_pdf/cli/magicpdf.py,sha256=IoyuWsnJp5lLDS4G9brtCqNdIWKb57Ini4uftkCl2Mg,11357
|
11
11
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
12
12
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
13
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
13
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
|
14
14
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
magic_pdf/filter/pdf_classify_by_type.py,sha256=
|
16
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
15
|
+
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
16
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
17
17
|
magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
|
19
19
|
magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
|
@@ -42,10 +42,11 @@ magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
|
42
42
|
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
43
43
|
magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
|
44
44
|
magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
|
45
|
+
magic_pdf/libs/pdf_check.py,sha256=LeCoMTVaVPWTgE0MSD6OnyXbpdjV7HfiX1RD6xesIWM,1911
|
45
46
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
46
47
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
47
48
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
48
|
-
magic_pdf/libs/version.py,sha256=
|
49
|
+
magic_pdf/libs/version.py,sha256=CMH34Gt1AqO7z_TqRj94XwohGoVCf8aes0djkqm45mk,22
|
49
50
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
50
51
|
magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
|
51
52
|
magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -67,7 +68,7 @@ magic_pdf/para/para_split_v2.py,sha256=a04dsUFE3JD4DA9e2DULJgbKrcqWuCfK58de1p-T3
|
|
67
68
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
68
69
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
69
70
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
70
|
-
magic_pdf/pipe/AbsPipe.py,sha256=
|
71
|
+
magic_pdf/pipe/AbsPipe.py,sha256=28e3HxybBO86npy_L4WD6F7hfjKHHc86-IhiwzAnLdk,3979
|
71
72
|
magic_pdf/pipe/OCRPipe.py,sha256=iKnNveVfsrBGl_2Xtd4hAAS5HntYyjwfBeVIKGc8V5U,1196
|
72
73
|
magic_pdf/pipe/TXTPipe.py,sha256=R0UzMZ7Z_59Vh7cPdBAO4gvHtgA5wLoODnCPnpEjbPM,1255
|
73
74
|
magic_pdf/pipe/UNIPipe.py,sha256=47a9jx1a_zO4m3sVnhcOnrmNc_QT-TI-9mv2x7L6SrQ,3507
|
@@ -116,8 +117,8 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
|
|
116
117
|
magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
|
117
118
|
magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
|
118
119
|
magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
|
119
|
-
magic_pdf-0.5.
|
120
|
-
magic_pdf-0.5.
|
121
|
-
magic_pdf-0.5.
|
122
|
-
magic_pdf-0.5.
|
123
|
-
magic_pdf-0.5.
|
120
|
+
magic_pdf-0.5.6.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
121
|
+
magic_pdf-0.5.6.dist-info/METADATA,sha256=R1Rjdsta6IJ197EPwgSb7c-LtgPg2HnLibsGKRUa-i4,814
|
122
|
+
magic_pdf-0.5.6.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
123
|
+
magic_pdf-0.5.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
124
|
+
magic_pdf-0.5.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|