magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,125 @@
|
|
1
|
+
import random
|
2
|
+
|
3
|
+
import fitz
|
4
|
+
import cv2
|
5
|
+
from paddleocr import PPStructure
|
6
|
+
from PIL import Image
|
7
|
+
from loguru import logger
|
8
|
+
import numpy as np
|
9
|
+
|
10
|
+
def region_to_bbox(region):
    """Build an ``[x0, y0, x1, y1]`` list from a sequence of points.

    Only two entries of ``region`` are read: ``region[0]`` supplies
    ``x0``/``y0`` and ``region[2]`` supplies ``x1``/``y1``.  Any other
    points are ignored.
    (Presumably ``region`` is a four-corner text quadrilateral whose first
    and third points are opposite corners — confirm against the OCR engine.)
    """
    # Visit point 0 then point 2, taking coordinate 0 then coordinate 1.
    return [region[corner][axis] for corner in (0, 2) for axis in (0, 1)]
|
16
|
+
|
17
|
+
|
18
|
+
def dict_compare(d1, d2):
    """Return whether ``d1`` and ``d2`` contain the same key/value pairs."""
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items
|
20
|
+
|
21
|
+
|
22
|
+
def remove_duplicates_dicts(lst):
    """Return a new list holding the dicts of ``lst`` without repeats.

    A dict is skipped when ``dict_compare`` reports it equal to one that
    was already kept, so the first occurrence wins and the input order is
    preserved.  ``lst`` itself is not modified.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # Already have an equal dict; drop this one.
                break
        else:
            kept.append(candidate)
    return kept
|
28
|
+
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run PaddleOCR ``PPStructure`` layout analysis on each page of a PDF.

    Every page is rendered to an image at 200 DPI, handed to a
    ``PPStructure`` engine created with ``table=False``, and each returned
    detection is tagged with a numeric ``category_id``.  Pages are rendered
    and analysed one at a time so that only a single page image is held in
    memory at once.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``PPStructure(ocr=...)``.
        show_log: Forwarded to ``PPStructure(show_log=...)``.

    Returns:
        A list with one dict per page, in page order.  Each dict has:

        * ``"layout_dets"``: the engine's detections for the page (with
          ``img`` and ``res`` removed, ``category_id`` and ``score``
          added), followed by one span dict per entry of each detection's
          ``res`` list; exact duplicate dicts are dropped.
        * ``"page_info"``: ``{"page_no", "height", "width"}`` where
          ``page_no`` is the zero-based page index and ``height``/``width``
          are the pixel dimensions of the rendered page image.
    """
    # Adapt PaddleOCR layout type names to numeric category ids.
    category_id_by_type = {
        'title': 0,           # title
        'text': 1,            # text
        'reference': 1,       # text (or abandon)
        'header': 2,          # abandon
        'footer': 2,          # abandon
        'figure': 3,          # image
        'figure_caption': 4,  # image caption
        'table': 5,           # table
        'table_caption': 6,   # table caption
        'equation': 8,        # interline equation block
    }
    # NOTE: the original mapping notes also list id 14 ("interline equation
    # text") for equations, but only id 8 is ever assigned here.

    ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)

    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    # NOTE: pages are always rendered at this DPI; an earlier rule that
    # skipped enlarging pages wider/taller than 2000 px is disabled.
    dpi = 200
    zoom = fitz.Matrix(dpi / 72, dpi / 72)

    model_json = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(doc.page_count):
            pm = doc[index].get_pixmap(matrix=zoom, alpha=False)
            page_width = pm.width
            page_height = pm.height

            # Pixmap samples are RGB; convert to the BGR array OpenCV uses.
            img = Image.frombytes("RGB", [page_width, page_height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

            result = ocr_engine(img)
            spans = []
            for line in result:
                # Drop the cropped region image; tolerate engines that omit it.
                line.pop('img', None)

                category_id = category_id_by_type.get(line['type'])
                if category_id is None:
                    # Unknown types are reported and left without category_id.
                    logger.warning(f"unknown type: {line['type']}")
                else:
                    line['category_id'] = category_id

                # Compatibility with paddleocr versions that output no score:
                # substitute a random value in [0.5, 1.0).
                if line.get("score") is None:
                    line['score'] = 0.5 + random.random() * 0.5

                # Lift each entry of the detection's 'res' list into its
                # own span dict.
                res = line.pop('res', None)
                if res is not None and len(res) > 0:
                    for span in res:
                        spans.append({
                            # 15: presumably the OCR text-span category — confirm.
                            'category_id': 15,
                            'bbox': region_to_bbox(span['text_region']),
                            'score': span['confidence'],
                            'text': span['text'],
                        })

            # Spans are appended only after the loop so the list is not
            # mutated while it is being iterated.
            result.extend(spans)
            result = remove_duplicates_dicts(result)

            model_json.append({
                "layout_dets": result,
                "page_info": {
                    "page_no": index,
                    "height": page_height,
                    "width": page_width,
                },
            })

    return model_json
|