magic-pdf 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/__init__.py +0 -0
- magic_pdf/cli/__init__.py +0 -0
- magic_pdf/cli/magicpdf.py +294 -0
- magic_pdf/dict2md/__init__.py +0 -0
- magic_pdf/dict2md/mkcontent.py +397 -0
- magic_pdf/dict2md/ocr_mkcontent.py +356 -0
- magic_pdf/filter/__init__.py +0 -0
- magic_pdf/filter/pdf_classify_by_type.py +381 -0
- magic_pdf/filter/pdf_meta_scan.py +368 -0
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +681 -0
- magic_pdf/layout/layout_det_utils.py +182 -0
- magic_pdf/layout/layout_sort.py +732 -0
- magic_pdf/layout/layout_spiler_recog.py +101 -0
- magic_pdf/layout/mcol_sort.py +336 -0
- magic_pdf/libs/Constants.py +11 -0
- magic_pdf/libs/MakeContentConfig.py +10 -0
- magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
- magic_pdf/libs/__init__.py +0 -0
- magic_pdf/libs/boxbase.py +408 -0
- magic_pdf/libs/calc_span_stats.py +239 -0
- magic_pdf/libs/commons.py +204 -0
- magic_pdf/libs/config_reader.py +63 -0
- magic_pdf/libs/convert_utils.py +5 -0
- magic_pdf/libs/coordinate_transform.py +9 -0
- magic_pdf/libs/detect_language_from_model.py +21 -0
- magic_pdf/libs/draw_bbox.py +227 -0
- magic_pdf/libs/drop_reason.py +27 -0
- magic_pdf/libs/drop_tag.py +19 -0
- magic_pdf/libs/hash_utils.py +15 -0
- magic_pdf/libs/json_compressor.py +27 -0
- magic_pdf/libs/language.py +31 -0
- magic_pdf/libs/markdown_utils.py +31 -0
- magic_pdf/libs/math.py +9 -0
- magic_pdf/libs/nlp_utils.py +203 -0
- magic_pdf/libs/ocr_content_type.py +21 -0
- magic_pdf/libs/path_utils.py +23 -0
- magic_pdf/libs/pdf_image_tools.py +33 -0
- magic_pdf/libs/safe_filename.py +11 -0
- magic_pdf/libs/textbase.py +33 -0
- magic_pdf/libs/version.py +1 -0
- magic_pdf/libs/vis_utils.py +308 -0
- magic_pdf/model/__init__.py +0 -0
- magic_pdf/model/doc_analyze_by_360layout.py +8 -0
- magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
- magic_pdf/model/magic_model.py +632 -0
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/para/block_continuation_processor.py +562 -0
- magic_pdf/para/block_termination_processor.py +480 -0
- magic_pdf/para/commons.py +222 -0
- magic_pdf/para/denoise.py +246 -0
- magic_pdf/para/draw.py +121 -0
- magic_pdf/para/exceptions.py +198 -0
- magic_pdf/para/layout_match_processor.py +40 -0
- magic_pdf/para/para_pipeline.py +297 -0
- magic_pdf/para/para_split.py +644 -0
- magic_pdf/para/para_split_v2.py +772 -0
- magic_pdf/para/raw_processor.py +207 -0
- magic_pdf/para/stats.py +268 -0
- magic_pdf/para/title_processor.py +1014 -0
- magic_pdf/pdf_parse_by_ocr.py +219 -0
- magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
- magic_pdf/pdf_parse_by_txt.py +410 -0
- magic_pdf/pdf_parse_by_txt_v2.py +56 -0
- magic_pdf/pdf_parse_for_train.py +685 -0
- magic_pdf/pdf_parse_union_core.py +241 -0
- magic_pdf/pipe/AbsPipe.py +112 -0
- magic_pdf/pipe/OCRPipe.py +28 -0
- magic_pdf/pipe/TXTPipe.py +29 -0
- magic_pdf/pipe/UNIPipe.py +83 -0
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +3472 -0
- magic_pdf/post_proc/pdf_post_filter.py +67 -0
- magic_pdf/post_proc/remove_footnote.py +153 -0
- magic_pdf/pre_proc/__init__.py +0 -0
- magic_pdf/pre_proc/citationmarker_remove.py +157 -0
- magic_pdf/pre_proc/construct_page_dict.py +72 -0
- magic_pdf/pre_proc/cut_image.py +71 -0
- magic_pdf/pre_proc/detect_equation.py +134 -0
- magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
- magic_pdf/pre_proc/detect_footnote.py +170 -0
- magic_pdf/pre_proc/detect_header.py +64 -0
- magic_pdf/pre_proc/detect_images.py +647 -0
- magic_pdf/pre_proc/detect_page_number.py +64 -0
- magic_pdf/pre_proc/detect_tables.py +62 -0
- magic_pdf/pre_proc/equations_replace.py +559 -0
- magic_pdf/pre_proc/fix_image.py +244 -0
- magic_pdf/pre_proc/fix_table.py +270 -0
- magic_pdf/pre_proc/main_text_font.py +23 -0
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
- magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
- magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
- magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
- magic_pdf/pre_proc/remove_footer_header.py +117 -0
- magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
- magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
- magic_pdf/pre_proc/solve_line_alien.py +29 -0
- magic_pdf/pre_proc/statistics.py +12 -0
- magic_pdf/rw/AbsReaderWriter.py +34 -0
- magic_pdf/rw/DiskReaderWriter.py +66 -0
- magic_pdf/rw/S3ReaderWriter.py +107 -0
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/spark/__init__.py +0 -0
- magic_pdf/spark/spark_api.py +51 -0
- magic_pdf/train_utils/__init__.py +0 -0
- magic_pdf/train_utils/convert_to_train_format.py +65 -0
- magic_pdf/train_utils/extract_caption.py +59 -0
- magic_pdf/train_utils/remove_footer_header.py +159 -0
- magic_pdf/train_utils/vis_utils.py +327 -0
- magic_pdf/user_api.py +136 -0
- magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
- magic_pdf-0.5.4.dist-info/METADATA +24 -0
- magic_pdf-0.5.4.dist-info/RECORD +121 -0
- magic_pdf-0.5.4.dist-info/WHEEL +5 -0
- magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
magic_pdf/user_api.py
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
"""
|
2
|
+
用户输入:
|
3
|
+
model数组,每个元素代表一个页面
|
4
|
+
pdf在s3的路径
|
5
|
+
截图保存的s3位置
|
6
|
+
|
7
|
+
然后:
|
8
|
+
1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader
|
9
|
+
2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter
|
10
|
+
|
11
|
+
其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!!
|
12
|
+
|
13
|
+
"""
|
14
|
+
import re
|
15
|
+
|
16
|
+
from loguru import logger
|
17
|
+
|
18
|
+
from magic_pdf.libs.version import __version__
|
19
|
+
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
|
20
|
+
from magic_pdf.rw import AbsReaderWriter
|
21
|
+
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
|
22
|
+
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
|
23
|
+
|
24
|
+
PARSE_TYPE_TXT = "txt"
|
25
|
+
PARSE_TYPE_OCR = "ocr"
|
26
|
+
|
27
|
+
|
28
|
+
def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse a text-based PDF.

    Delegates to ``parse_pdf_by_txt`` and stamps the returned dict with the
    parse type and the package version.

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output handed to the parser unchanged.
        imageWriter: writer handed to the parser unchanged.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The dict produced by ``parse_pdf_by_txt`` with ``_parse_type`` and
        ``_version_name`` set.
    """
    parsed = parse_pdf_by_txt(
        pdf_bytes, pdf_models, imageWriter, start_page_id=start_page, debug_mode=is_debug
    )

    # Record how the document was parsed and by which package version.
    parsed["_parse_type"] = PARSE_TYPE_TXT
    parsed["_version_name"] = __version__
    return parsed
|
46
|
+
|
47
|
+
|
48
|
+
def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args,
                  **kwargs):
    """
    Parse an OCR-type PDF.

    Delegates to ``parse_pdf_by_ocr`` and stamps the returned dict with the
    parse type and the package version.

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output handed to the parser unchanged.
        imageWriter: writer handed to the parser unchanged.
        is_debug: forwarded to the parser as ``debug_mode``.
        start_page: forwarded to the parser as ``start_page_id``.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The dict produced by ``parse_pdf_by_ocr`` with ``_parse_type`` and
        ``_version_name`` set.
    """
    parsed = parse_pdf_by_ocr(
        pdf_bytes, pdf_models, imageWriter, start_page_id=start_page, debug_mode=is_debug
    )

    # Record how the document was parsed and by which package version.
    parsed["_parse_type"] = PARSE_TYPE_OCR
    parsed["_version_name"] = __version__
    return parsed
|
66
|
+
|
67
|
+
|
68
|
+
def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0,
                    input_model_is_empty: bool = False,
                    *args, **kwargs):
    """
    Parse a PDF that may mix text and OCR content, preferring the text parser.

    The text parser runs first. The OCR parser is used instead when the text
    parser raised, when its result has a truthy ``_need_drop`` entry, or when
    more than 2% of the extracted title/text characters are non-printable
    (taken as a sign of garbled extraction).

    Args:
        pdf_bytes: raw bytes of the PDF document.
        pdf_models: model output handed to the parsers.
        imageWriter: writer handed to the parsers unchanged.
        is_debug: forwarded to the parsers as ``debug_mode``.
        start_page: forwarded to the parsers as ``start_page_id``.
        input_model_is_empty: when true and the OCR fallback is taken,
            ``pdf_models`` is regenerated with ``doc_analyze(pdf_bytes, ocr=True)``
            before the OCR parser runs.
        *args, **kwargs: accepted for call compatibility; not used here.

    Returns:
        The dict produced by whichever parser was used, with ``_parse_type``
        and ``_version_name`` set. A text-parser result additionally carries
        ``_not_common_character_rate`` and ``_not_printable_rate``.

    Raises:
        Exception: if the text parser is rejected and the OCR parser also fails.
    """

    def parse_pdf(method):
        # Best effort: a parser failure is logged and reported as None so the
        # caller can decide whether to fall back to the other parser.
        try:
            return method(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as e:
            logger.exception(e)
            return None

    def calculate_not_common_character_rate(text):
        # Anything outside the listed ranges (CJK ideographs, ASCII digits and
        # letters, CJK punctuation, full-width forms) counts as "not common".
        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
        # Count the characters regarded as garbled.
        garbage_count = len(garbage_regex.findall(text))
        total = len(text)
        if total == 0:
            return 0  # avoid division by zero
        return garbage_count / total

    def calculate_not_printable_rate(text):
        printable = sum(1 for c in text if c.isprintable())
        total = len(text)
        if total == 0:
            return 0  # avoid division by zero
        return (total - printable) / total

    pdf_info_dict = parse_pdf(parse_pdf_by_txt)

    # The text parser may have failed (None). Only inspect its output when
    # there is one; otherwise go straight to the OCR fallback below instead of
    # crashing on a None subscript.
    not_printable_rate = 0
    if pdf_info_dict is not None:
        text_parts = []
        for page_dict in pdf_info_dict['pdf_info']:
            for para_block in page_dict['para_blocks']:
                if para_block['type'] in ['title', 'text']:
                    for line in para_block['lines']:
                        for span in line['spans']:
                            text_parts.append(span['content'])
        text_all = "".join(text_parts)

        not_common_character_rate = calculate_not_common_character_rate(text_all)
        not_printable_rate = calculate_not_printable_rate(text_all)
        pdf_info_dict["_not_common_character_rate"] = not_common_character_rate
        pdf_info_dict["_not_printable_rate"] = not_printable_rate
        logger.info(f"not_common_character_rate: {not_common_character_rate}, not_printable_rate: {not_printable_rate}")

    # not_common_character_rate can misfire on less common languages, while
    # not_printable_rate is friendlier to them, so only the latter gates the
    # fallback.
    if (pdf_info_dict is None
            or pdf_info_dict.get("_need_drop", False)
            or not_printable_rate > 0.02  # normal PDFs sampled stayed below 0.01, so the threshold is 0.02
    ):
        logger.warning("parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            # Rebinding pdf_models here is seen by parse_pdf, which reads it
            # from this enclosing scope at call time.
            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
        if pdf_info_dict is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        else:
            pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
    else:
        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT

    pdf_info_dict["_version_name"] = __version__

    return pdf_info_dict
|