magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/para/para_pipeline.py
DELETED
@@ -1,297 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import json
|
3
|
-
|
4
|
-
from magic_pdf.para.commons import *
|
5
|
-
|
6
|
-
from magic_pdf.para.raw_processor import RawBlockProcessor
|
7
|
-
from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
|
8
|
-
from magic_pdf.para.stats import BlockStatisticsCalculator
|
9
|
-
from magic_pdf.para.stats import DocStatisticsCalculator
|
10
|
-
from magic_pdf.para.title_processor import TitleProcessor
|
11
|
-
from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
|
12
|
-
from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
|
13
|
-
from magic_pdf.para.draw import DrawAnnos
|
14
|
-
from magic_pdf.para.exceptions import (
|
15
|
-
DenseSingleLineBlockException,
|
16
|
-
TitleDetectionException,
|
17
|
-
TitleLevelException,
|
18
|
-
ParaSplitException,
|
19
|
-
ParaMergeException,
|
20
|
-
DiscardByException,
|
21
|
-
)
|
22
|
-
|
23
|
-
|
24
|
-
# Switch stdout to UTF-8 encoding at import time.
# The hasattr guard keeps the import from failing when sys.stdout has been
# replaced by an object without reconfigure() (e.g. io.StringIO) or is None.
if sys.version_info[0] >= 3 and hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
|
26
|
-
|
27
|
-
|
28
|
-
class ParaProcessPipeline:
    """Run the paragraph-processing stages over a parsed PDF dictionary."""

    # Directory that receives the per-stage JSON snapshots in "full" debug mode.
    # NOTE(review): this hard-coded, Windows-style path is kept byte-for-byte
    # from the original; on POSIX it creates a directory literally named
    # "\tmp\pdf_parse" under the current working directory — confirm intent.
    _STAGE_DUMP_DIR = "\\tmp\\pdf_parse"

    def __init__(self) -> None:
        pass

    @staticmethod
    def _stage_json_name(output_pdf_path, stage):
        """Return the snapshot file name for *stage*, e.g. ``doc_stage_1.json``.

        The extension is stripped with ``os.path.splitext`` so that paths not
        ending in a lowercase ``.pdf`` still get a distinct ``.json`` name per
        stage instead of every stage reusing the PDF's own file name.
        """
        stem, _ext = os.path.splitext(os.path.basename(output_pdf_path))
        return f"{stem}_stage_{stage}.json"

    def _save_stage_snapshot(self, pdf_dic, output_pdf_path, stage):
        """Write *pdf_dic* as JSON into the stage-dump directory; return the path."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self._STAGE_DUMP_DIR, exist_ok=True)
        snapshot_path = os.path.join(
            self._STAGE_DUMP_DIR, self._stage_json_name(output_pdf_path, stage)
        )
        with open(snapshot_path, "w", encoding="utf-8") as f:
            json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
        print_green(f"Succeed to save the pdf_dic to {snapshot_path}")
        return snapshot_path

    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
        """
        Process the paragraphs of a parsed PDF. The stages run in this order:

        1. Combine spans into natural lines (RawBlockProcessor)
        2. Filter blocks against the layout bboxes (LayoutFilterProcessor)
        3. Compute statistics for each block, then for the whole document
        4. Detect titles
        5. Recognise the level of the titles
        6. Split paragraphs inside each block
        7. Tag and merge paragraphs across blocks
        8. Run the discard checks and pick the first reported error, if any
        9. In debug mode: dump the result JSON, draw annotations and remove
           stage JSON files from the output directory

        Parameters
        ----------
        pdf_info_dict : dict
            parsed pdf dictionary handed to the first processor.
        para_debug_mode : str or None
            None disables all debug output. Any other value enables the final
            JSON dump, the annotation drawing and the cleanup step; the value
            "full" additionally writes a JSON snapshot after each stage.
        input_pdf_path : str or None
            path to the input pdf file (made absolute before use).
        output_pdf_path : str or None
            path to the output pdf file; its directory also receives
            ``pdf_dic.json``. Required whenever para_debug_mode is set.

        Returns
        -------
        (pdf_dic, error_info) : tuple
            the processed dictionary and the first non-None result of the
            discard checks (None when no check reported anything).

        Raises
        ------
        ValueError
            if para_debug_mode is set but output_pdf_path is None. Previously
            this combination failed later with an unrelated file-system error.
        """
        debug_enabled = para_debug_mode is not None
        if debug_enabled and output_pdf_path is None:
            raise ValueError("output_pdf_path is required when para_debug_mode is set")

        output_dir = ""
        output_json_file = ""

        if input_pdf_path is not None:
            input_pdf_path = os.path.abspath(input_pdf_path)

        if output_pdf_path is not None:
            # A bare file name has an empty dirname; fall back to "." so the
            # dump does not land in the filesystem root and listdir() works.
            output_dir = os.path.dirname(output_pdf_path) or "."
            output_json_file = os.path.join(output_dir, "pdf_dic.json")

        def snapshot(stage):
            # Per-stage dumps are only produced in "full" debug mode.
            if para_debug_mode == "full":
                self._save_stage_snapshot(pdf_dic, output_pdf_path, stage)

        # Combine spans into a natural line
        pdf_dic = RawBlockProcessor().batch_process_blocks(pdf_info_dict)

        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        pdf_dic = LayoutFilterProcessor().batch_process_blocks(pdf_dic)

        # Compute statistics for each block, then for the whole document
        pdf_dic = BlockStatisticsCalculator().batch_process_blocks(pdf_dic)
        pdf_dic = DocStatisticsCalculator().calc_stats_of_doc(pdf_dic)
        snapshot("0")

        # Detect titles in the document
        doc_statistics = pdf_dic["statistics"]
        pdf_dic = TitleProcessor(doc_statistics).batch_process_blocks_detect_titles(pdf_dic)
        snapshot("1")

        # Detect and divide the level of the titles
        pdf_dic = TitleProcessor().batch_process_blocks_recog_title_level(pdf_dic)
        snapshot("2")

        # Detect and split paragraphs inside each block
        pdf_dic = BlockTerminationProcessor().batch_process_blocks(pdf_dic)
        snapshot("3")

        # Detect and combine paragraphs from different blocks into one paragraph
        continuation_processor = BlockContinuationProcessor()
        pdf_dic = continuation_processor.batch_tag_paras(pdf_dic)
        pdf_dic = continuation_processor.batch_merge_paras(pdf_dic)
        snapshot("4")

        # Run every discard check (all five are always called, in this order)
        discarder = DiscardByException()
        discard_results = [
            discarder.discard_by_single_line_block(pdf_dic, exception=DenseSingleLineBlockException()),
            discarder.discard_by_title_detection(pdf_dic, exception=TitleDetectionException()),
            discarder.discard_by_title_level(pdf_dic, exception=TitleLevelException()),
            discarder.discard_by_split_para(pdf_dic, exception=ParaSplitException()),
            discarder.discard_by_merge_para(pdf_dic, exception=ParaMergeException()),
        ]

        # Dump the result JSON. This happens before the error check, so the
        # file is written even when the document is about to be discarded.
        # NOTE(review): the input dict is dumped, not pdf_dic — this matches
        # the original; confirm the processors mutate pdf_info_dict in place.
        if debug_enabled:
            with open(output_json_file, "w", encoding="utf-8") as f:
                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)

        # The first check that reported something wins.
        error_info = next((info for info in discard_results if info is not None), None)
        if error_info is not None:
            return pdf_dic, error_info

        if debug_enabled:
            # Draw the annotations
            DrawAnnos().draw_annos(input_pdf_path, pdf_dic, output_pdf_path)

            # Remove intermediate stage files from the output directory.
            # NOTE(review): stage snapshots are written to _STAGE_DUMP_DIR, not
            # output_dir, and this filter matches any "*.json" containing
            # "stage" — kept as in the original; confirm which is intended.
            for fname in os.listdir(output_dir):
                if fname.endswith(".json") and "stage" in fname:
                    os.remove(os.path.join(output_dir, fname))

        return pdf_dic, error_info
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|