magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  7. magic_pdf/data/read_api.py +1 -1
  8. magic_pdf/dict2md/mkcontent.py +226 -185
  9. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  10. magic_pdf/filter/pdf_meta_scan.py +101 -79
  11. magic_pdf/integrations/rag/utils.py +4 -5
  12. magic_pdf/libs/config_reader.py +5 -5
  13. magic_pdf/libs/draw_bbox.py +3 -2
  14. magic_pdf/libs/pdf_image_tools.py +36 -12
  15. magic_pdf/libs/version.py +1 -1
  16. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  17. magic_pdf/model/magic_model.py +13 -13
  18. magic_pdf/model/pdf_extract_kit.py +122 -76
  19. magic_pdf/model/sub_modules/model_init.py +40 -35
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  21. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  22. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  23. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  24. magic_pdf/para/para_split.py +411 -248
  25. magic_pdf/para/para_split_v2.py +352 -182
  26. magic_pdf/para/para_split_v3.py +110 -53
  27. magic_pdf/pdf_parse_by_ocr.py +2 -0
  28. magic_pdf/pdf_parse_by_txt.py +2 -0
  29. magic_pdf/pdf_parse_union_core.py +174 -100
  30. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  31. magic_pdf/pipe/AbsPipe.py +28 -44
  32. magic_pdf/pipe/OCRPipe.py +5 -5
  33. magic_pdf/pipe/TXTPipe.py +5 -6
  34. magic_pdf/pipe/UNIPipe.py +24 -25
  35. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  36. magic_pdf/pre_proc/cut_image.py +9 -11
  37. magic_pdf/pre_proc/equations_replace.py +203 -212
  38. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  39. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  40. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  41. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  42. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  43. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  44. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  45. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  46. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  47. magic_pdf/spark/spark_api.py +15 -17
  48. magic_pdf/tools/cli.py +3 -4
  49. magic_pdf/tools/cli_dev.py +6 -9
  50. magic_pdf/tools/common.py +26 -36
  51. magic_pdf/user_api.py +29 -38
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
  53. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
  54. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
  55. magic_pdf/libs/Constants.py +0 -55
  56. magic_pdf/libs/MakeContentConfig.py +0 -11
  57. magic_pdf/libs/drop_reason.py +0 -27
  58. magic_pdf/libs/drop_tag.py +0 -19
  59. magic_pdf/para/para_pipeline.py +0 -297
  60. magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,297 +0,0 @@
1
- import os
2
- import json
3
-
4
- from magic_pdf.para.commons import *
5
-
6
- from magic_pdf.para.raw_processor import RawBlockProcessor
7
- from magic_pdf.para.layout_match_processor import LayoutFilterProcessor
8
- from magic_pdf.para.stats import BlockStatisticsCalculator
9
- from magic_pdf.para.stats import DocStatisticsCalculator
10
- from magic_pdf.para.title_processor import TitleProcessor
11
- from magic_pdf.para.block_termination_processor import BlockTerminationProcessor
12
- from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor
13
- from magic_pdf.para.draw import DrawAnnos
14
- from magic_pdf.para.exceptions import (
15
- DenseSingleLineBlockException,
16
- TitleDetectionException,
17
- TitleLevelException,
18
- ParaSplitException,
19
- ParaMergeException,
20
- DiscardByException,
21
- )
22
-
23
-
24
- if sys.version_info[0] >= 3:
25
- sys.stdout.reconfigure(encoding="utf-8") # type: ignore
26
-
27
-
28
- class ParaProcessPipeline:
29
- def __init__(self) -> None:
30
- pass
31
-
32
- def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
33
- """
34
- This function processes the paragraphs, including:
35
- 1. Read raw input json file into pdf_dic
36
- 2. Detect and replace equations
37
- 3. Combine spans into a natural line
38
- 4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
39
- 5. Compute statistics for each block
40
- 6. Detect titles in the document
41
- 7. Detect paragraphs inside each block
42
- 8. Divide the level of the titles
43
- 9. Detect and combine paragraphs from different blocks into one paragraph
44
- 10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
45
- 11. Draw annotations on the pdf file
46
-
47
- Parameters
48
- ----------
49
- pdf_dic_json_fpath : str
50
- path to the pdf dictionary json file.
51
- Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already.
52
- input_pdf_doc : str
53
- path to the input pdf file
54
- output_pdf_path : str
55
- path to the output pdf file
56
-
57
- Returns
58
- -------
59
- pdf_dict : dict
60
- result dictionary
61
- """
62
-
63
- error_info = None
64
-
65
- output_json_file = ""
66
- output_dir = ""
67
-
68
- if input_pdf_path is not None:
69
- input_pdf_path = os.path.abspath(input_pdf_path)
70
-
71
- # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}")
72
-
73
- if output_pdf_path is not None:
74
- output_dir = os.path.dirname(output_pdf_path)
75
- output_json_file = f"{output_dir}/pdf_dic.json"
76
-
77
- def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
78
- """
79
- Save the pdf_dic to a json file
80
- """
81
- output_pdf_file_name = os.path.basename(output_pdf_path)
82
- # output_dir = os.path.dirname(output_pdf_path)
83
- output_dir = "\\tmp\\pdf_parse"
84
- output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
85
- pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)
86
-
87
- if not os.path.exists(output_dir):
88
- os.makedirs(output_dir)
89
-
90
- if para_debug_mode == "full":
91
- with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
92
- json.dump(pdf_dic, f, indent=2, ensure_ascii=False)
93
-
94
- # Validate the output already exists
95
- if not os.path.exists(pdf_dic_json_fpath):
96
- print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
97
- return None
98
- else:
99
- print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")
100
-
101
- return pdf_dic_json_fpath
102
-
103
- """
104
- Preprocess the lines of block
105
- """
106
- # Find and replace the interline and inline equations, should be better done before the paragraph processing
107
- # Create "para_blocks" for each page.
108
- # equationProcessor = EquationsProcessor()
109
- # pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict)
110
-
111
- # Combine spans into a natural line
112
- rawBlockProcessor = RawBlockProcessor()
113
- pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)
114
- # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
115
-
116
- # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
117
- layoutFilter = LayoutFilterProcessor()
118
- pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)
119
-
120
- # Compute statistics for each block
121
- blockStatisticsCalculator = BlockStatisticsCalculator()
122
- pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)
123
- # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n")
124
-
125
- # Compute statistics for all blocks(namely this pdf document)
126
- docStatisticsCalculator = DocStatisticsCalculator()
127
- pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)
128
- # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n")
129
-
130
- # Dump the first three stages of pdf_dic to a json file
131
- if para_debug_mode == "full":
132
- pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)
133
-
134
- """
135
- Detect titles in the document
136
- """
137
- doc_statistics = pdf_dic["statistics"]
138
- titleProcessor = TitleProcessor(doc_statistics)
139
- pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic)
140
-
141
- if para_debug_mode == "full":
142
- pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)
143
-
144
- """
145
- Detect and divide the level of the titles
146
- """
147
- titleProcessor = TitleProcessor()
148
-
149
- pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic)
150
-
151
- if para_debug_mode == "full":
152
- pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)
153
-
154
- """
155
- Detect and split paragraphs inside each block
156
- """
157
- blockInnerParasProcessor = BlockTerminationProcessor()
158
-
159
- pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)
160
-
161
- if para_debug_mode == "full":
162
- pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)
163
-
164
- # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full")
165
- # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
166
-
167
- """
168
- Detect and combine paragraphs from different blocks into one paragraph
169
- """
170
- blockContinuationProcessor = BlockContinuationProcessor()
171
-
172
- pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
173
- pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)
174
-
175
- if para_debug_mode == "full":
176
- pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)
177
-
178
- # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full")
179
- # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}")
180
-
181
- """
182
- Discard pdf files by checking exceptions and return the error info to the caller
183
- """
184
- discardByException = DiscardByException()
185
-
186
- is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
187
- pdf_dic, exception=DenseSingleLineBlockException()
188
- )
189
- is_discard_by_title_detection = discardByException.discard_by_title_detection(
190
- pdf_dic, exception=TitleDetectionException()
191
- )
192
- is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
193
- is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
194
- is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())
195
-
196
- """
197
- if any(
198
- info is not None
199
- for info in [
200
- is_discard_by_single_line_block,
201
- is_discard_by_title_detection,
202
- is_discard_by_title_level,
203
- is_discard_by_split_para,
204
- is_discard_by_merge_para,
205
- ]
206
- ):
207
- error_info = next(
208
- (
209
- info
210
- for info in [
211
- is_discard_by_single_line_block,
212
- is_discard_by_title_detection,
213
- is_discard_by_title_level,
214
- is_discard_by_split_para,
215
- is_discard_by_merge_para,
216
- ]
217
- if info is not None
218
- ),
219
- None,
220
- )
221
- return pdf_dic, error_info
222
-
223
- if any(
224
- info is not None
225
- for info in [
226
- is_discard_by_single_line_block,
227
- is_discard_by_title_detection,
228
- is_discard_by_title_level,
229
- is_discard_by_split_para,
230
- is_discard_by_merge_para,
231
- ]
232
- ):
233
- error_info = next(
234
- (
235
- info
236
- for info in [
237
- is_discard_by_single_line_block,
238
- is_discard_by_title_detection,
239
- is_discard_by_title_level,
240
- is_discard_by_split_para,
241
- is_discard_by_merge_para,
242
- ]
243
- if info is not None
244
- ),
245
- None,
246
- )
247
- return pdf_dic, error_info
248
- """
249
-
250
- """
251
- Dump the final pdf_dic to a json file
252
- """
253
- if para_debug_mode is not None:
254
- with open(output_json_file, "w", encoding="utf-8") as f:
255
- json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
256
-
257
- """
258
- Draw the annotations
259
- """
260
-
261
- if is_discard_by_single_line_block is not None:
262
- error_info = is_discard_by_single_line_block
263
- elif is_discard_by_title_detection is not None:
264
- error_info = is_discard_by_title_detection
265
- elif is_discard_by_title_level is not None:
266
- error_info = is_discard_by_title_level
267
- elif is_discard_by_split_para is not None:
268
- error_info = is_discard_by_split_para
269
- elif is_discard_by_merge_para is not None:
270
- error_info = is_discard_by_merge_para
271
-
272
- if error_info is not None:
273
- return pdf_dic, error_info
274
-
275
- """
276
- Dump the final pdf_dic to a json file
277
- """
278
- if para_debug_mode is not None:
279
- with open(output_json_file, "w", encoding="utf-8") as f:
280
- json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
281
-
282
- """
283
- Draw the annotations
284
- """
285
- if para_debug_mode is not None:
286
- drawAnnos = DrawAnnos()
287
- drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)
288
-
289
- """
290
- Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
291
- """
292
- if para_debug_mode is not None:
293
- for fpath in os.listdir(output_dir):
294
- if fpath.endswith(".json") and "stage" in fpath:
295
- os.remove(os.path.join(output_dir, fpath))
296
-
297
- return pdf_dic, error_info
File without changes