magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
@@ -0,0 +1,356 @@
1
+ from loguru import logger
2
+
3
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4
+ from magic_pdf.libs.commons import join_path
5
+ from magic_pdf.libs.language import detect_lang
6
+ from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
7
+ from magic_pdf.libs.ocr_content_type import ContentType, BlockType
8
+ import wordninja
9
+ import re
10
+
11
+
12
def split_long_words(text, max_word_len=15):
    """Break over-long word tokens in ``text`` into shorter words.

    The text is split on single spaces; each piece is tokenised into runs of
    word characters and single non-word characters. Any token longer than
    ``max_word_len`` is passed to ``wordninja.split`` and the resulting parts
    are re-joined with spaces. All other tokens are kept unchanged, so text
    without long tokens is returned as-is.

    Args:
        text: The string to process.
        max_word_len: Tokens strictly longer than this are split (default 15).

    Returns:
        The processed string, with the original single-space separators
        restored between pieces.
    """
    segments = text.split(' ')
    for seg_idx, segment in enumerate(segments):
        tokens = re.findall(r'\w+|[^\w]', segment, re.UNICODE)
        segments[seg_idx] = ''.join(
            ' '.join(wordninja.split(token)) if len(token) > max_word_len else token
            for token in tokens
        )
    return ' '.join(segments)
21
+
22
+
23
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Render every page's ``para_blocks`` to multimodal ("mm") markdown.

    Args:
        pdf_info_list: List of per-page dicts; each page's ``"para_blocks"``
            entry is rendered with ``ocr_mk_markdown_with_para_core_v2``.
        img_buket_path: Path prefix forwarded to the renderer for image links.

    Returns:
        One markdown string with all rendered paragraphs joined by blank lines.
    """
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # A page without para_blocks (missing key, None, or empty list)
            # contributes nothing; skipping avoids iterating over None.
            continue
        markdown.extend(ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path))
    return '\n\n'.join(markdown)
30
+
31
+
32
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Render every page's ``para_blocks`` to text-only ("nlp") markdown.

    Args:
        pdf_info_dict: List of per-page dicts (despite the name, this is
            iterated as a list); each page's ``"para_blocks"`` entry is
            rendered with ``ocr_mk_markdown_with_para_core_v2`` in "nlp" mode.

    Returns:
        One markdown string with all rendered paragraphs joined by blank lines.
    """
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            # A page without para_blocks (missing key, None, or empty list)
            # contributes nothing; skipping avoids iterating over None.
            continue
        markdown.extend(ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)
39
+
40
+
41
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
    """Render multimodal markdown page by page, keeping the page number.

    Args:
        pdf_info_dict: List of per-page dicts (iterated as a list); each
            page's ``"para_blocks"`` entry is rendered in "mm" mode.
        img_buket_path: Path prefix forwarded to the renderer for image links.

    Returns:
        A list of ``{'page_no': int, 'md_content': str}`` dicts, one per page
        that has para_blocks. ``page_no`` is the zero-based position of the
        page in ``pdf_info_dict``, so it stays aligned with the source page
        even when earlier pages were skipped for having no content.
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in enumerate(pdf_info_dict):
        paras_of_layout = page_info.get("para_blocks")
        if not paras_of_layout:
            continue
        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': '\n\n'.join(page_markdown)
        })
    return markdown_with_para_and_pagination
55
+
56
+
57
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
    """Render nested layout -> paragraph -> line -> span data to markdown.

    Args:
        paras_of_layout: Iterable of layouts; each layout is an iterable of
            paragraphs, each paragraph an iterable of line dicts with a
            ``'spans'`` list.
        mode: ``'mm'`` emits image links for image/table spans; ``'nlp'``
            emits nothing for them.
        img_buket_path: Path prefix joined with each span's ``'image_path'``.

    Returns:
        A list with one markdown string per non-blank paragraph.
    """
    rendered_paras = []
    for layout_paras in paras_of_layout:
        for para in layout_paras:
            para_text = ''
            for line in para:
                for span in line['spans']:
                    kind = span.get('type')
                    piece = ''
                    lang = ''
                    if kind == ContentType.Text:
                        raw = span['content']
                        lang = detect_lang(raw)
                        # Only English text gets long-word splitting;
                        # segmenting Chinese this way would lose text.
                        if lang == 'en':
                            raw = split_long_words(raw)
                        piece = ocr_escape_special_markdown_char(raw)
                    elif kind == ContentType.InlineEquation:
                        piece = f"${span['content']}$"
                    elif kind == ContentType.InterlineEquation:
                        piece = f"\n$$\n{span['content']}\n$$\n"
                    elif kind in (ContentType.Image, ContentType.Table):
                        if mode == 'mm':
                            piece = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                    if piece == '':
                        continue
                    # English pieces are separated by a space; everything
                    # else (e.g. Chinese) is concatenated directly.
                    para_text += (piece + ' ') if lang == 'en' else piece
            stripped = para_text.strip()
            if stripped != '':
                rendered_paras.append(stripped + ' ')
    return rendered_paras
93
+
94
+
95
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
    """Render a page's typed paragraph blocks to a list of markdown strings.

    Args:
        paras_of_layout: Iterable of block dicts, each with a ``'type'`` key.
            Image and table blocks also carry a ``'blocks'`` list of
            sub-blocks.
        mode: ``'nlp'`` drops image and table blocks entirely; ``'mm'``
            renders them as image links plus their caption/footnote text.
        img_buket_path: Path prefix joined with each span's ``'image_path'``.

    Returns:
        A list with one markdown string per block that produced non-blank
        text.
    """
    page_markdown = []
    for para_block in paras_of_layout:
        block_type = para_block['type']
        text = ''
        if block_type == BlockType.Text:
            text = merge_para_with_text(para_block)
        elif block_type == BlockType.Title:
            text = f"# {merge_para_with_text(para_block)}"
        elif block_type == BlockType.InterlineEquation:
            text = merge_para_with_text(para_block)
        elif block_type == BlockType.Image:
            if mode == 'nlp':
                continue
            if mode == 'mm':
                children = para_block['blocks']
                # 1st: the image body links
                for child in children:
                    if child['type'] != BlockType.ImageBody:
                        continue
                    for line in child['lines']:
                        for span in line['spans']:
                            if span['type'] == ContentType.Image:
                                text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # 2nd: the image caption text
                for child in children:
                    if child['type'] == BlockType.ImageCaption:
                        text += merge_para_with_text(child)
        elif block_type == BlockType.Table:
            if mode == 'nlp':
                continue
            if mode == 'mm':
                children = para_block['blocks']
                # 1st: the table caption text
                for child in children:
                    if child['type'] == BlockType.TableCaption:
                        text += merge_para_with_text(child)
                # 2nd: the table body links
                for child in children:
                    if child['type'] != BlockType.TableBody:
                        continue
                    for line in child['lines']:
                        for span in line['spans']:
                            if span['type'] == ContentType.Table:
                                text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                # 3rd: the table footnote text
                for child in children:
                    if child['type'] == BlockType.TableFootnote:
                        text += merge_para_with_text(child)

        text = text.strip()
        if text == '':
            continue
        page_markdown.append(text + ' ')

    return page_markdown
142
+
143
+
144
def merge_para_with_text(para_block):
    """Concatenate the text and equation spans of a block into one string.

    Args:
        para_block: Dict with a ``'lines'`` list; each line has a ``'spans'``
            list of dicts carrying ``'type'`` and ``'content'``.

    Returns:
        The merged text. Text spans are markdown-escaped, inline equations
        are wrapped in ``$...$`` and interline equations in ``$$`` fences.
        Spans of any other type contribute nothing.
    """
    pieces = []
    for line in para_block['lines']:
        for span in line['spans']:
            kind = span['type']
            piece = ''
            lang = ''
            if kind == ContentType.Text:
                raw = span['content']
                lang = detect_lang(raw)
                # Only English text gets long-word splitting; segmenting
                # Chinese this way would lose text.
                if lang == 'en':
                    raw = split_long_words(raw)
                piece = ocr_escape_special_markdown_char(raw)
            elif kind == ContentType.InlineEquation:
                piece = f"${span['content']}$"
            elif kind == ContentType.InterlineEquation:
                piece = f"\n$$\n{span['content']}\n$$\n"

            if piece == '':
                continue
            # Chinese pieces are joined directly; anything else is followed
            # by a separating space.
            pieces.append(piece if 'zh' in lang else piece + ' ')
    return ''.join(pieces)
169
+
170
+
171
def para_to_standard_format(para, img_buket_path):
    """Convert a paragraph (a list of line dicts) into a content dict.

    Args:
        para: List of line dicts, each with a ``'spans'`` list.
        img_buket_path: Path prefix, used only when the paragraph has a
            single line and is delegated to ``line_to_standard_format``.

    Returns:
        ``{}`` for an empty paragraph, the single line's content dict for a
        one-line paragraph, otherwise a ``'text'`` dict holding the merged
        text and the number of inline equations found.
    """
    line_count = len(para)
    if line_count == 0:
        return {}
    if line_count == 1:
        return line_to_standard_format(para[0], img_buket_path)

    merged_text = ''
    equation_count = 0
    for line in para:
        for span in line['spans']:
            lang = ''
            kind = span.get('type')
            piece = ""
            if kind == ContentType.Text:
                raw = span['content']
                lang = detect_lang(raw)
                # Only English text gets long-word splitting; segmenting
                # Chinese this way would lose text.
                if lang == 'en':
                    raw = split_long_words(raw)
                piece = ocr_escape_special_markdown_char(raw)
            elif kind == ContentType.InlineEquation:
                piece = f"${span['content']}$"
                equation_count += 1

            # English pieces are separated by a space; everything else is
            # concatenated directly.
            merged_text += (piece + ' ') if lang == 'en' else piece
    return {
        'type': 'text',
        'text': merged_text,
        'inline_equation_num': equation_count
    }
204
+
205
+
206
def para_to_standard_format_v2(para_block, img_buket_path):
    """Convert one typed paragraph block into a standard-format content dict.

    Args:
        para_block: Block dict with a ``'type'`` key. Text, title and
            interline-equation blocks are merged via ``merge_para_with_text``;
            image and table blocks carry a ``'blocks'`` list of sub-blocks.
        img_buket_path: Path prefix joined with the body's ``'image_path'``.

    Returns:
        A dict describing the block (``'type'`` plus text / image path /
        caption / footnote fields depending on the block type). A block whose
        type is not handled here yields an empty dict instead of failing on
        an unbound local.
    """
    para_type = para_block['type']
    # Default result so an unrecognised block type cannot leave the name unbound.
    para_content = {}
    if para_type == BlockType.Text:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': merge_para_with_text(para_block),
            'text_level': 1
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': merge_para_with_text(para_block),
            'text_format': "latex"
        }
    elif para_type == BlockType.Image:
        para_content = {
            'type': 'image',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                # The image path is read from the first span of the first line.
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.ImageCaption:
                para_content['img_caption'] = merge_para_with_text(block)
    elif para_type == BlockType.Table:
        para_content = {
            'type': 'table',
        }
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                # The table image path is read from the first span of the first line.
                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
            if block['type'] == BlockType.TableCaption:
                para_content['table_caption'] = merge_para_with_text(block)
            if block['type'] == BlockType.TableFootnote:
                para_content['table_footnote'] = merge_para_with_text(block)

    return para_content
247
+
248
+
249
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Build the standard-format content list for a whole document.

    Args:
        pdf_info_dict: List of per-page dicts (iterated as a list).
        img_buket_path: Path prefix forwarded to
            ``para_to_standard_format_v2``.

    Returns:
        A flat list of content dicts, one per paragraph block, in page order.
        Pages with no ``"para_blocks"`` are skipped.
    """
    content_list = []
    for page_info in pdf_info_dict:
        page_blocks = page_info.get("para_blocks")
        if page_blocks:
            content_list.extend(
                para_to_standard_format_v2(block, img_buket_path)
                for block in page_blocks
            )
    return content_list
259
+
260
+
261
def line_to_standard_format(line, img_buket_path):
    """Convert one line dict into a standard-format content dict.

    Args:
        line: Dict with a ``'spans'`` list.
        img_buket_path: Path prefix joined with a span's ``'image_path'``.

    Returns:
        The first image, table or interline-equation span found ends the scan
        and is returned as its own dict. Otherwise a ``'text'`` dict is
        returned with the line's text spans and inline equations concatenated,
        plus the inline-equation count.
    """
    text_parts = []
    equation_count = 0
    for span in line['spans']:
        if span.get('content'):
            kind = span['type']
            if kind == ContentType.InterlineEquation:
                return {
                    'type': 'equation',
                    'latex': f"$$\n{span['content']}\n$$"
                }
            if kind == ContentType.InlineEquation:
                text_parts.append(f"${span['content']}$")
                equation_count += 1
            elif kind == ContentType.Text:
                # escape special symbols
                text_parts.append(ocr_escape_special_markdown_char(span['content']))
        elif span.get('image_path'):
            kind = span['type']
            if kind == ContentType.Image:
                return {
                    'type': 'image',
                    'img_path': join_path(img_buket_path, span['image_path'])
                }
            if kind == ContentType.Table:
                return {
                    'type': 'table',
                    'img_path': join_path(img_buket_path, span['image_path'])
                }
    return {
        'type': 'text',
        'text': ''.join(text_parts),
        'inline_equation_num': equation_count
    }
302
+
303
+
304
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path: str = ""):
    """Build a line-level standard-format content list from ``preproc_blocks``.

    Each entry of the returned content_list may carry:
        type      string  image/text/table/equation (interline equations are
                          emitted on their own; inline ones are merged into text)
        latex     string  LaTeX text field.
        text      string  plain-text data.
        md        string  markdown-formatted text data.
        img_path  string  s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: List of per-page dicts (iterated as a list).
        img_buket_path: Path prefix forwarded to ``line_to_standard_format``,
            which requires it. Defaults to ``""`` so existing one-argument
            callers keep working.

    Returns:
        A flat list with one content dict per line of every block. Pages with
        no ``"preproc_blocks"`` are skipped.
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                content_list.append(line_to_standard_format(line, img_buket_path))
    return content_list
323
+
324
+
325
def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
    """Produce the final output for a document in the requested make mode.

    Args:
        pdf_info_dict: List of per-page dicts (iterated as a list).
        make_mode: One of the ``MakeMode`` values selecting the output kind.
        drop_mode: One of the ``DropMode`` values deciding what happens to a
            page flagged with ``"need_drop"``.
        img_buket_path: Path prefix forwarded to the renderers.

    Returns:
        A markdown string for the MM_MD and NLP_MD modes, a list of content
        dicts for STANDARD_FORMAT, and ``None`` for any other ``make_mode``.

    Raises:
        Exception: When a flagged page is met with ``DropMode.WHOLE_PDF``, or
            with a ``drop_mode`` that matches none of the handled values.
    """
    output_content = []
    for page_info in pdf_info_dict:
        if page_info.get("need_drop", False):
            drop_reason = page_info.get("drop_reason")
            # DropMode.NONE keeps the flagged page and processes it normally.
            if drop_mode != DropMode.NONE:
                if drop_mode == DropMode.WHOLE_PDF:
                    raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
                if drop_mode == DropMode.SINGLE_PAGE:
                    logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
                    continue
                raise Exception(f"drop_mode can not be null")

        page_blocks = page_info.get("para_blocks")
        if not page_blocks:
            continue
        if make_mode == MakeMode.MM_MD:
            output_content.extend(
                ocr_mk_markdown_with_para_core_v2(page_blocks, "mm", img_buket_path)
            )
        elif make_mode == MakeMode.NLP_MD:
            output_content.extend(
                ocr_mk_markdown_with_para_core_v2(page_blocks, "nlp")
            )
        elif make_mode == MakeMode.STANDARD_FORMAT:
            for para_block in page_blocks:
                output_content.append(para_to_standard_format_v2(para_block, img_buket_path))

    if make_mode in (MakeMode.MM_MD, MakeMode.NLP_MD):
        return '\n\n'.join(output_content)
    if make_mode == MakeMode.STANDARD_FORMAT:
        return output_content
    return None
File without changes