magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. magic_pdf/__init__.py +0 -0
  2. magic_pdf/cli/__init__.py +0 -0
  3. magic_pdf/cli/magicpdf.py +294 -0
  4. magic_pdf/dict2md/__init__.py +0 -0
  5. magic_pdf/dict2md/mkcontent.py +397 -0
  6. magic_pdf/dict2md/ocr_mkcontent.py +356 -0
  7. magic_pdf/filter/__init__.py +0 -0
  8. magic_pdf/filter/pdf_classify_by_type.py +381 -0
  9. magic_pdf/filter/pdf_meta_scan.py +368 -0
  10. magic_pdf/layout/__init__.py +0 -0
  11. magic_pdf/layout/bbox_sort.py +681 -0
  12. magic_pdf/layout/layout_det_utils.py +182 -0
  13. magic_pdf/layout/layout_sort.py +732 -0
  14. magic_pdf/layout/layout_spiler_recog.py +101 -0
  15. magic_pdf/layout/mcol_sort.py +336 -0
  16. magic_pdf/libs/Constants.py +11 -0
  17. magic_pdf/libs/MakeContentConfig.py +10 -0
  18. magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
  19. magic_pdf/libs/__init__.py +0 -0
  20. magic_pdf/libs/boxbase.py +408 -0
  21. magic_pdf/libs/calc_span_stats.py +239 -0
  22. magic_pdf/libs/commons.py +204 -0
  23. magic_pdf/libs/config_reader.py +63 -0
  24. magic_pdf/libs/convert_utils.py +5 -0
  25. magic_pdf/libs/coordinate_transform.py +9 -0
  26. magic_pdf/libs/detect_language_from_model.py +21 -0
  27. magic_pdf/libs/draw_bbox.py +227 -0
  28. magic_pdf/libs/drop_reason.py +27 -0
  29. magic_pdf/libs/drop_tag.py +19 -0
  30. magic_pdf/libs/hash_utils.py +15 -0
  31. magic_pdf/libs/json_compressor.py +27 -0
  32. magic_pdf/libs/language.py +31 -0
  33. magic_pdf/libs/markdown_utils.py +31 -0
  34. magic_pdf/libs/math.py +9 -0
  35. magic_pdf/libs/nlp_utils.py +203 -0
  36. magic_pdf/libs/ocr_content_type.py +21 -0
  37. magic_pdf/libs/path_utils.py +23 -0
  38. magic_pdf/libs/pdf_image_tools.py +33 -0
  39. magic_pdf/libs/safe_filename.py +11 -0
  40. magic_pdf/libs/textbase.py +33 -0
  41. magic_pdf/libs/version.py +1 -0
  42. magic_pdf/libs/vis_utils.py +308 -0
  43. magic_pdf/model/__init__.py +0 -0
  44. magic_pdf/model/doc_analyze_by_360layout.py +8 -0
  45. magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
  46. magic_pdf/model/magic_model.py +632 -0
  47. magic_pdf/para/__init__.py +0 -0
  48. magic_pdf/para/block_continuation_processor.py +562 -0
  49. magic_pdf/para/block_termination_processor.py +480 -0
  50. magic_pdf/para/commons.py +222 -0
  51. magic_pdf/para/denoise.py +246 -0
  52. magic_pdf/para/draw.py +121 -0
  53. magic_pdf/para/exceptions.py +198 -0
  54. magic_pdf/para/layout_match_processor.py +40 -0
  55. magic_pdf/para/para_pipeline.py +297 -0
  56. magic_pdf/para/para_split.py +644 -0
  57. magic_pdf/para/para_split_v2.py +772 -0
  58. magic_pdf/para/raw_processor.py +207 -0
  59. magic_pdf/para/stats.py +268 -0
  60. magic_pdf/para/title_processor.py +1014 -0
  61. magic_pdf/pdf_parse_by_ocr.py +219 -0
  62. magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
  63. magic_pdf/pdf_parse_by_txt.py +410 -0
  64. magic_pdf/pdf_parse_by_txt_v2.py +56 -0
  65. magic_pdf/pdf_parse_for_train.py +685 -0
  66. magic_pdf/pdf_parse_union_core.py +241 -0
  67. magic_pdf/pipe/AbsPipe.py +112 -0
  68. magic_pdf/pipe/OCRPipe.py +28 -0
  69. magic_pdf/pipe/TXTPipe.py +29 -0
  70. magic_pdf/pipe/UNIPipe.py +83 -0
  71. magic_pdf/pipe/__init__.py +0 -0
  72. magic_pdf/post_proc/__init__.py +0 -0
  73. magic_pdf/post_proc/detect_para.py +3472 -0
  74. magic_pdf/post_proc/pdf_post_filter.py +67 -0
  75. magic_pdf/post_proc/remove_footnote.py +153 -0
  76. magic_pdf/pre_proc/__init__.py +0 -0
  77. magic_pdf/pre_proc/citationmarker_remove.py +157 -0
  78. magic_pdf/pre_proc/construct_page_dict.py +72 -0
  79. magic_pdf/pre_proc/cut_image.py +71 -0
  80. magic_pdf/pre_proc/detect_equation.py +134 -0
  81. magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
  82. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
  83. magic_pdf/pre_proc/detect_footnote.py +170 -0
  84. magic_pdf/pre_proc/detect_header.py +64 -0
  85. magic_pdf/pre_proc/detect_images.py +647 -0
  86. magic_pdf/pre_proc/detect_page_number.py +64 -0
  87. magic_pdf/pre_proc/detect_tables.py +62 -0
  88. magic_pdf/pre_proc/equations_replace.py +559 -0
  89. magic_pdf/pre_proc/fix_image.py +244 -0
  90. magic_pdf/pre_proc/fix_table.py +270 -0
  91. magic_pdf/pre_proc/main_text_font.py +23 -0
  92. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
  93. magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
  94. magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
  95. magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
  96. magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
  97. magic_pdf/pre_proc/post_layout_split.py +0 -0
  98. magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
  99. magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
  100. magic_pdf/pre_proc/remove_footer_header.py +117 -0
  101. magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
  102. magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
  103. magic_pdf/pre_proc/solve_line_alien.py +29 -0
  104. magic_pdf/pre_proc/statistics.py +12 -0
  105. magic_pdf/rw/AbsReaderWriter.py +34 -0
  106. magic_pdf/rw/DiskReaderWriter.py +66 -0
  107. magic_pdf/rw/S3ReaderWriter.py +107 -0
  108. magic_pdf/rw/__init__.py +0 -0
  109. magic_pdf/spark/__init__.py +0 -0
  110. magic_pdf/spark/spark_api.py +51 -0
  111. magic_pdf/train_utils/__init__.py +0 -0
  112. magic_pdf/train_utils/convert_to_train_format.py +65 -0
  113. magic_pdf/train_utils/extract_caption.py +59 -0
  114. magic_pdf/train_utils/remove_footer_header.py +159 -0
  115. magic_pdf/train_utils/vis_utils.py +327 -0
  116. magic_pdf/user_api.py +136 -0
  117. magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
  118. magic_pdf-0.5.4.dist-info/METADATA +24 -0
  119. magic_pdf-0.5.4.dist-info/RECORD +121 -0
  120. magic_pdf-0.5.4.dist-info/WHEEL +5 -0
  121. magic_pdf-0.5.4.dist-info/top_level.txt +1 -0
magic_pdf/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,294 @@
1
+ """
2
+ 这里实现2个click命令:
3
+ 第一个:
4
+ 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
5
+ 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
6
+ 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
7
+ 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
8
+ 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
9
+
10
+ 最后把以上步骤准备好的对象传入真正的解析API
11
+
12
+ 第二个:
13
+ 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
14
+ 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
15
+ 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
16
+ 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
17
+
18
+
19
+ 效果:
20
+ python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
21
+ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf
22
+ """
23
+
24
+ import os
25
+ import json as json_parse
26
+ import sys
27
+ import click
28
+ from loguru import logger
29
+ from pathlib import Path
30
+ from magic_pdf.libs.version import __version__
31
+
32
+ from magic_pdf.libs.MakeContentConfig import DropMode
33
+ from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
34
+ from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
35
+ from magic_pdf.pipe.UNIPipe import UNIPipe
36
+ from magic_pdf.pipe.OCRPipe import OCRPipe
37
+ from magic_pdf.pipe.TXTPipe import TXTPipe
38
+ from magic_pdf.libs.config_reader import get_s3_config
39
+ from magic_pdf.libs.path_utils import (
40
+ parse_s3path,
41
+ parse_s3_range_params,
42
+ remove_non_official_s3_args,
43
+ )
44
+ from magic_pdf.libs.config_reader import get_local_dir
45
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
46
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
47
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
48
+ import csv
49
+
50
# Parsing strategies accepted by the --method option of the sub-commands:
# "txt" (text-based PDF), "ocr" (optical recognition) or "auto" (let the program choose).
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
51
+
52
+
53
def prepare_env(pdf_file_name, method):
    """Create the output directories for one document and return their paths.

    The layout is ``<local dir>/magic-pdf/<pdf_file_name>/<method>`` for the
    markdown output, with an ``images`` sub-directory beneath it.

    Args:
        pdf_file_name: Name of the document; used as a directory component.
        method: Parse method name; used as the innermost directory component.

    Returns:
        A ``(image_dir, md_dir)`` tuple of directory paths, both of which
        exist when the function returns.
    """
    md_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
    image_dir = os.path.join(str(md_dir), "images")

    # Same creation order as before: the image directory first, then its parent.
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)

    return image_dir, md_dir
63
+
64
+
65
def write_to_csv(csv_file_path, csv_data):
    """Append a single row to a CSV file and report success on stdout.

    Args:
        csv_file_path: Path of the CSV file; it is opened in append mode, so
            it is created when missing and extended otherwise.
        csv_data: Iterable of cell values forming the row to write.
    """
    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as handle:
        # newline='' leaves line termination entirely to the csv writer.
        csv.writer(handle).writerow(csv_data)
    print(f"数据已成功追加到 '{csv_file_path}'")
72
+
73
+
74
+ def _do_parse(pdf_file_name, pdf_bytes, model_list, parse_method, image_writer, md_writer, image_dir, local_md_dir):
75
+ if parse_method == "auto":
76
+ jso_useful_key = {
77
+ "_pdf_type": "",
78
+ "model_list": model_list
79
+ }
80
+ pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
81
+ elif parse_method == "txt":
82
+ pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
83
+ elif parse_method == "ocr":
84
+ pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
85
+ else:
86
+ print("unknow parse method")
87
+ sys.exit(1)
88
+
89
+ pipe.pipe_classify()
90
+
91
+ '''如果没有传入有效的模型数据,则使用内置paddle解析'''
92
+ if len(model_list) == 0:
93
+ pipe.pipe_analyze()
94
+
95
+ pipe.pipe_parse()
96
+ pdf_info = pipe.pdf_mid_data['pdf_info']
97
+ draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
98
+ draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
99
+
100
+ # write_to_csv(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\新建文件夹\luanma.csv",
101
+ # [pdf_file_name, pipe.pdf_mid_data['not_common_character_rate'], pipe.pdf_mid_data['not_printable_rate']])
102
+
103
+ md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
104
+ md_writer.write(
105
+ content=md_content, path=f"{pdf_file_name}.md", mode=AbsReaderWriter.MODE_TXT
106
+ )
107
+ md_writer.write(
108
+ content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
109
+ path=f"{pdf_file_name}.json",
110
+ mode=AbsReaderWriter.MODE_TXT,
111
+ )
112
+
113
+ content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
114
+ md_writer.write(
115
+ str(content_list), f"{pdf_file_name}.txt", AbsReaderWriter.MODE_TXT
116
+ )
117
+
118
+
119
# Root click command group; the sub-commands in this file register themselves
# on it through @cli.command(). It adds -v/--version (printing __version__)
# and -h/--help at the top level.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
@click.help_option("--help", "-h", help="显示帮助信息")
def cli():
    # Intentionally empty: the group only dispatches to its sub-commands.
    pass
124
+
125
+
126
# Sub-command: parse the PDF described by one record of a jsonl file on S3.
# Documented with comments rather than a docstring, because click shows a
# command function's docstring as that command's --help text.
@cli.command()
@click.option("--json", type=str, required=True, help="输入一个S3路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def json_command(json, method):
    # `json` is the S3 path of the record (it may carry a "?bytes=..." range);
    # `method` is one of the parse_pdf_methods choices.
    # The emptiness check keeps a direct call with json=None from crashing.
    if not json or not json.startswith("s3://"):
        print("usage: python magicpdf.py json-command --json s3://some_bucket/some_path")
        sys.exit(1)

    def read_s3_path(s3path):
        """Read the object (or byte range) addressed by *s3path* and return its bytes."""
        bucket, _key = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            # No usable range: read from the start with no explicit end.
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # Presumably the range is (offset, length); this converts it to an
            # inclusive end offset - TODO confirm against read_jsonl.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))

    # The PDF location is stored under "file_location", with "path" as fallback.
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        logger.error("json record has neither 'file_location' nor 'path'")
        sys.exit(1)

    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)

    _do_parse(
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
181
+
182
+
183
# Sub-command: parse every PDF listed in a local jsonl file, one record per line.
# Each record names its PDF on S3 and carries the model output for it.
# Documented with comments rather than a docstring, because click shows a
# command function's docstring as that command's --help text.
@cli.command()
@click.option("--local_json", type=str, required=True, help="输入一个本地jsonl路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def local_json_command(local_json, method):
    def read_s3_path(s3path):
        """Read the object (or byte range) addressed by *s3path* and return its bytes."""
        bucket, _key = parse_s3path(s3path)

        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
        clean_path = remove_non_official_s3_args(s3path)
        s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, "auto", clean_path)

        may_range_params = parse_s3_range_params(s3path)
        if may_range_params is None or len(may_range_params) != 2:
            # No usable range: read from the start with no explicit end.
            byte_start, byte_end = 0, None
        else:
            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
            # Presumably the range is (offset, length); this converts it to an
            # inclusive end offset - TODO confirm against read_jsonl.
            byte_end += byte_start - 1
        return s3_rw.read_jsonl(
            clean_path,
            byte_start,
            byte_end,
            AbsReaderWriter.MODE_BIN,
        )

    with open(local_json, "r", encoding="utf-8") as f:
        for json_line in f:
            # Blank lines are not records; json.loads would reject them.
            if not json_line.strip():
                continue
            jso = json_parse.loads(json_line)

            # The PDF location is stored under "file_location", with "path" as fallback.
            s3_file_path = jso.get("file_location")
            if s3_file_path is None:
                s3_file_path = jso.get("path")
            if s3_file_path is None:
                logger.warning("skip record with neither 'file_location' nor 'path'")
                continue

            pdf_file_name = Path(s3_file_path).stem
            pdf_data = read_s3_path(s3_file_path)
            local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)

            local_image_rw = DiskReaderWriter(local_image_dir)
            local_md_rw = DiskReaderWriter(local_md_dir)

            _do_parse(
                pdf_file_name,
                pdf_data,
                jso["doc_layout_result"],
                method,
                local_image_rw,
                local_md_rw,
                os.path.basename(local_image_dir),
                local_md_dir,
            )
237
+
238
+
239
# Sub-command: parse a PDF stored on the local disk, optionally with a model
# json file; without one, a json file next to the PDF is looked for.
# Documented with comments rather than a docstring, because click shows a
# command function's docstring as that command's --help text.
@cli.command()
@click.option(
    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
)
@click.option("--model", type=click.Path(exists=True), help="模型的路径")
@click.option(
    "--method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf_command(pdf, model, method):
    def read_fn(path):
        """Return the binary content of the local file at *path*."""
        disk_rw = DiskReaderWriter(os.path.dirname(path))
        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)

    pdf_data = read_fn(pdf)

    def get_model_json(model_path):
        """Return the model data as JSON text.

        With no explicit *model_path*, the file with the PDF's name and a
        ``.json`` suffix is used; when that file is missing, ``"[]"`` is
        returned so that the built-in paddle analysis runs downstream.
        """
        if model_path is None:
            # Swap only the final suffix. str.replace(".pdf", ".json") also
            # rewrote ".pdf" inside directory names and left "X.PDF"
            # untouched, which made the PDF itself get read as the model json.
            model_path = str(Path(pdf).with_suffix(".json"))
            if not os.path.exists(model_path):
                logger.warning(f"not found json {model_path} existed, use paddle analyze")
                return "[]"
        return read_fn(model_path).decode("utf-8")

    jso = json_parse.loads(get_model_json(model))
    pdf_file_name = Path(pdf).stem
    local_image_dir, local_md_dir = prepare_env(pdf_file_name, method)
    local_image_rw = DiskReaderWriter(local_image_dir)
    local_md_rw = DiskReaderWriter(local_md_dir)
    _do_parse(
        pdf_file_name,
        pdf_data,
        jso,
        method,
        local_image_rw,
        local_md_rw,
        os.path.basename(local_image_dir),
        local_md_dir,
    )
288
+
289
+
290
if __name__ == "__main__":
    # Example invocation:
    #   python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
    cli()
File without changes