magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. magic_pdf/dict2md/ocr_mkcontent.py +20 -7
  2. magic_pdf/libs/config_reader.py +28 -10
  3. magic_pdf/libs/language.py +12 -0
  4. magic_pdf/libs/version.py +1 -1
  5. magic_pdf/model/__init__.py +1 -1
  6. magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
  7. magic_pdf/model/magic_model.py +49 -41
  8. magic_pdf/model/pdf_extract_kit.py +155 -60
  9. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
  10. magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
  11. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
  12. magic_pdf/model/pp_structure_v2.py +1 -1
  13. magic_pdf/pdf_parse_union_core.py +4 -2
  14. magic_pdf/pre_proc/citationmarker_remove.py +5 -1
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
  16. magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
  17. magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
  18. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
  19. magic_pdf/resources/model_config/model_configs.yaml +4 -0
  20. magic_pdf/rw/AbsReaderWriter.py +1 -18
  21. magic_pdf/rw/DiskReaderWriter.py +32 -24
  22. magic_pdf/rw/S3ReaderWriter.py +83 -48
  23. magic_pdf/tools/cli.py +79 -0
  24. magic_pdf/tools/cli_dev.py +156 -0
  25. magic_pdf/tools/common.py +119 -0
  26. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
  27. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
  28. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
  29. magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
  30. magic_pdf/cli/magicpdf.py +0 -337
  31. magic_pdf/pdf_parse_for_train.py +0 -685
  32. magic_pdf/train_utils/convert_to_train_format.py +0 -65
  33. magic_pdf/train_utils/extract_caption.py +0 -59
  34. magic_pdf/train_utils/remove_footer_header.py +0 -159
  35. magic_pdf/train_utils/vis_utils.py +0 -327
  36. magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
  37. /magic_pdf/libs/{math.py → local_math.py} +0 -0
  38. /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
  39. /magic_pdf/{train_utils → tools}/__init__.py +0 -0
  40. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
  41. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py DELETED
@@ -1,337 +0,0 @@
1
- """
2
- 这里实现2个click命令:
3
- 第一个:
4
- 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
5
- 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。
6
- 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf
7
- 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图
8
- 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
9
-
10
- 最后把以上步骤准备好的对象传入真正的解析API
11
-
12
- 第二个:
13
- 接收1)pdf的本地路径。2)模型json文件(可选)。然后:
14
- 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图
15
- 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件
16
- 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入
17
-
18
-
19
- 效果:
20
- python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
21
- python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf
22
- """
23
-
24
- import os
25
- import json as json_parse
26
- import click
27
- from loguru import logger
28
- from pathlib import Path
29
- from magic_pdf.libs.version import __version__
30
-
31
- from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
32
- from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
33
- from magic_pdf.pipe.UNIPipe import UNIPipe
34
- from magic_pdf.pipe.OCRPipe import OCRPipe
35
- from magic_pdf.pipe.TXTPipe import TXTPipe
36
- from magic_pdf.libs.path_utils import (
37
- parse_s3path,
38
- parse_s3_range_params,
39
- remove_non_official_s3_args,
40
- )
41
- from magic_pdf.libs.config_reader import (
42
- get_local_dir,
43
- get_s3_config,
44
- )
45
- from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
46
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
47
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
48
- import csv
49
- import copy
50
- import magic_pdf.model as model_config
51
-
52
- parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
53
-
54
-
55
- def prepare_env(pdf_file_name, method):
56
- local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
57
-
58
- local_image_dir = os.path.join(str(local_parent_dir), "images")
59
- local_md_dir = local_parent_dir
60
- os.makedirs(local_image_dir, exist_ok=True)
61
- os.makedirs(local_md_dir, exist_ok=True)
62
- return local_image_dir, local_md_dir
63
-
64
-
65
- def write_to_csv(csv_file_path, csv_data):
66
- with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
67
- # 创建csv writer对象
68
- csv_writer = csv.writer(csvfile)
69
- # 写入数据
70
- csv_writer.writerow(csv_data)
71
- logger.info(f"数据已成功追加到 '{csv_file_path}'")
72
-
73
-
74
- def do_parse(
75
- pdf_file_name,
76
- pdf_bytes,
77
- model_list,
78
- parse_method,
79
- f_draw_span_bbox=True,
80
- f_draw_layout_bbox=True,
81
- f_dump_md=True,
82
- f_dump_middle_json=True,
83
- f_dump_model_json=True,
84
- f_dump_orig_pdf=True,
85
- f_dump_content_list=True,
86
- f_make_md_mode=MakeMode.MM_MD,
87
- ):
88
- orig_model_list = copy.deepcopy(model_list)
89
-
90
- local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
91
- logger.info(f"local output dir is {local_md_dir}")
92
- image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
93
- image_dir = str(os.path.basename(local_image_dir))
94
-
95
- if parse_method == "auto":
96
- jso_useful_key = {"_pdf_type": "", "model_list": model_list}
97
- pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
98
- elif parse_method == "txt":
99
- pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
100
- elif parse_method == "ocr":
101
- pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
102
- else:
103
- logger.error("unknown parse method")
104
- exit(1)
105
-
106
- pipe.pipe_classify()
107
-
108
- """如果没有传入有效的模型数据,则使用内置model解析"""
109
- if len(model_list) == 0:
110
- if model_config.__use_inside_model__:
111
- pipe.pipe_analyze()
112
- orig_model_list = copy.deepcopy(pipe.model_list)
113
- else:
114
- logger.error("need model list input")
115
- exit(1)
116
-
117
- pipe.pipe_parse()
118
- pdf_info = pipe.pdf_mid_data["pdf_info"]
119
- if f_draw_layout_bbox:
120
- draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
121
- if f_draw_span_bbox:
122
- draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
123
-
124
- md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
125
- if f_dump_md:
126
- """写markdown"""
127
- md_writer.write(
128
- content=md_content,
129
- path=f"{pdf_file_name}.md",
130
- mode=AbsReaderWriter.MODE_TXT,
131
- )
132
-
133
- if f_dump_middle_json:
134
- """写middle_json"""
135
- md_writer.write(
136
- content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
137
- path=f"{pdf_file_name}_middle.json",
138
- mode=AbsReaderWriter.MODE_TXT,
139
- )
140
-
141
- if f_dump_model_json:
142
- """写model_json"""
143
- md_writer.write(
144
- content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
145
- path=f"{pdf_file_name}_model.json",
146
- mode=AbsReaderWriter.MODE_TXT,
147
- )
148
-
149
- if f_dump_orig_pdf:
150
- """写源pdf"""
151
- md_writer.write(
152
- content=pdf_bytes,
153
- path=f"{pdf_file_name}_origin.pdf",
154
- mode=AbsReaderWriter.MODE_BIN,
155
- )
156
-
157
- content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
158
- if f_dump_content_list:
159
- """写content_list"""
160
- md_writer.write(
161
- content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
162
- path=f"{pdf_file_name}_content_list.json",
163
- mode=AbsReaderWriter.MODE_TXT,
164
- )
165
-
166
-
167
- @click.group()
168
- @click.version_option(__version__, "--version", "-v", help="显示版本信息")
169
- @click.help_option("--help", "-h", help="显示帮助信息")
170
- def cli():
171
- pass
172
-
173
-
174
- @cli.command()
175
- @click.option("--json", type=str, help="输入一个S3路径")
176
- @click.option(
177
- "--method",
178
- type=parse_pdf_methods,
179
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
180
- default="auto",
181
- )
182
- @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
183
- @click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
184
- def json_command(json, method, inside_model, model_mode):
185
- model_config.__use_inside_model__ = inside_model
186
- model_config.__model_mode__ = model_mode
187
-
188
- if not json.startswith("s3://"):
189
- logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
190
- exit(1)
191
-
192
- def read_s3_path(s3path):
193
- bucket, key = parse_s3path(s3path)
194
-
195
- s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
196
- s3_rw = S3ReaderWriter(
197
- s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
198
- )
199
- may_range_params = parse_s3_range_params(s3path)
200
- if may_range_params is None or 2 != len(may_range_params):
201
- byte_start, byte_end = 0, None
202
- else:
203
- byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
204
- byte_end += byte_start - 1
205
- return s3_rw.read_jsonl(
206
- remove_non_official_s3_args(s3path),
207
- byte_start,
208
- byte_end,
209
- AbsReaderWriter.MODE_BIN,
210
- )
211
-
212
- jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
213
- s3_file_path = jso.get("file_location")
214
- if s3_file_path is None:
215
- s3_file_path = jso.get("path")
216
- pdf_file_name = Path(s3_file_path).stem
217
- pdf_data = read_s3_path(s3_file_path)
218
-
219
- do_parse(
220
- pdf_file_name,
221
- pdf_data,
222
- jso["doc_layout_result"],
223
- method,
224
- )
225
-
226
-
227
- @cli.command()
228
- @click.option("--local_json", type=str, help="输入一个本地jsonl路径")
229
- @click.option(
230
- "--method",
231
- type=parse_pdf_methods,
232
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
233
- default="auto",
234
- )
235
- @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
236
- @click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
237
- def local_json_command(local_json, method, inside_model, model_mode):
238
- model_config.__use_inside_model__ = inside_model
239
- model_config.__model_mode__ = model_mode
240
-
241
- def read_s3_path(s3path):
242
- bucket, key = parse_s3path(s3path)
243
-
244
- s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
245
- s3_rw = S3ReaderWriter(
246
- s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
247
- )
248
- may_range_params = parse_s3_range_params(s3path)
249
- if may_range_params is None or 2 != len(may_range_params):
250
- byte_start, byte_end = 0, None
251
- else:
252
- byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
253
- byte_end += byte_start - 1
254
- return s3_rw.read_jsonl(
255
- remove_non_official_s3_args(s3path),
256
- byte_start,
257
- byte_end,
258
- AbsReaderWriter.MODE_BIN,
259
- )
260
-
261
- with open(local_json, "r", encoding="utf-8") as f:
262
- for json_line in f:
263
- jso = json_parse.loads(json_line)
264
-
265
- s3_file_path = jso.get("file_location")
266
- if s3_file_path is None:
267
- s3_file_path = jso.get("path")
268
- pdf_file_name = Path(s3_file_path).stem
269
- pdf_data = read_s3_path(s3_file_path)
270
- do_parse(
271
- pdf_file_name,
272
- pdf_data,
273
- jso["doc_layout_result"],
274
- method,
275
- )
276
-
277
-
278
- @cli.command()
279
- @click.option(
280
- "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
281
- )
282
- @click.option("--model", type=click.Path(exists=True), help="模型的路径")
283
- @click.option(
284
- "--method",
285
- type=parse_pdf_methods,
286
- help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
287
- default="auto",
288
- )
289
- @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
290
- @click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢")
291
- def pdf_command(pdf, model, method, inside_model, model_mode):
292
- model_config.__use_inside_model__ = inside_model
293
- model_config.__model_mode__ = model_mode
294
-
295
- def read_fn(path):
296
- disk_rw = DiskReaderWriter(os.path.dirname(path))
297
- return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
298
-
299
- pdf_data = read_fn(pdf)
300
-
301
- def get_model_json(model_path):
302
- # 这里处理pdf和模型相关的逻辑
303
- if model_path is None:
304
- file_name_without_extension, extension = os.path.splitext(pdf)
305
- if extension == ".pdf":
306
- model_path = file_name_without_extension + ".json"
307
- else:
308
- raise Exception("pdf_path input error")
309
- if not os.path.exists(model_path):
310
- logger.warning(
311
- f"not found json {model_path} existed"
312
- )
313
- # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle
314
- model_json = "[]"
315
- else:
316
- model_json = read_fn(model_path).decode("utf-8")
317
- else:
318
- model_json = read_fn(model_path).decode("utf-8")
319
-
320
- return model_json
321
-
322
- jso = json_parse.loads(get_model_json(model))
323
- pdf_file_name = Path(pdf).stem
324
-
325
- do_parse(
326
- pdf_file_name,
327
- pdf_data,
328
- jso,
329
- method,
330
- )
331
-
332
-
333
- if __name__ == "__main__":
334
- """
335
- python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
336
- """
337
- cli()