magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. magic_pdf/dict2md/ocr_mkcontent.py +10 -3
  2. magic_pdf/libs/config_reader.py +10 -10
  3. magic_pdf/libs/version.py +1 -1
  4. magic_pdf/model/doc_analyze_by_custom_model.py +8 -2
  5. magic_pdf/model/magic_model.py +4 -0
  6. magic_pdf/model/pdf_extract_kit.py +45 -2
  7. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
  8. magic_pdf/resources/model_config/model_configs.yaml +4 -0
  9. magic_pdf/rw/AbsReaderWriter.py +1 -18
  10. magic_pdf/rw/DiskReaderWriter.py +32 -24
  11. magic_pdf/rw/S3ReaderWriter.py +83 -48
  12. magic_pdf/tools/cli.py +79 -0
  13. magic_pdf/tools/cli_dev.py +156 -0
  14. magic_pdf/tools/common.py +119 -0
  15. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +49 -31
  16. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +22 -24
  17. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
  18. magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
  19. magic_pdf/cli/magicpdf.py +0 -359
  20. magic_pdf/pdf_parse_for_train.py +0 -685
  21. magic_pdf/train_utils/convert_to_train_format.py +0 -65
  22. magic_pdf/train_utils/extract_caption.py +0 -59
  23. magic_pdf/train_utils/remove_footer_header.py +0 -159
  24. magic_pdf/train_utils/vis_utils.py +0 -327
  25. magic_pdf-0.6.2b1.dist-info/entry_points.txt +0 -2
  26. /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
  27. /magic_pdf/{train_utils → tools}/__init__.py +0 -0
  28. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
  29. {magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,156 @@
1
+ import os
2
+ import json as json_parse
3
+ import click
4
+ from pathlib import Path
5
+ from magic_pdf.libs.path_utils import (
6
+ parse_s3path,
7
+ parse_s3_range_params,
8
+ remove_non_official_s3_args,
9
+ )
10
+ from magic_pdf.libs.config_reader import (
11
+ get_s3_config,
12
+ )
13
+ from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
14
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
15
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
+ import magic_pdf.model as model_config
17
+ from magic_pdf.tools.common import parse_pdf_methods, do_parse
18
+ from magic_pdf.libs.version import __version__
19
+
20
+
21
def read_s3_path(s3path):
    """Fetch the bytes addressed by ``s3path`` through ``S3ReaderWriter``.

    Credentials and endpoint are looked up per bucket with
    ``get_s3_config``. If ``parse_s3_range_params`` yields exactly two
    values they are used as ``(start, length)``: the end offset handed to
    the reader is ``start + length - 1``. Otherwise the read starts at
    offset 0 with no end offset.

    Args:
        s3path: S3 path, optionally carrying range parameters.

    Returns:
        Whatever ``S3ReaderWriter.read_jsonl`` returns in ``MODE_BIN``.
    """
    bucket, _key = parse_s3path(s3path)
    access_key, secret_key, endpoint = get_s3_config(bucket)
    reader = S3ReaderWriter(
        access_key,
        secret_key,
        endpoint,
        "auto",
        remove_non_official_s3_args(s3path),
    )

    range_params = parse_s3_range_params(s3path)
    if range_params is not None and len(range_params) == 2:
        byte_start = int(range_params[0])
        length = int(range_params[1])
        # Turn (start, length) into the end offset passed to the reader.
        byte_end = length + byte_start - 1
    else:
        byte_start = 0
        byte_end = None

    return reader.read_jsonl(
        remove_non_official_s3_args(s3path),
        byte_start,
        byte_end,
        AbsReaderWriter.MODE_BIN,
    )
40
+
41
+
42
# Root click command group; the subcommands in this module register on it
# with @cli.command(). It also exposes the package version via --version / -v.
# A comment is used instead of a docstring because click would surface a
# docstring as the group's help text.
@click.group()
@click.version_option(__version__, "--version", "-v", help="显示版本信息")
def cli():
    pass
46
+
47
+
48
@cli.command()
@click.option(
    "-j",
    "--jsonl",
    "jsonl",
    type=str,
    help="输入 jsonl 路径，本地或者 s3 上的文件",
    required=True,
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
@click.option(
    "-o",
    "--output-dir",
    "output_dir",
    type=str,
    help="输出到本地目录",
    default="",
)
def jsonl(jsonl, method, output_dir):
    # Parse the PDF described by one jsonl record.
    #
    # The record is read from a local file (first line only) or from S3,
    # the PDF it points to is fetched with read_s3_path, and the record's
    # "doc_layout_result" is handed to do_parse as the model output.
    #
    # (Comments instead of a docstring: click would show a docstring as the
    # command's help text, changing the CLI output.)

    # This command always supplies model results itself, so the built-in
    # model is switched off.
    model_config.__use_inside_model__ = False

    # Default output location: an "output" directory next to the jsonl path.
    full_jsonl_path = os.path.realpath(jsonl)
    if output_dir == "":
        output_dir = os.path.join(os.path.dirname(full_jsonl_path), "output")

    if jsonl.startswith("s3://"):
        jso = json_parse.loads(read_s3_path(jsonl).decode("utf-8"))
    else:
        # Decode as UTF-8 explicitly, matching the S3 branch, instead of
        # relying on the platform's locale default.
        with open(jsonl, encoding="utf-8") as f:
            jso = json_parse.loads(f.readline())

    # The PDF location lives under "file_location", with "path" as fallback.
    s3_file_path = jso.get("file_location")
    if s3_file_path is None:
        s3_file_path = jso.get("path")
    if s3_file_path is None:
        # Fail with a readable CLI error rather than a TypeError from Path(None).
        raise click.ClickException(
            'jsonl record has neither a "file_location" nor a "path" field'
        )
    pdf_file_name = Path(s3_file_path).stem
    pdf_data = read_s3_path(s3_file_path)

    print(pdf_file_name, jso, method)
    do_parse(
        output_dir,
        pdf_file_name,
        pdf_data,
        jso["doc_layout_result"],
        method,
        f_dump_content_list=True,
    )
101
+
102
+
103
@cli.command()
@click.option(
    "-p",
    "--pdf",
    "pdf",
    type=click.Path(exists=True),
    required=True,
    help="本地 PDF 文件",
)
@click.option(
    "-j",
    "--json",
    "json_data",
    type=click.Path(exists=True),
    required=True,
    help="本地模型推理出的 json 数据",
)
@click.option(
    "-o", "--output-dir", "output_dir", type=str, help="本地输出目录", default=""
)
@click.option(
    "-m",
    "--method",
    "method",
    type=parse_pdf_methods,
    help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
    default="auto",
)
def pdf(pdf, json_data, output_dir, method):
    # Parse a local PDF using a local JSON file of model results.
    # (Comments rather than a docstring, so click's help output is unchanged.)

    # Model results come from the JSON file, never from the built-in model.
    model_config.__use_inside_model__ = False

    resolved_pdf = os.path.realpath(pdf)
    if output_dir == "":
        # Fall back to an "output" directory beside the PDF.
        pdf_parent = os.path.dirname(resolved_pdf)
        output_dir = os.path.join(pdf_parent, "output")

    def load_bytes(file_path):
        # Read a file in binary mode through DiskReaderWriter, rooted at the
        # file's parent directory.
        parent, leaf = os.path.split(file_path)
        return DiskReaderWriter(parent).read(leaf, AbsReaderWriter.MODE_BIN)

    raw_model_json = load_bytes(json_data)
    model_json_list = json_parse.loads(raw_model_json.decode("utf-8"))

    do_parse(
        output_dir,
        str(Path(resolved_pdf).stem),
        load_bytes(resolved_pdf),
        model_json_list,
        method,
        f_dump_content_list=True,
    )
153
+
154
+
155
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    cli()
@@ -0,0 +1,119 @@
1
+ import os
2
+ import json as json_parse
3
+ import copy
4
+ import click
5
+ from loguru import logger
6
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
7
+ from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
8
+ from magic_pdf.pipe.UNIPipe import UNIPipe
9
+ from magic_pdf.pipe.OCRPipe import OCRPipe
10
+ from magic_pdf.pipe.TXTPipe import TXTPipe
11
+ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
12
+ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
13
+ import magic_pdf.model as model_config
14
+
15
+
16
def prepare_env(output_dir, pdf_file_name, method):
    """Create the output directories for one parse run and return them.

    The markdown directory is ``<output_dir>/<pdf_file_name>/<method>`` and
    the image directory is its ``images`` subdirectory. Both are created if
    missing; existing directories are left as they are.

    Returns:
        A ``(image_dir, md_dir)`` tuple of paths.
    """
    md_dir = os.path.join(output_dir, pdf_file_name, method)
    image_dir = os.path.join(str(md_dir), "images")
    for directory in (image_dir, md_dir):
        os.makedirs(directory, exist_ok=True)
    return image_dir, md_dir
24
+
25
+
26
def do_parse(
    output_dir,
    pdf_file_name,
    pdf_bytes,
    model_list,
    parse_method,
    f_draw_span_bbox=True,
    f_draw_layout_bbox=True,
    f_dump_md=True,
    f_dump_middle_json=True,
    f_dump_model_json=True,
    f_dump_orig_pdf=True,
    f_dump_content_list=False,
    f_make_md_mode=MakeMode.MM_MD,
):
    """Run the parsing pipeline on one PDF and write the results to disk.

    Output goes under ``<output_dir>/<pdf_file_name>/<parse_method>`` (see
    ``prepare_env``), with images in its ``images`` subdirectory.

    Args:
        output_dir: Root directory for all outputs.
        pdf_file_name: Name used for the per-document directory and for the
            ``<pdf_file_name>.md`` file.
        pdf_bytes: Raw PDF content.
        model_list: Model results for the document. If empty, the pipe's own
            analysis is run, but only when
            ``model_config.__use_inside_model__`` is truthy.
        parse_method: ``"auto"`` (UNIPipe), ``"txt"`` (TXTPipe) or ``"ocr"``
            (OCRPipe).
        f_draw_span_bbox: Call ``draw_span_bbox`` with the output directory.
        f_draw_layout_bbox: Call ``draw_layout_bbox`` with the output directory.
        f_dump_md: Write the markdown to ``<pdf_file_name>.md``.
        f_dump_middle_json: Write ``pipe.pdf_mid_data`` to ``middle.json``.
        f_dump_model_json: Write the model results to ``model.json``.
        f_dump_orig_pdf: Write ``pdf_bytes`` to ``origin.pdf``.
        f_dump_content_list: Write the uni-format content list to
            ``content_list.json``.
        f_make_md_mode: Markdown mode passed to ``pipe.pipe_mk_markdown``.

    Raises:
        SystemExit: With code 1 for an unknown ``parse_method``; with code 2
            when ``model_list`` is empty and the built-in model is disabled.
    """
    # sys.exit is used instead of the builtin exit(): the latter is injected
    # by the `site` module and is missing under `python -S` or frozen builds.
    import sys

    # Snapshot the model results before the pipeline runs, so model.json is
    # written from this copy rather than from the list handed to the pipe.
    orig_model_list = copy.deepcopy(model_list)
    local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)

    image_writer = DiskReaderWriter(local_image_dir)
    md_writer = DiskReaderWriter(local_md_dir)
    # Only the directory's base name is passed on to the markdown / content
    # list builders.
    image_dir = str(os.path.basename(local_image_dir))

    if parse_method == "auto":
        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
    elif parse_method == "txt":
        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    elif parse_method == "ocr":
        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
    else:
        logger.error("unknown parse method")
        sys.exit(1)

    pipe.pipe_classify()

    if len(model_list) == 0:
        if model_config.__use_inside_model__:
            pipe.pipe_analyze()
            # The pipe produced the model results itself; dump those instead.
            orig_model_list = copy.deepcopy(pipe.model_list)
        else:
            logger.error("need model list input")
            sys.exit(2)

    pipe.pipe_parse()
    pdf_info = pipe.pdf_mid_data["pdf_info"]
    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

    md_content = pipe.pipe_mk_markdown(
        image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode
    )
    if f_dump_md:
        md_writer.write(
            content=md_content,
            path=f"{pdf_file_name}.md",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_middle_json:
        md_writer.write(
            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
            path="middle.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_model_json:
        md_writer.write(
            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
            path="model.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    if f_dump_orig_pdf:
        md_writer.write(
            content=pdf_bytes,
            path="origin.pdf",
            mode=AbsReaderWriter.MODE_BIN,
        )

    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
    if f_dump_content_list:
        md_writer.write(
            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
            path="content_list.json",
            mode=AbsReaderWriter.MODE_TXT,
        )

    logger.info(f"local output dir is {local_md_dir}")
117
+
118
+
119
# click parameter type restricting a parse-method option to the three values
# that do_parse dispatches on.
parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
@@ -1,33 +1,36 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.6.2b1
3
+ Version: 0.7.0a1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
7
7
  Description-Content-Type: text/markdown
8
8
  License-File: LICENSE.md
9
- Requires-Dist: boto3 >=1.28.43
10
- Requires-Dist: Brotli >=1.1.0
11
- Requires-Dist: click >=8.1.7
12
- Requires-Dist: PyMuPDF >=1.24.9
13
- Requires-Dist: loguru >=0.6.0
14
- Requires-Dist: numpy <2.0.0,>=1.21.6
15
- Requires-Dist: fast-langdetect ==0.2.0
16
- Requires-Dist: wordninja >=2.0.0
17
- Requires-Dist: scikit-learn >=1.0.2
18
- Requires-Dist: pdfminer.six ==20231228
9
+ Requires-Dist: boto3>=1.28.43
10
+ Requires-Dist: Brotli>=1.1.0
11
+ Requires-Dist: click>=8.1.7
12
+ Requires-Dist: PyMuPDF>=1.24.9
13
+ Requires-Dist: loguru>=0.6.0
14
+ Requires-Dist: numpy<2.0.0,>=1.21.6
15
+ Requires-Dist: fast-langdetect==0.2.0
16
+ Requires-Dist: wordninja>=2.0.0
17
+ Requires-Dist: scikit-learn>=1.0.2
18
+ Requires-Dist: pdfminer.six==20231228
19
19
  Provides-Extra: full
20
- Requires-Dist: unimernet ==0.1.6 ; extra == 'full'
21
- Requires-Dist: matplotlib ; extra == 'full'
22
- Requires-Dist: ultralytics ; extra == 'full'
23
- Requires-Dist: paddleocr ==2.7.3 ; extra == 'full'
24
- Requires-Dist: detectron2 ; extra == 'full'
25
- Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'full'
26
- Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'full'
20
+ Requires-Dist: unimernet==0.1.6; extra == "full"
21
+ Requires-Dist: ultralytics; extra == "full"
22
+ Requires-Dist: paddleocr==2.7.3; extra == "full"
23
+ Requires-Dist: pypandoc; extra == "full"
24
+ Requires-Dist: struct-eqtable==0.1.0; extra == "full"
25
+ Requires-Dist: detectron2; extra == "full"
26
+ Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
27
+ Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
28
+ Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
29
+ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "full"
27
30
  Provides-Extra: lite
28
- Requires-Dist: paddleocr ==2.7.3 ; extra == 'lite'
29
- Requires-Dist: paddlepaddle ==3.0.0b1 ; (platform_system == "Linux") and extra == 'lite'
30
- Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_system == "Darwin") and extra == 'lite'
31
+ Requires-Dist: paddleocr==2.7.3; extra == "lite"
32
+ Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
33
+ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
31
34
 
32
35
  <div id="top">
33
36
 
@@ -62,7 +65,7 @@ Requires-Dist: paddlepaddle ==2.6.1 ; (platform_system == "Windows" or platform_
62
65
  </p>
63
66
 
64
67
  <p align="center">
65
- 👋 join us on <a href="https://discord.gg/AsQMhuMN" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
68
+ 👋 join us on <a href="https://discord.gg/gPxmVeGC" target="_blank">Discord</a> and <a href="https://cdn.vansin.top/internlm/mineru.jpg" target="_blank">WeChat</a>
66
69
  </p>
67
70
  </div>
68
71
 
@@ -131,21 +134,30 @@ conda activate MinerU
131
134
 
132
135
  #### 1. Install Magic-PDF
133
136
 
134
- Install the full-feature package with pip:
135
- >Note: The pip-installed package supports CPU-only and is ideal for quick tests.
136
- >
137
- >For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
137
+ **1.Install dependencies**
138
138
 
139
- ```bash
140
- pip install magic-pdf[full-cpu]
141
- ```
142
139
  The full-feature package depends on detectron2, which requires a compilation installation.
143
140
  If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114
144
141
  Alternatively, you can directly use our precompiled whl package (limited to Python 3.10):
145
142
 
146
143
  ```bash
147
- pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/
144
+ pip install detectron2 --extra-index-url https://wheels.myhloli.com
145
+ ```
146
+
147
+ **2.Install the full-feature package with pip**
148
+ >Note: The pip-installed package supports CPU-only and is ideal for quick tests.
149
+ >
150
+ >For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS).
151
+
152
+ ```bash
153
+ pip install magic-pdf[full]==0.6.2b1
148
154
  ```
155
+ > ❗️❗️❗️
156
+ > We have pre-released the 0.6.2 beta version, addressing numerous issues mentioned in our logs. However, this build has not undergone full QA testing and does not represent the final release quality. Should you encounter any problems, please promptly report them to us via issues or revert to using version 0.6.1.
157
+ > ```bash
158
+ > pip install magic-pdf[full-cpu]==0.6.1
159
+ > ```
160
+
149
161
 
150
162
 
151
163
  #### 2. Downloading model weights files
@@ -174,10 +186,16 @@ If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you c
174
186
  ##### CUDA
175
187
 
176
188
  You need to install the corresponding PyTorch version according to your CUDA version.
177
- This example installs the CUDA 11.8 version.More information https://pytorch.org/get-started/locally/
189
+ This example installs the CUDA 11.8 version. More information: https://pytorch.org/get-started/locally/
178
190
  ```bash
179
191
  pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118
180
192
  ```
193
+ > ❗️ Make sure to specify the versions
194
+ > ```bash
195
+ > torch==2.3.1 torchvision==0.18.1
196
+ > ```
197
+ > in the command, as these are the highest versions we support. Failing to specify the versions may result in automatically installing higher versions which can cause the program to fail.
198
+
181
199
  Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json.
182
200
  ```json
183
201
  {
@@ -1,14 +1,11 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  magic_pdf/pdf_parse_by_ocr.py,sha256=IWnSWt1Z-d35xRqspzdLR2iUtma_SAu4W7K4kEk8SHc,638
3
3
  magic_pdf/pdf_parse_by_txt.py,sha256=KUSH7Gh83CZmdyWw59pqDskwyJ2Kg-jU-9fnQGJQEs4,537
4
- magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmBW08,28661
5
4
  magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zxFngs,10620
6
5
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
- magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=jzGSxLcx_tmjFiZfIKpomfU9rfvAwZDyMm4bmNGQmJw,13541
9
6
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
7
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=L5D6aLB3vSATfDmsXRI_Mnq79ijH6yPYdSxSJb5aFh8,15678
8
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
12
9
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
10
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
14
11
  magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -25,7 +22,7 @@ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
22
  magic_pdf/libs/boxbase.py,sha256=MvD0DypR4sTEF3T2RrI_yJ8mPDUBYHAqAaau2mnBSxY,15343
26
23
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
27
24
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
28
- magic_pdf/libs/config_reader.py,sha256=Wlig8KRhKpz8fVNc7dlspxoccX5SnyT7FmwKj9i-gWE,2435
25
+ magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-NsE,2505
29
26
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
30
27
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
31
28
  magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
@@ -44,13 +41,13 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
44
41
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
42
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
43
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=ZMr9EH74nzdVNg9PXVs5XLZcabTX-_J2XZD1VgXHRBE,24
44
+ magic_pdf/libs/version.py,sha256=iEqOsQ5JUsdTQDAPqryKypkJMTOXXBl71cd4Drh5pDs,24
48
45
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
46
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
50
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=0YyZoDUQBFQIDiHDK59JEql9FKSkZMr8aGN0Tw1eL8k,4064
51
- magic_pdf/model/magic_model.py,sha256=tIGi-vtn6OUDqs3mtW2YLzoFvSPu4kkBiHJUv6o0Kic,25233
47
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=cSmh27RG1cvY0pd98T82rq0pANBwkYN0siZIN6oPNI8,4332
48
+ magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
52
49
  magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
53
- magic_pdf/model/pdf_extract_kit.py,sha256=ecoTBn7ROmfU1KxFpFzu7yXw7aezmPV6xOht7PRmPHA,10692
50
+ magic_pdf/model/pdf_extract_kit.py,sha256=I3pZBWQu8y5YVjURTUJnsIySjeSGK-Yefit9PiqN9VE,12952
54
51
  magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
55
52
  magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
53
  magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -75,6 +72,8 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
75
72
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
76
73
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
77
74
  magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
75
+ magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
76
+ magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
77
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
79
78
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
80
79
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -128,23 +127,22 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
128
127
  magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
129
128
  magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
130
129
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
131
- magic_pdf/resources/model_config/model_configs.yaml,sha256=C_9UfFMlHOX-iSgcwCHjyHKazKKuwpy1RcGHeTQD1kY,139
130
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
132
131
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
133
132
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
134
- magic_pdf/rw/AbsReaderWriter.py,sha256=1Hd6Xo2g12CaRAo5Sze-R_GSQA6GQ0rQwSmgQvw4V_c,1297
135
- magic_pdf/rw/DiskReaderWriter.py,sha256=0tt8lbRyqrOfFgGlhjt24YMdj2xN7QUIVysfhFIxPgo,2113
136
- magic_pdf/rw/S3ReaderWriter.py,sha256=O7Quf3CUqXBjMz4sIE7kNVI3TIQROeg5PuXneAacieY,4474
133
+ magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
134
+ magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
135
+ magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
137
136
  magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
137
  magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
138
  magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,1131
140
- magic_pdf/train_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
- magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO6mHV8IPXYxHH7-OcHfk,2443
142
- magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
143
- magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
144
- magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
145
- magic_pdf-0.6.2b1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
146
- magic_pdf-0.6.2b1.dist-info/METADATA,sha256=DVFHn5RBQilga9OZp87sGf4e4wAE-XT8QpB6KSVUgoo,11541
147
- magic_pdf-0.6.2b1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
148
- magic_pdf-0.6.2b1.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
149
- magic_pdf-0.6.2b1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
150
- magic_pdf-0.6.2b1.dist-info/RECORD,,
139
+ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
+ magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
141
+ magic_pdf/tools/cli_dev.py,sha256=uDc4fDxVuOIrkaKRdjNAqyh9htyLd-fYDEfJBNFUYao,4149
142
+ magic_pdf/tools/common.py,sha256=x4W-Tyo0A-TGsOjzlUGAhxiU2AisU3nBE3_2H_RLUO4,3801
143
+ magic_pdf-0.7.0a1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
144
+ magic_pdf-0.7.0a1.dist-info/METADATA,sha256=NBLsixinI-5iHwdweKr13SM5qg6Jf-fWCwg5ihavlpY,12455
145
+ magic_pdf-0.7.0a1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
146
+ magic_pdf-0.7.0a1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
147
+ magic_pdf-0.7.0a1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
148
+ magic_pdf-0.7.0a1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: bdist_wheel (0.44.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ magic-pdf = magic_pdf.tools.cli:cli
3
+ magic-pdf-dev = magic_pdf.tools.cli_dev:cli