magic-pdf 0.5.10__py3-none-any.whl → 0.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/cli/magicpdf.py CHANGED
@@ -23,7 +23,6 @@ python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home
23
23
 
24
24
  import os
25
25
  import json as json_parse
26
- import sys
27
26
  import click
28
27
  from loguru import logger
29
28
  from pathlib import Path
@@ -46,6 +45,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
46
45
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
47
46
  import csv
48
47
  import copy
48
+ import magic_pdf.model as model_config
49
49
 
50
50
  parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
51
51
 
@@ -66,21 +66,21 @@ def write_to_csv(csv_file_path, csv_data):
66
66
  csv_writer = csv.writer(csvfile)
67
67
  # 写入数据
68
68
  csv_writer.writerow(csv_data)
69
- print(f"数据已成功追加到 '{csv_file_path}'")
69
+ logger.info(f"数据已成功追加到 '{csv_file_path}'")
70
70
 
71
71
 
72
72
  def do_parse(
73
- pdf_file_name,
74
- pdf_bytes,
75
- model_list,
76
- parse_method,
77
- f_draw_span_bbox=True,
78
- f_draw_layout_bbox=True,
79
- f_dump_md=True,
80
- f_dump_middle_json=True,
81
- f_dump_model_json=True,
82
- f_dump_orig_pdf=True,
83
- f_dump_content_list=True,
73
+ pdf_file_name,
74
+ pdf_bytes,
75
+ model_list,
76
+ parse_method,
77
+ f_draw_span_bbox=True,
78
+ f_draw_layout_bbox=True,
79
+ f_dump_md=True,
80
+ f_dump_middle_json=True,
81
+ f_dump_model_json=True,
82
+ f_dump_orig_pdf=True,
83
+ f_dump_content_list=True,
84
84
  ):
85
85
  orig_model_list = copy.deepcopy(model_list)
86
86
 
@@ -96,14 +96,18 @@ def do_parse(
96
96
  elif parse_method == "ocr":
97
97
  pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
98
98
  else:
99
- print("unknown parse method")
100
- sys.exit(1)
99
+ logger.error("unknown parse method")
100
+ exit(1)
101
101
 
102
102
  pipe.pipe_classify()
103
103
 
104
- """如果没有传入有效的模型数据,则使用内置paddle解析"""
104
+ """如果没有传入有效的模型数据,则使用内置model解析"""
105
105
  if len(model_list) == 0:
106
- pipe.pipe_analyze()
106
+ if model_config.__use_inside_model__:
107
+ pipe.pipe_analyze()
108
+ else:
109
+ logger.error("need model list input")
110
+ exit(1)
107
111
 
108
112
  pipe.pipe_parse()
109
113
  pdf_info = pipe.pdf_mid_data["pdf_info"]
@@ -170,10 +174,13 @@ def cli():
170
174
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
171
175
  default="auto",
172
176
  )
173
- def json_command(json, method):
177
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
178
+ def json_command(json, method, inside_model):
179
+ model_config.__use_inside_model__ = inside_model
180
+
174
181
  if not json.startswith("s3://"):
175
- print("usage: python magipdf.py --json s3://some_bucket/some_path")
176
- sys.exit(1)
182
+ logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
183
+ exit(1)
177
184
 
178
185
  def read_s3_path(s3path):
179
186
  bucket, key = parse_s3path(s3path)
@@ -218,7 +225,10 @@ def json_command(json, method):
218
225
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
219
226
  default="auto",
220
227
  )
221
- def local_json_command(local_json, method):
228
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
229
+ def local_json_command(local_json, method, inside_model):
230
+ model_config.__use_inside_model__ = inside_model
231
+
222
232
  def read_s3_path(s3path):
223
233
  bucket, key = parse_s3path(s3path)
224
234
 
@@ -267,7 +277,10 @@ def local_json_command(local_json, method):
267
277
  help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
268
278
  default="auto",
269
279
  )
270
- def pdf_command(pdf, model, method):
280
+ @click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
281
+ def pdf_command(pdf, model, method, inside_model):
282
+ model_config.__use_inside_model__ = inside_model
283
+
271
284
  def read_fn(path):
272
285
  disk_rw = DiskReaderWriter(os.path.dirname(path))
273
286
  return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
@@ -1,13 +1,6 @@
1
- import regex
2
1
  import unicodedata
3
2
  from fast_langdetect import detect_langs
4
3
 
5
- RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")
6
-
7
-
8
- def remove_bad_chars(text):
9
- return RE_BAD_CHARS.sub("", text)
10
-
11
4
 
12
5
  def detect_lang(text: str) -> str:
13
6
  if len(text) == 0:
@@ -1,7 +1,5 @@
1
1
 
2
2
 
3
- from s3pathlib import S3Path
4
-
5
3
  def remove_non_official_s3_args(s3path):
6
4
  """
7
5
  example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
@@ -10,8 +8,19 @@ def remove_non_official_s3_args(s3path):
10
8
  return arr[0]
11
9
 
12
10
  def parse_s3path(s3path: str):
13
- p = S3Path(remove_non_official_s3_args(s3path))
14
- return p.bucket, p.key
11
+ # from s3pathlib import S3Path
12
+ # p = S3Path(remove_non_official_s3_args(s3path))
13
+ # return p.bucket, p.key
14
+ s3path = remove_non_official_s3_args(s3path).strip()
15
+ if s3path.startswith(('s3://', 's3a://')):
16
+ prefix, path = s3path.split('://', 1)
17
+ bucket_name, key = path.split('/', 1)
18
+ return bucket_name, key
19
+ elif s3path.startswith('/'):
20
+ raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
21
+ else:
22
+ raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")
23
+
15
24
 
16
25
  def parse_s3_range_params(s3path: str):
17
26
  """
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.5.10"
1
+ __version__ = "0.5.12"
@@ -0,0 +1 @@
1
+ __use_inside_model__ = False
@@ -1,10 +1,8 @@
1
1
  import fitz
2
- import cv2
3
- from PIL import Image
4
2
  import numpy as np
5
-
3
+ from loguru import logger
6
4
  from magic_pdf.model.model_list import MODEL
7
- from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
5
+ import magic_pdf.model as model_config
8
6
 
9
7
 
10
8
  def dict_compare(d1, d2):
@@ -22,6 +20,11 @@ def remove_duplicates_dicts(lst):
22
20
 
23
21
 
24
22
  def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
23
+ try:
24
+ import cv2
25
+ from PIL import Image
26
+ except ImportError:
27
+ logger.error("opencv-python and Pillow are not installed, please install by pip.")
25
28
  images = []
26
29
  with fitz.open("pdf", pdf_bytes) as doc:
27
30
  for index in range(0, doc.page_count):
@@ -41,6 +44,13 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
41
44
 
42
45
 
43
46
  def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
47
+
48
+ if model_config.__use_inside_model__:
49
+ from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
50
+ else:
51
+ logger.error("use_inside_model is False, not allow to use inside model")
52
+ exit(1)
53
+
44
54
  images = load_images_from_pdf(pdf_bytes)
45
55
  custom_model = None
46
56
  if model == MODEL.Paddle:
@@ -1,7 +1,12 @@
1
1
  import random
2
2
 
3
3
  from loguru import logger
4
- from paddleocr import PPStructure
4
+
5
+ try:
6
+ from paddleocr import PPStructure
7
+ except ImportError:
8
+ logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
9
+ exit(1)
5
10
 
6
11
 
7
12
  def region_to_bbox(region):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.5.10
3
+ Version: 0.5.12
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/magicpdf/Magic-PDF
6
6
  Requires-Python: >=3.9
@@ -9,34 +9,28 @@ License-File: LICENSE.md
9
9
  Requires-Dist: boto3 >=1.28.43
10
10
  Requires-Dist: Brotli >=1.1.0
11
11
  Requires-Dist: click >=8.1.7
12
- Requires-Dist: Distance >=0.1.3
13
- Requires-Dist: PyMuPDF >=1.24.5
12
+ Requires-Dist: PyMuPDF >=1.24.7
14
13
  Requires-Dist: loguru >=0.6.0
15
- Requires-Dist: matplotlib >=3.8.3
16
14
  Requires-Dist: numpy >=1.21.6
17
- Requires-Dist: pandas >=1.3.5
18
15
  Requires-Dist: fast-langdetect >=0.1.1
19
- Requires-Dist: regex >=2023.12.25
20
- Requires-Dist: termcolor >=2.4.0
21
16
  Requires-Dist: wordninja >=2.0.0
22
17
  Requires-Dist: scikit-learn >=1.0.2
23
- Requires-Dist: nltk ==3.8.1
24
- Requires-Dist: s3pathlib >=2.1.1
25
- Requires-Dist: paddleocr
26
18
  Requires-Dist: pdfminer.six >=20231228
27
19
  Provides-Extra: cpu
20
+ Requires-Dist: paddleocr ; extra == 'cpu'
28
21
  Requires-Dist: paddlepaddle ; extra == 'cpu'
29
22
  Provides-Extra: gpu
23
+ Requires-Dist: paddleocr ; extra == 'gpu'
30
24
  Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
31
25
 
32
26
  <div id="top"></div>
33
27
  <div align="center">
34
28
 
35
- [![stars](https://img.shields.io/github/stars/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
36
- [![forks](https://img.shields.io/github/forks/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF)
37
- [![license](https://img.shields.io/github/license/magicpdf/Magic-PDF.svg)](https://github.com/magicpdf/Magic-PDF/tree/main/LICENSE)
38
- [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
39
- [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-PDF)](https://github.com/magicpdf/Magic-PDF/issues)
29
+ [![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
30
+ [![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU)
31
+ [![license](https://img.shields.io/github/license/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU/tree/main/LICENSE)
32
+ [![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
33
+ [![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues)
40
34
 
41
35
  [English](README.md) | [简体中文](README_zh-CN.md)
42
36
 
@@ -46,8 +40,20 @@ Requires-Dist: paddlepaddle-gpu ; extra == 'gpu'
46
40
 
47
41
  </div>
48
42
 
43
+ # MinerU
44
+
45
+
46
+ ## Introduction
47
+
48
+ MinerU is a one-stop, open-source data extraction tool that primarily includes the following features:
49
+
50
+ - [Magic-PDF](#Magic-PDF) PDF Document Extraction
51
+ - [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction
52
+
53
+
49
54
  # Magic-PDF
50
55
 
56
+
51
57
  ## Introduction
52
58
 
53
59
  Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol.
@@ -64,15 +70,38 @@ Key features include:
64
70
  - Compatibility with CPU and GPU environments
65
71
  - Available for Windows, Linux, and macOS platforms
66
72
 
73
+
74
+ https://github.com/magicpdf/Magic-PDF/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070
75
+
76
+
77
+
78
+ ## Project Panorama
79
+
80
+ ![Project Panorama](docs/images/project_panorama_en.png)
81
+
82
+
83
+ ## Flowchart
84
+
85
+ ![Flowchart](docs/images/flowchart_en.png)
86
+
87
+ ### Submodule Repositories
88
+
89
+ - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
90
+ - A Comprehensive Toolkit for High-Quality PDF Content Extraction
91
+ - [Miner-PDF-Benchmark](https://github.com/opendatalab/Miner-PDF-Benchmark)
92
+ - An end-to-end PDF document comprehension evaluation suite designed for large-scale model data scenarios
93
+
94
+
67
95
  ## Getting Started
68
96
 
69
97
  ### Requirements
70
98
 
71
- - Python 3.9 or newer
99
+ - Python >= 3.9
72
100
 
73
101
  ### Usage Instructions
74
102
 
75
103
  #### 1. Install Magic-PDF
104
+
76
105
  ```bash
77
106
  pip install magic-pdf
78
107
  ```
@@ -80,11 +109,14 @@ pip install magic-pdf
80
109
  #### 2. Usage via Command Line
81
110
 
82
111
  ###### simple
112
+
83
113
  ```bash
84
114
  cp magic-pdf.template.json ~/magic-pdf.json
85
115
  magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
86
116
  ```
117
+
87
118
  ###### more
119
+
88
120
  ```bash
89
121
  magic-pdf --help
90
122
  ```
@@ -115,19 +147,61 @@ pipe.pipe_parse()
115
147
  md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
116
148
  ```
117
149
 
118
- Demo can be referred to [demo.py](https://github.com/magicpdf/Magic-PDF/blob/master/demo/demo.py)
150
+ For a demo, refer to [demo.py](demo/demo.py)
151
+
119
152
 
120
153
  ## All Thanks To Our Contributors
121
154
 
122
155
  <a href="https://github.com/magicpdf/Magic-PDF/graphs/contributors">
123
- <img src="https://contrib.rocks/image?repo=magicpdf/Magic-PDF" />
156
+ <img src="https://contrib.rocks/image?repo=opendatalab/MinerU" />
124
157
  </a>
125
158
 
159
+
126
160
  ## License Information
127
161
 
128
- See [LICENSE.md](https://github.com/magicpdf/Magic-PDF/blob/master/LICENSE.md) for details.
162
+ [LICENSE.md](LICENSE.md)
163
+
164
+ The project currently relies on PyMuPDF to deliver advanced functionality; however, PyMuPDF is licensed under the AGPL, which may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility.
165
+
129
166
 
130
167
  ## Acknowledgments
131
168
 
132
169
  - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
133
170
  - [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
171
+
172
+
173
+ # Magic-Doc
174
+
175
+
176
+ ## Introduction
177
+
178
+ Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
179
+
180
+ Key Features Include:
181
+
182
+ - Web Page Extraction
183
+ - Cross-modal precise parsing of text, images, tables, and formula information.
184
+
185
+ - E-Book Document Extraction
186
+ - Supports various document formats, including epub and mobi, with full adaptation for text and images.
187
+
188
+ - Language Type Identification
189
+ - Accurate recognition of 176 languages.
190
+
191
+ https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca
192
+
193
+
194
+
195
+ https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d
196
+
197
+
198
+
199
+ https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2
200
+
201
+
202
+
203
+
204
+ ## Project Repository
205
+
206
+ - [Magic-Doc](https://github.com/magicpdf/Magic-Doc)
207
+ Outstanding Webpage and E-book Extraction Tool
@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_for_train.py,sha256=Oby61DMjJ716Jj_ri7lwXfv2Chus0pbBR2RPXrmB
5
5
  magic_pdf/pdf_parse_union_core.py,sha256=a67iQuEfuslAEF-wQplGZKXUuz5mT3HiCyvuR52E6Gw,10584
6
6
  magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
7
7
  magic_pdf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- magic_pdf/cli/magicpdf.py,sha256=FF6flO6wUcKG9Qx_FG6-xhHfmQzQWLjwrkMa5kowDgs,10937
8
+ magic_pdf/cli/magicpdf.py,sha256=BA6lPjBhurXAnfmvdbQiS_atEFq3PVdNYDPkvxs1N0M,11654
9
9
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
11
11
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=RyxebPtvFfNce_HCa-_YGxwFx_srzL-BfMKc85V9JG0,15442
@@ -34,24 +34,24 @@ magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw
34
34
  magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
35
35
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
36
36
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
37
- magic_pdf/libs/language.py,sha256=klymhpJyFSc9ukUPwIQmCx1DwGMuXueosaFjmMzETQw,812
37
+ magic_pdf/libs/language.py,sha256=U8bOttqtJiBvqOFUksiHeeC3vgjzJIWTLqQrmorg7T0,683
38
38
  magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
39
39
  magic_pdf/libs/math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
40
40
  magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
41
41
  magic_pdf/libs/ocr_content_type.py,sha256=DiGTYppd6WlibwCAeVpIy3NHCQkglfIAQsJ_ffu5BPw,526
42
- magic_pdf/libs/path_utils.py,sha256=YYh8a0K8KiUhFDd_S1oLohL8n8fcSos4iMj74YEJ57s,538
42
+ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
43
43
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
44
44
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
45
45
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
46
46
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
47
- magic_pdf/libs/version.py,sha256=1nlPInsRzDbcDPveZ3ghSJ6v6KveN9n6gnj-twW4DkI,23
47
+ magic_pdf/libs/version.py,sha256=LznNzk7nDbJCv7NVxCOu958-1uT_nFJ79_3vJt7WPDc,23
48
48
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
49
49
  magic_pdf/model/360_layout_analysis.py,sha256=GbchKPJRVcrxvwNXMnR4vt8lOLPauTWMl-43ayyhX7U,221
50
- magic_pdf/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=QD4NWEIz8UXdIG4V_3P8EaYesxk6PvC1SOtTWEy2GEY,2007
50
+ magic_pdf/model/__init__.py,sha256=X6t9kPDqM8hDCbq8fQc_8jILtG6mepDjN_kadUo39Sk,29
51
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=8z4NX7Lk7CcPl1BQiNYL6dDiP63M3f6m3dmW6rjHCqg,2370
52
52
  magic_pdf/model/magic_model.py,sha256=2H6Gz1mg0f0YCvz-TLIWrAWXCQLgZftBXJNRPlSIjwc,25077
53
53
  magic_pdf/model/model_list.py,sha256=dNfnDodnbkgIW0PFDjn_KsQMA8DODOzo4Z4jxfOilaA,44
54
- magic_pdf/model/pp_structure_v2.py,sha256=qsyt9vFDGaVizBMiSaeFVHTDsJTrIHx46Ec2J8SOj1A,2469
54
+ magic_pdf/model/pp_structure_v2.py,sha256=fFbAOYEcLXlkCjqZ3yxZXR7nqtp6V8yowyjSibW3lhY,2635
55
55
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
56
  magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
57
57
  magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
@@ -115,9 +115,9 @@ magic_pdf/train_utils/convert_to_train_format.py,sha256=ifo2FAoBMa_etCvz0O4v03xO
115
115
  magic_pdf/train_utils/extract_caption.py,sha256=gommEqIEWLplSDEJWD7_66daqlOBsWhpRBW1DHpkny4,1825
116
116
  magic_pdf/train_utils/remove_footer_header.py,sha256=pyeNNdJ-th3wl5Xwb10ZLYNaFN4-6BmahoMFE8VTNNs,5978
117
117
  magic_pdf/train_utils/vis_utils.py,sha256=MV9N9cT3ifJ35u7LFKGF9I_bOIQrtU1zcsxu2hj3aqM,10111
118
- magic_pdf-0.5.10.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
- magic_pdf-0.5.10.dist-info/METADATA,sha256=B3e0sVOyFhk47EfHPuLFRUNxzdasWYx3XuYR53LSJX8,4175
120
- magic_pdf-0.5.10.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
- magic_pdf-0.5.10.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
- magic_pdf-0.5.10.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
- magic_pdf-0.5.10.dist-info/RECORD,,
118
+ magic_pdf-0.5.12.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
119
+ magic_pdf-0.5.12.dist-info/METADATA,sha256=iNTDKGkj4D77ErkS0P1dNZ4ttFriYHbTSjsEE3f8MP0,5917
120
+ magic_pdf-0.5.12.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
121
+ magic_pdf-0.5.12.dist-info/entry_points.txt,sha256=NbSkSmE08UuTwdoJD8Uofq8iyufySA4x7jmIIk4YCzI,57
122
+ magic_pdf-0.5.12.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
123
+ magic_pdf-0.5.12.dist-info/RECORD,,