magic-pdf 0.9.1__py3-none-any.whl → 0.9.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +1 -1
- magic_pdf/libs/Constants.py +3 -1
- magic_pdf/libs/config_reader.py +1 -1
- magic_pdf/libs/draw_bbox.py +10 -4
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/pdf_extract_kit.py +42 -310
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +144 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +259 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +168 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +14 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +1 -1
- magic_pdf/para/para_split_v3.py +13 -15
- magic_pdf/pdf_parse_union_core_v2.py +56 -19
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/tools/common.py +47 -3
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/METADATA +35 -25
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/RECORD +65 -44
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/WHEEL +1 -1
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.1.dist-info → magic_pdf-0.9.3.dist-info}/top_level.txt +0 -0
@@ -30,8 +30,8 @@ from magic_pdf.pre_proc.equations_replace import (
|
|
30
30
|
from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
|
31
31
|
ocr_prepare_bboxes_for_layout_split_v2
|
32
32
|
from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
|
33
|
-
|
34
|
-
|
33
|
+
fix_discarded_block,
|
34
|
+
fix_block_spans_v2)
|
35
35
|
from magic_pdf.pre_proc.ocr_span_list_modify import (
|
36
36
|
get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
|
37
37
|
remove_overlaps_min_spans)
|
@@ -164,8 +164,8 @@ class ModelSingleton:
|
|
164
164
|
|
165
165
|
|
166
166
|
def do_predict(boxes: List[List[int]], model) -> List[int]:
|
167
|
-
from magic_pdf.model.
|
168
|
-
|
167
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (boxes2inputs, parse_logits,
|
168
|
+
prepare_inputs)
|
169
169
|
|
170
170
|
inputs = boxes2inputs(boxes)
|
171
171
|
inputs = prepare_inputs(inputs, model)
|
@@ -174,23 +174,57 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
|
|
174
174
|
|
175
175
|
|
176
176
|
def cal_block_index(fix_blocks, sorted_bboxes):
|
177
|
-
for block in fix_blocks:
|
178
177
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
178
|
+
if sorted_bboxes is not None:
|
179
|
+
# 使用layoutreader排序
|
180
|
+
for block in fix_blocks:
|
181
|
+
line_index_list = []
|
182
|
+
if len(block['lines']) == 0:
|
183
|
+
block['index'] = sorted_bboxes.index(block['bbox'])
|
184
|
+
else:
|
185
|
+
for line in block['lines']:
|
186
|
+
line['index'] = sorted_bboxes.index(line['bbox'])
|
187
|
+
line_index_list.append(line['index'])
|
188
|
+
median_value = statistics.median(line_index_list)
|
189
|
+
block['index'] = median_value
|
190
|
+
|
191
|
+
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
192
|
+
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
193
|
+
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
194
|
+
block['lines'] = copy.deepcopy(block['real_lines'])
|
195
|
+
del block['real_lines']
|
196
|
+
else:
|
197
|
+
# 使用xycut排序
|
198
|
+
block_bboxes = []
|
199
|
+
for block in fix_blocks:
|
200
|
+
block_bboxes.append(block['bbox'])
|
201
|
+
|
202
|
+
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
203
|
+
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
204
|
+
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
205
|
+
block['lines'] = copy.deepcopy(block['real_lines'])
|
206
|
+
del block['real_lines']
|
207
|
+
|
208
|
+
import numpy as np
|
209
|
+
from magic_pdf.model.sub_modules.reading_oreder.layoutreader.xycut import recursive_xy_cut
|
210
|
+
|
211
|
+
random_boxes = np.array(block_bboxes)
|
212
|
+
np.random.shuffle(random_boxes)
|
213
|
+
res = []
|
214
|
+
recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res)
|
215
|
+
assert len(res) == len(block_bboxes)
|
216
|
+
sorted_boxes = random_boxes[np.array(res)].tolist()
|
217
|
+
|
218
|
+
for i, block in enumerate(fix_blocks):
|
219
|
+
block['index'] = sorted_boxes.index(block['bbox'])
|
220
|
+
|
221
|
+
# 生成line index
|
222
|
+
sorted_blocks = sorted(fix_blocks, key=lambda b: b['index'])
|
223
|
+
line_inedx = 1
|
224
|
+
for block in sorted_blocks:
|
183
225
|
for line in block['lines']:
|
184
|
-
line['index'] =
|
185
|
-
|
186
|
-
median_value = statistics.median(line_index_list)
|
187
|
-
block['index'] = median_value
|
188
|
-
|
189
|
-
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
190
|
-
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
191
|
-
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
192
|
-
block['lines'] = copy.deepcopy(block['real_lines'])
|
193
|
-
del block['real_lines']
|
226
|
+
line['index'] = line_inedx
|
227
|
+
line_inedx += 1
|
194
228
|
|
195
229
|
return fix_blocks
|
196
230
|
|
@@ -264,6 +298,9 @@ def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
|
264
298
|
block['lines'].append({'bbox': line, 'spans': []})
|
265
299
|
page_line_list.extend(lines)
|
266
300
|
|
301
|
+
if len(page_line_list) > 200: # layoutreader最高支持512line
|
302
|
+
return None
|
303
|
+
|
267
304
|
# 使用layoutreader排序
|
268
305
|
x_scale = 1000.0 / page_w
|
269
306
|
y_scale = 1000.0 / page_h
|
magic_pdf/tools/common.py
CHANGED
@@ -14,6 +14,9 @@ from magic_pdf.pipe.TXTPipe import TXTPipe
|
|
14
14
|
from magic_pdf.pipe.UNIPipe import UNIPipe
|
15
15
|
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
16
16
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
17
|
+
import fitz
|
18
|
+
# from io import BytesIO
|
19
|
+
# from pypdf import PdfReader, PdfWriter
|
17
20
|
|
18
21
|
|
19
22
|
def prepare_env(output_dir, pdf_file_name, method):
|
@@ -26,6 +29,42 @@ def prepare_env(output_dir, pdf_file_name, method):
|
|
26
29
|
return local_image_dir, local_md_dir
|
27
30
|
|
28
31
|
|
32
|
+
# def convert_pdf_bytes_to_bytes_by_pypdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
33
|
+
# # 将字节数据包装在 BytesIO 对象中
|
34
|
+
# pdf_file = BytesIO(pdf_bytes)
|
35
|
+
# # 读取 PDF 的字节数据
|
36
|
+
# reader = PdfReader(pdf_file)
|
37
|
+
# # 创建一个新的 PDF 写入器
|
38
|
+
# writer = PdfWriter()
|
39
|
+
# # 将所有页面添加到新的 PDF 写入器中
|
40
|
+
# end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(reader.pages) - 1
|
41
|
+
# if end_page_id > len(reader.pages) - 1:
|
42
|
+
# logger.warning("end_page_id is out of range, use pdf_docs length")
|
43
|
+
# end_page_id = len(reader.pages) - 1
|
44
|
+
# for i, page in enumerate(reader.pages):
|
45
|
+
# if start_page_id <= i <= end_page_id:
|
46
|
+
# writer.add_page(page)
|
47
|
+
# # 创建一个字节缓冲区来存储输出的 PDF 数据
|
48
|
+
# output_buffer = BytesIO()
|
49
|
+
# # 将 PDF 写入字节缓冲区
|
50
|
+
# writer.write(output_buffer)
|
51
|
+
# # 获取字节缓冲区的内容
|
52
|
+
# converted_pdf_bytes = output_buffer.getvalue()
|
53
|
+
# return converted_pdf_bytes
|
54
|
+
|
55
|
+
|
56
|
+
def convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id=0, end_page_id=None):
|
57
|
+
document = fitz.open("pdf", pdf_bytes)
|
58
|
+
output_document = fitz.open()
|
59
|
+
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(document) - 1
|
60
|
+
if end_page_id > len(document) - 1:
|
61
|
+
logger.warning("end_page_id is out of range, use pdf_docs length")
|
62
|
+
end_page_id = len(document) - 1
|
63
|
+
output_document.insert_pdf(document, from_page=start_page_id, to_page=end_page_id)
|
64
|
+
output_bytes = output_document.tobytes()
|
65
|
+
return output_bytes
|
66
|
+
|
67
|
+
|
29
68
|
def do_parse(
|
30
69
|
output_dir,
|
31
70
|
pdf_file_name,
|
@@ -55,6 +94,8 @@ def do_parse(
|
|
55
94
|
f_draw_model_bbox = True
|
56
95
|
f_draw_line_sort_bbox = True
|
57
96
|
|
97
|
+
pdf_bytes = convert_pdf_bytes_to_bytes_by_pymupdf(pdf_bytes, start_page_id, end_page_id)
|
98
|
+
|
58
99
|
orig_model_list = copy.deepcopy(model_list)
|
59
100
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name,
|
60
101
|
parse_method)
|
@@ -66,15 +107,18 @@ def do_parse(
|
|
66
107
|
if parse_method == 'auto':
|
67
108
|
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
68
109
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True,
|
69
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
110
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
111
|
+
lang=lang,
|
70
112
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
71
113
|
elif parse_method == 'txt':
|
72
114
|
pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
73
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
115
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
116
|
+
lang=lang,
|
74
117
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
75
118
|
elif parse_method == 'ocr':
|
76
119
|
pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True,
|
77
|
-
start_page_id=start_page_id, end_page_id=end_page_id,
|
120
|
+
# start_page_id=start_page_id, end_page_id=end_page_id,
|
121
|
+
lang=lang,
|
78
122
|
layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
79
123
|
else:
|
80
124
|
logger.error('unknown parse method')
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.9.
|
3
|
+
Version: 0.9.3
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -26,6 +26,9 @@ Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
|
26
26
|
Requires-Dist: einops; extra == "full"
|
27
27
|
Requires-Dist: accelerate; extra == "full"
|
28
28
|
Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
|
29
|
+
Requires-Dist: rapidocr-paddle; extra == "full"
|
30
|
+
Requires-Dist: rapid-table; extra == "full"
|
31
|
+
Requires-Dist: PyYAML; extra == "full"
|
29
32
|
Requires-Dist: detectron2; extra == "full"
|
30
33
|
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
|
31
34
|
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
@@ -80,7 +83,8 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
80
83
|
</div>
|
81
84
|
|
82
85
|
# Changelog
|
83
|
-
- 2024/11/
|
86
|
+
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
|
87
|
+
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
|
84
88
|
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
|
85
89
|
- Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
|
86
90
|
- Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
|
@@ -176,13 +180,14 @@ There are three different ways to experience MinerU:
|
|
176
180
|
- [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
|
177
181
|
- [Linux/Windows + CUDA](#Using-GPU)
|
178
182
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
183
|
+
> [!WARNING]
|
184
|
+
> **Pre-installation Notice—Hardware and Software Environment Support**
|
185
|
+
>
|
186
|
+
> To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
|
187
|
+
>
|
188
|
+
> By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
|
189
|
+
>
|
190
|
+
> In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
|
186
191
|
|
187
192
|
<table>
|
188
193
|
<tr>
|
@@ -262,11 +267,13 @@ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for de
|
|
262
267
|
After completing the [2. Download model weight files](#2-download-model-weight-files) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
|
263
268
|
You can find the `magic-pdf.json` file in your 【user directory】.
|
264
269
|
|
270
|
+
> [!TIP]
|
265
271
|
> The user directory for Windows is "C:\\Users\\username", for Linux it is "/home/username", and for macOS it is "/Users/username".
|
266
272
|
|
267
273
|
You can modify certain configurations in this file to enable or disable features, such as table recognition:
|
268
274
|
|
269
275
|
|
276
|
+
> [!NOTE]
|
270
277
|
> If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
|
271
278
|
|
272
279
|
```json
|
@@ -281,7 +288,7 @@ You can modify certain configurations in this file to enable or disable features
|
|
281
288
|
"enable": true // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
|
282
289
|
},
|
283
290
|
"table-config": {
|
284
|
-
"model": "
|
291
|
+
"model": "rapid_table", // When using structEqTable, please change to "struct_eqtable".
|
285
292
|
"enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
|
286
293
|
"max_time": 400
|
287
294
|
}
|
@@ -295,13 +302,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
295
302
|
- [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
|
296
303
|
- [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
|
297
304
|
- Quick Deployment with Docker
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
+
> [!IMPORTANT]
|
306
|
+
> Docker requires a GPU with at least 8GB of VRAM, and all acceleration features are enabled by default.
|
307
|
+
>
|
308
|
+
> Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
|
309
|
+
>
|
310
|
+
> ```bash
|
311
|
+
> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
|
312
|
+
> ```
|
305
313
|
```bash
|
306
314
|
wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
|
307
315
|
docker build -t mineru:latest .
|
@@ -363,8 +371,8 @@ The results will be saved in the `{some_output_dir}` directory. The output file
|
|
363
371
|
├── some_pdf_spans.pdf # smallest granularity bbox position information diagram
|
364
372
|
└── some_pdf_content_list.json # Rich text JSON arranged in reading order
|
365
373
|
```
|
366
|
-
|
367
|
-
For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
374
|
+
> [!TIP]
|
375
|
+
> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
|
368
376
|
|
369
377
|
### API
|
370
378
|
|
@@ -415,12 +423,12 @@ TODO
|
|
415
423
|
|
416
424
|
# TODO
|
417
425
|
|
418
|
-
-
|
419
|
-
-
|
420
|
-
-
|
421
|
-
-
|
422
|
-
-
|
423
|
-
-
|
426
|
+
- [x] Reading order based on the model
|
427
|
+
- [x] Recognition of `index` and `list` in the main text
|
428
|
+
- [x] Table recognition
|
429
|
+
- [ ] Code block recognition in the main text
|
430
|
+
- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
|
431
|
+
- [ ] Geometric shape recognition
|
424
432
|
|
425
433
|
# Known Issues
|
426
434
|
|
@@ -455,7 +463,9 @@ This project currently uses PyMuPDF to achieve advanced functionality. However,
|
|
455
463
|
# Acknowledgments
|
456
464
|
|
457
465
|
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
|
466
|
+
- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
458
467
|
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
|
468
|
+
- [RapidTable](https://github.com/RapidAI/RapidTable)
|
459
469
|
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
|
460
470
|
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
461
471
|
- [layoutreader](https://github.com/ppaanngggg/layoutreader)
|
@@ -2,7 +2,7 @@ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=E-AYHUXjzorFli0CEtmnAi09SI2STJ7FX58yjU0c9PI,810
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=YeFYVAdfwF1CXOHq0LVE5131nqPHA14nt5t_sb-CMMk,709
|
4
4
|
magic_pdf/pdf_parse_union_core.py,sha256=AGIrP7ahc6Ycku0PxAlbjZhwqsdJ8iuRPIn-PFASKWY,11772
|
5
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
5
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=GAgSP0PqbPg4U_nJXUztr-uBmakIK5rKwuxv0o9nMK0,25228
|
6
6
|
magic_pdf/user_api.py,sha256=gM-3RQYc6pMEsVvEPFXfWf5RBjAvHcUccugL6fXpP_U,3991
|
7
7
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
8
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
@@ -23,7 +23,7 @@ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
|
23
23
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
24
24
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
25
25
|
magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
|
26
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
26
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=lM5UBDueiZcm4_z-jtmcgbJH2jhaXhMVY5ubggaKqHU,12954
|
27
27
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
28
28
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
29
29
|
magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
|
@@ -38,7 +38,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
|
|
38
38
|
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
39
39
|
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
40
40
|
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
41
|
-
magic_pdf/libs/Constants.py,sha256=
|
41
|
+
magic_pdf/libs/Constants.py,sha256=ptiwMvWDUmzRZ0IbP1bM3PjGJ24BQVQQHO4sCeioPv8,1173
|
42
42
|
magic_pdf/libs/MakeContentConfig.py,sha256=Do5VKNQp3gfUKyhrZStfzfBj7l-vbsYpsJFF1SsmEc0,248
|
43
43
|
magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
|
44
44
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -46,11 +46,11 @@ magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,169
|
|
46
46
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
47
47
|
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
48
48
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
49
|
-
magic_pdf/libs/config_reader.py,sha256=
|
49
|
+
magic_pdf/libs/config_reader.py,sha256=7QIeUPLb8CNa7E3n8TT3MN61lZdYVTylxn5cyXPsPfA,4066
|
50
50
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
51
51
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
52
52
|
magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
|
53
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
53
|
+
magic_pdf/libs/draw_bbox.py,sha256=Ri_jbOv3Tgnx6s1IscRIWiIKNfUHPkGW8v4q4jPtgo8,17623
|
54
54
|
magic_pdf/libs/drop_reason.py,sha256=IfjPSrPLMmVziqjOXPep7r_ioQKFRahDgbOW1SD-Tuw,2148
|
55
55
|
magic_pdf/libs/drop_tag.py,sha256=bZDg3bIVWvBT1Ec1icwj5WLOkt5-hI6eRYZ2tX9_a74,673
|
56
56
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
@@ -65,42 +65,63 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
65
65
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
66
66
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
67
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
68
|
-
magic_pdf/libs/version.py,sha256=
|
68
|
+
magic_pdf/libs/version.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
|
69
69
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
70
70
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
71
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
|
72
72
|
magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
|
73
73
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
75
|
-
magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
|
74
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=6y8tQSwse8cAgqjDoJvJ-uSPdT8FYzyUeCW5g7j1Tyw,10126
|
76
75
|
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
77
|
-
magic_pdf/model/
|
78
|
-
magic_pdf/model/
|
79
|
-
magic_pdf/model/
|
80
|
-
magic_pdf/model/
|
81
|
-
magic_pdf/model/
|
82
|
-
magic_pdf/model/
|
83
|
-
magic_pdf/model/
|
84
|
-
magic_pdf/model/
|
85
|
-
magic_pdf/model/
|
86
|
-
magic_pdf/model/
|
87
|
-
magic_pdf/model/
|
88
|
-
magic_pdf/model/
|
89
|
-
magic_pdf/model/
|
90
|
-
magic_pdf/model/
|
91
|
-
magic_pdf/model/
|
92
|
-
magic_pdf/model/
|
93
|
-
magic_pdf/model/
|
94
|
-
magic_pdf/model/
|
95
|
-
magic_pdf/model/
|
96
|
-
magic_pdf/model/
|
97
|
-
magic_pdf/model/
|
98
|
-
magic_pdf/model/
|
99
|
-
magic_pdf/model/
|
100
|
-
magic_pdf/model/
|
101
|
-
magic_pdf/model/
|
102
|
-
magic_pdf/model/
|
103
|
-
magic_pdf/model/
|
76
|
+
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=iFugp79H_QLi-P7t_6Ug0qIs2oOc4zSnf-8hhZhezHA,5021
|
78
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
|
79
|
+
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
80
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
|
81
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
82
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
83
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
84
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
|
85
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
|
86
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
|
87
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
|
88
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
|
89
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
|
90
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
|
91
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
|
92
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
|
93
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
|
94
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
|
95
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
|
96
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
|
97
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
|
98
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
|
99
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
|
100
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
101
|
+
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
102
|
+
magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
103
|
+
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
|
104
|
+
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
|
+
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
106
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
|
107
|
+
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
|
+
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=2QAxxs0awZ_osLMiL-oP8Ik6VQ3f2C4dgJ0EV93bxlQ,9202
|
111
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=BZ7wtkYvvcKtv8jUOI1n6wsSramt-Ob5faP7UeqrfCU,6710
|
112
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
|
+
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
115
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
|
116
|
+
magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
|
117
|
+
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
118
|
+
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
119
|
+
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=UT__wzKQ4tVxlxgFacDqJfTyBU911CTJXD_6CTw6iS8,516
|
121
|
+
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
|
+
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
123
|
+
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
|
+
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=keSvrxuTVqc8PbNenwb43VDhJqqzp0ayxK691kxClac,2702
|
104
125
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
126
|
magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
|
106
127
|
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
@@ -112,7 +133,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
|
|
112
133
|
magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
|
113
134
|
magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
|
114
135
|
magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
|
115
|
-
magic_pdf/para/para_split_v3.py,sha256=
|
136
|
+
magic_pdf/para/para_split_v3.py,sha256=vSJ5_QqGKP1rbTbGQg5ONNpybidpTdbgXZgTGd2bGsw,14539
|
116
137
|
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
117
138
|
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
118
139
|
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
@@ -155,7 +176,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
|
|
155
176
|
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
156
177
|
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
157
178
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
158
|
-
magic_pdf/resources/model_config/model_configs.yaml,sha256=
|
179
|
+
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
159
180
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
160
181
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
161
182
|
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
@@ -167,12 +188,12 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
|
|
167
188
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
168
189
|
magic_pdf/tools/cli.py,sha256=yl2E-DYxBN3XF7bWOBseYxptbmeE7tXWpwV-sp2aGIE,3140
|
169
190
|
magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
|
170
|
-
magic_pdf/tools/common.py,sha256=
|
191
|
+
magic_pdf/tools/common.py,sha256=oo6DsbriyQv0azRNZSt4B-13eXvsMsPgE_kwgO0-aM8,7364
|
171
192
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
172
193
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
173
|
-
magic_pdf-0.9.
|
174
|
-
magic_pdf-0.9.
|
175
|
-
magic_pdf-0.9.
|
176
|
-
magic_pdf-0.9.
|
177
|
-
magic_pdf-0.9.
|
178
|
-
magic_pdf-0.9.
|
194
|
+
magic_pdf-0.9.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
195
|
+
magic_pdf-0.9.3.dist-info/METADATA,sha256=IpWvg-cnoZ9euLIh_3PYmPGh-DCQ8n8Lp2Ar4oyUfuc,40128
|
196
|
+
magic_pdf-0.9.3.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
|
197
|
+
magic_pdf-0.9.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
198
|
+
magic_pdf-0.9.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
199
|
+
magic_pdf-0.9.3.dist-info/RECORD,,
|
@@ -1,36 +0,0 @@
|
|
1
|
-
import re
|
2
|
-
|
3
|
-
def layout_rm_equation(layout_res):
|
4
|
-
rm_idxs = []
|
5
|
-
for idx, ele in enumerate(layout_res['layout_dets']):
|
6
|
-
if ele['category_id'] == 10:
|
7
|
-
rm_idxs.append(idx)
|
8
|
-
|
9
|
-
for idx in rm_idxs[::-1]:
|
10
|
-
del layout_res['layout_dets'][idx]
|
11
|
-
return layout_res
|
12
|
-
|
13
|
-
|
14
|
-
def get_croped_image(image_pil, bbox):
|
15
|
-
x_min, y_min, x_max, y_max = bbox
|
16
|
-
croped_img = image_pil.crop((x_min, y_min, x_max, y_max))
|
17
|
-
return croped_img
|
18
|
-
|
19
|
-
|
20
|
-
def latex_rm_whitespace(s: str):
|
21
|
-
"""Remove unnecessary whitespace from LaTeX code.
|
22
|
-
"""
|
23
|
-
text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
|
24
|
-
letter = '[a-zA-Z]'
|
25
|
-
noletter = '[\W_^\d]'
|
26
|
-
names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)]
|
27
|
-
s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
|
28
|
-
news = s
|
29
|
-
while True:
|
30
|
-
s = news
|
31
|
-
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s)
|
32
|
-
news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news)
|
33
|
-
news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news)
|
34
|
-
if news == s:
|
35
|
-
break
|
36
|
-
return s
|