magic-pdf 1.1.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +24 -0
- magic_pdf/filter/__init__.py +1 -1
- magic_pdf/filter/pdf_classify_by_type.py +6 -4
- magic_pdf/filter/pdf_meta_scan.py +4 -4
- magic_pdf/libs/pdf_check.py +11 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +31 -39
- magic_pdf/model/magic_model.py +161 -4
- magic_pdf/model/pdf_extract_kit.py +0 -7
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +4 -3
- magic_pdf/model/sub_modules/model_init.py +28 -14
- magic_pdf/pdf_parse_union_core_v2.py +45 -32
- magic_pdf/post_proc/llm_aided.py +14 -16
- magic_pdf/pre_proc/ocr_dict_merge.py +14 -2
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/METADATA +49 -41
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/RECORD +20 -21
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/WHEEL +1 -1
- magic_pdf/post_proc/llm_aided_ocr.py +0 -689
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.1.0.dist-info → magic_pdf-1.2.0.dist-info}/top_level.txt +0 -0
@@ -6,8 +6,10 @@ import statistics
|
|
6
6
|
import time
|
7
7
|
from typing import List
|
8
8
|
|
9
|
+
import cv2
|
9
10
|
import fitz
|
10
11
|
import torch
|
12
|
+
import numpy as np
|
11
13
|
from loguru import logger
|
12
14
|
|
13
15
|
from magic_pdf.config.enums import SupportedPdfParseMethod
|
@@ -127,16 +129,15 @@ def fill_char_in_spans(spans, all_chars):
|
|
127
129
|
span['chars'].append(char)
|
128
130
|
break
|
129
131
|
|
130
|
-
|
131
|
-
|
132
|
+
need_ocr_spans = []
|
132
133
|
for span in spans:
|
133
134
|
chars_to_content(span)
|
134
135
|
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
|
135
136
|
if len(span['content']) * span['height'] < span['width'] * 0.5:
|
136
137
|
# logger.info(f"maybe empty span: {len(span['content'])}, {span['height']}, {span['width']}")
|
137
|
-
|
138
|
+
need_ocr_spans.append(span)
|
138
139
|
del span['height'], span['width']
|
139
|
-
return
|
140
|
+
return need_ocr_spans
|
140
141
|
|
141
142
|
|
142
143
|
# 使用鲁棒性更强的中心点坐标判断
|
@@ -190,6 +191,31 @@ def remove_tilted_line(text_blocks):
|
|
190
191
|
block['lines'].remove(line)
|
191
192
|
|
192
193
|
|
194
|
+
def calculate_contrast(img, img_mode) -> float:
|
195
|
+
"""
|
196
|
+
计算给定图像的对比度。
|
197
|
+
:param img: 图像,类型为numpy.ndarray
|
198
|
+
:Param img_mode = 图像的色彩通道,'rgb' 或 'bgr'
|
199
|
+
:return: 图像的对比度值
|
200
|
+
"""
|
201
|
+
if img_mode == 'rgb':
|
202
|
+
# 将RGB图像转换为灰度图
|
203
|
+
gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
|
204
|
+
elif img_mode == 'bgr':
|
205
|
+
# 将BGR图像转换为灰度图
|
206
|
+
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
207
|
+
else:
|
208
|
+
raise ValueError("Invalid image mode. Please provide 'rgb' or 'bgr'.")
|
209
|
+
|
210
|
+
# 计算均值和标准差
|
211
|
+
mean_value = np.mean(gray_img)
|
212
|
+
std_dev = np.std(gray_img)
|
213
|
+
# 对比度定义为标准差除以平均值(加上小常数避免除零错误)
|
214
|
+
contrast = std_dev / (mean_value + 1e-6)
|
215
|
+
# logger.info(f"contrast: {contrast}")
|
216
|
+
return round(contrast, 2)
|
217
|
+
|
218
|
+
|
193
219
|
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
194
220
|
# cid用0xfffd表示,连字符拆开
|
195
221
|
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
@@ -274,9 +300,9 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
274
300
|
span['chars'] = []
|
275
301
|
new_spans.append(span)
|
276
302
|
|
277
|
-
|
303
|
+
need_ocr_spans = fill_char_in_spans(new_spans, all_pymu_chars)
|
278
304
|
|
279
|
-
if len(
|
305
|
+
if len(need_ocr_spans) > 0:
|
280
306
|
|
281
307
|
# 初始化ocr模型
|
282
308
|
atom_model_manager = AtomModelSingleton()
|
@@ -287,9 +313,15 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
287
313
|
lang=lang
|
288
314
|
)
|
289
315
|
|
290
|
-
for span in
|
316
|
+
for span in need_ocr_spans:
|
291
317
|
# 对span的bbox截图再ocr
|
292
318
|
span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
|
319
|
+
|
320
|
+
# 计算span的对比度,低于0.20的span不进行ocr
|
321
|
+
if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
|
322
|
+
spans.remove(span)
|
323
|
+
continue
|
324
|
+
|
293
325
|
ocr_res = ocr_model.ocr(span_img, det=False)
|
294
326
|
if ocr_res and len(ocr_res) > 0:
|
295
327
|
if len(ocr_res[0]) > 0:
|
@@ -306,24 +338,7 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
|
|
306
338
|
|
307
339
|
def model_init(model_name: str):
|
308
340
|
from transformers import LayoutLMv3ForTokenClassification
|
309
|
-
device = get_device()
|
310
|
-
if torch.cuda.is_available():
|
311
|
-
device = torch.device('cuda')
|
312
|
-
if torch.cuda.is_bf16_supported():
|
313
|
-
supports_bfloat16 = True
|
314
|
-
else:
|
315
|
-
supports_bfloat16 = False
|
316
|
-
elif str(device).startswith("npu"):
|
317
|
-
import torch_npu
|
318
|
-
if torch_npu.npu.is_available():
|
319
|
-
device = torch.device('npu')
|
320
|
-
supports_bfloat16 = False
|
321
|
-
else:
|
322
|
-
device = torch.device('cpu')
|
323
|
-
supports_bfloat16 = False
|
324
|
-
else:
|
325
|
-
device = torch.device('cpu')
|
326
|
-
supports_bfloat16 = False
|
341
|
+
device = torch.device(get_device())
|
327
342
|
|
328
343
|
if model_name == 'layoutreader':
|
329
344
|
# 检测modelscope的缓存目录是否存在
|
@@ -339,9 +354,6 @@ def model_init(model_name: str):
|
|
339
354
|
model = LayoutLMv3ForTokenClassification.from_pretrained(
|
340
355
|
'hantian/layoutreader'
|
341
356
|
)
|
342
|
-
# 检查设备是否支持 bfloat16
|
343
|
-
if supports_bfloat16:
|
344
|
-
model.bfloat16()
|
345
357
|
model.to(device).eval()
|
346
358
|
else:
|
347
359
|
logger.error('model name not allow')
|
@@ -404,10 +416,11 @@ def cal_block_index(fix_blocks, sorted_bboxes):
|
|
404
416
|
block_bboxes.append(block['bbox'])
|
405
417
|
|
406
418
|
# 删除图表body block中的虚拟line信息, 并用real_lines信息回填
|
407
|
-
if block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
|
408
|
-
|
409
|
-
|
410
|
-
|
419
|
+
if block['type'] in [BlockType.ImageBody, BlockType.TableBody, BlockType.Title, BlockType.InterlineEquation]:
|
420
|
+
if 'real_lines' in block:
|
421
|
+
block['virtual_lines'] = copy.deepcopy(block['lines'])
|
422
|
+
block['lines'] = copy.deepcopy(block['real_lines'])
|
423
|
+
del block['real_lines']
|
411
424
|
|
412
425
|
import numpy as np
|
413
426
|
|
magic_pdf/post_proc/llm_aided.py
CHANGED
@@ -3,6 +3,7 @@ import json
|
|
3
3
|
from loguru import logger
|
4
4
|
from magic_pdf.dict2md.ocr_mkcontent import merge_para_with_text
|
5
5
|
from openai import OpenAI
|
6
|
+
import ast
|
6
7
|
|
7
8
|
|
8
9
|
#@todo: 有的公式以"\"结尾,这样会导致尾部拼接的"$"被转义,也需要修复
|
@@ -119,11 +120,12 @@ def llm_aided_title(pdf_info_dict, title_aided_config):
|
|
119
120
|
- 在完成初步分级后,仔细检查分级结果的合理性
|
120
121
|
- 根据上下文关系和逻辑顺序,对不合理的分级进行微调
|
121
122
|
- 确保最终的分级结果符合文档的实际结构和逻辑
|
123
|
+
- 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们
|
122
124
|
|
123
125
|
IMPORTANT:
|
124
|
-
|
125
|
-
{{
|
126
|
-
|
126
|
+
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
|
127
|
+
{{0:1,1:2,2:2,3:3}}
|
128
|
+
不需要对字典格式化,不需要返回任何其他信息。
|
127
129
|
|
128
130
|
Input title list:
|
129
131
|
{title_dict}
|
@@ -133,7 +135,7 @@ Corrected title list:
|
|
133
135
|
|
134
136
|
retry_count = 0
|
135
137
|
max_retries = 3
|
136
|
-
|
138
|
+
dict_completion = None
|
137
139
|
|
138
140
|
while retry_count < max_retries:
|
139
141
|
try:
|
@@ -143,24 +145,20 @@ Corrected title list:
|
|
143
145
|
{'role': 'user', 'content': title_optimize_prompt}],
|
144
146
|
temperature=0.7,
|
145
147
|
)
|
146
|
-
|
148
|
+
# logger.info(f"Title completion: {completion.choices[0].message.content}")
|
149
|
+
dict_completion = ast.literal_eval(completion.choices[0].message.content)
|
150
|
+
# logger.info(f"len(dict_completion): {len(dict_completion)}, len(title_dict): {len(title_dict)}")
|
147
151
|
|
148
|
-
|
149
|
-
# logger.info(f"len(json_completion): {len(json_completion)}, len(title_dict): {len(title_dict)}")
|
150
|
-
|
151
|
-
if len(json_completion) == len(title_dict):
|
152
|
+
if len(dict_completion) == len(title_dict):
|
152
153
|
for i, origin_title_block in enumerate(origin_title_list):
|
153
|
-
origin_title_block["level"] = int(
|
154
|
+
origin_title_block["level"] = int(dict_completion[i])
|
154
155
|
break
|
155
156
|
else:
|
156
157
|
logger.warning("The number of titles in the optimized result is not equal to the number of titles in the input.")
|
157
158
|
retry_count += 1
|
158
159
|
except Exception as e:
|
159
|
-
|
160
|
-
logger.warning(f"JSON decode error on attempt {retry_count + 1}: {e}")
|
161
|
-
else:
|
162
|
-
logger.exception(e)
|
160
|
+
logger.exception(e)
|
163
161
|
retry_count += 1
|
164
162
|
|
165
|
-
if
|
166
|
-
logger.error("Failed to decode
|
163
|
+
if dict_completion is None:
|
164
|
+
logger.error("Failed to decode dict after maximum retries.")
|
@@ -60,6 +60,19 @@ def merge_spans_to_line(spans, threshold=0.6):
|
|
60
60
|
return lines
|
61
61
|
|
62
62
|
|
63
|
+
def span_block_type_compatible(span_type, block_type):
|
64
|
+
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
65
|
+
return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
|
66
|
+
elif span_type == ContentType.InterlineEquation:
|
67
|
+
return block_type in [BlockType.InterlineEquation]
|
68
|
+
elif span_type == ContentType.Image:
|
69
|
+
return block_type in [BlockType.ImageBody]
|
70
|
+
elif span_type == ContentType.Table:
|
71
|
+
return block_type in [BlockType.TableBody]
|
72
|
+
else:
|
73
|
+
return False
|
74
|
+
|
75
|
+
|
63
76
|
def fill_spans_in_blocks(blocks, spans, radio):
|
64
77
|
"""将allspans中的span按位置关系,放入blocks中."""
|
65
78
|
block_with_spans = []
|
@@ -78,8 +91,7 @@ def fill_spans_in_blocks(blocks, spans, radio):
|
|
78
91
|
block_spans = []
|
79
92
|
for span in spans:
|
80
93
|
span_bbox = span['bbox']
|
81
|
-
if calculate_overlap_area_in_bbox1_area_ratio(
|
82
|
-
span_bbox, block_bbox) > radio:
|
94
|
+
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio and span_block_type_compatible(span['type'], block_type):
|
83
95
|
block_spans.append(span)
|
84
96
|
|
85
97
|
block_dict['spans'] = block_spans
|
@@ -1,49 +1,49 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.2.0
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
7
7
|
Description-Content-Type: text/markdown
|
8
8
|
License-File: LICENSE.md
|
9
|
-
Requires-Dist: boto3
|
10
|
-
Requires-Dist: Brotli
|
11
|
-
Requires-Dist: click
|
12
|
-
Requires-Dist: fast-langdetect
|
13
|
-
Requires-Dist: loguru
|
14
|
-
Requires-Dist: numpy
|
15
|
-
Requires-Dist: pydantic
|
16
|
-
Requires-Dist: PyMuPDF
|
17
|
-
Requires-Dist: scikit-learn
|
18
|
-
Requires-Dist: torch
|
9
|
+
Requires-Dist: boto3>=1.28.43
|
10
|
+
Requires-Dist: Brotli>=1.1.0
|
11
|
+
Requires-Dist: click>=8.1.7
|
12
|
+
Requires-Dist: fast-langdetect>=0.2.3
|
13
|
+
Requires-Dist: loguru>=0.6.0
|
14
|
+
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
|
+
Requires-Dist: pydantic>=2.7.2
|
16
|
+
Requires-Dist: PyMuPDF<=1.24.14,>=1.24.9
|
17
|
+
Requires-Dist: scikit-learn>=1.0.2
|
18
|
+
Requires-Dist: torch>=2.2.2
|
19
19
|
Requires-Dist: transformers
|
20
|
-
Requires-Dist: pdfminer.six
|
20
|
+
Requires-Dist: pdfminer.six==20231228
|
21
21
|
Provides-Extra: full
|
22
|
-
Requires-Dist: unimernet
|
23
|
-
Requires-Dist: torch
|
24
|
-
Requires-Dist: torchvision
|
25
|
-
Requires-Dist: ultralytics
|
26
|
-
Requires-Dist: paddleocr
|
27
|
-
Requires-Dist: struct-eqtable
|
28
|
-
Requires-Dist: einops
|
29
|
-
Requires-Dist: accelerate
|
30
|
-
Requires-Dist: doclayout-yolo
|
31
|
-
Requires-Dist: rapidocr-paddle
|
32
|
-
Requires-Dist: rapidocr-onnxruntime
|
33
|
-
Requires-Dist: rapid-table
|
34
|
-
Requires-Dist: PyYAML
|
35
|
-
Requires-Dist: openai
|
36
|
-
Requires-Dist: detectron2
|
37
|
-
Requires-Dist:
|
38
|
-
Requires-Dist:
|
39
|
-
Requires-Dist: matplotlib
|
40
|
-
Requires-Dist: paddlepaddle
|
22
|
+
Requires-Dist: unimernet==0.2.3; extra == "full"
|
23
|
+
Requires-Dist: torch<=2.3.1,>=2.2.2; extra == "full"
|
24
|
+
Requires-Dist: torchvision<=0.18.1,>=0.17.2; extra == "full"
|
25
|
+
Requires-Dist: ultralytics>=8.3.48; extra == "full"
|
26
|
+
Requires-Dist: paddleocr==2.7.3; extra == "full"
|
27
|
+
Requires-Dist: struct-eqtable==0.3.2; extra == "full"
|
28
|
+
Requires-Dist: einops; extra == "full"
|
29
|
+
Requires-Dist: accelerate; extra == "full"
|
30
|
+
Requires-Dist: doclayout-yolo==0.0.2b1; extra == "full"
|
31
|
+
Requires-Dist: rapidocr-paddle<2.0.0,>=1.4.5; extra == "full"
|
32
|
+
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.4; extra == "full"
|
33
|
+
Requires-Dist: rapid-table<2.0.0,>=1.0.3; extra == "full"
|
34
|
+
Requires-Dist: PyYAML; extra == "full"
|
35
|
+
Requires-Dist: openai; extra == "full"
|
36
|
+
Requires-Dist: detectron2; extra == "full"
|
37
|
+
Requires-Dist: matplotlib; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
38
|
+
Requires-Dist: paddlepaddle==3.0.0rc1; (platform_system == "Linux" or platform_system == "Darwin") and extra == "full"
|
39
|
+
Requires-Dist: matplotlib<=3.9.0; platform_system == "Windows" and extra == "full"
|
40
|
+
Requires-Dist: paddlepaddle==2.6.1; platform_system == "Windows" and extra == "full"
|
41
41
|
Provides-Extra: lite
|
42
|
-
Requires-Dist: paddleocr
|
43
|
-
Requires-Dist: paddlepaddle
|
44
|
-
Requires-Dist: paddlepaddle
|
42
|
+
Requires-Dist: paddleocr==2.7.3; extra == "lite"
|
43
|
+
Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "lite"
|
44
|
+
Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_system == "Darwin") and extra == "lite"
|
45
45
|
Provides-Extra: old_linux
|
46
|
-
Requires-Dist: albumentations
|
46
|
+
Requires-Dist: albumentations<=1.4.20; extra == "old-linux"
|
47
47
|
|
48
48
|
<div align="center" xmlns="http://www.w3.org/1999/html">
|
49
49
|
<!-- logo -->
|
@@ -94,6 +94,15 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
94
94
|
</div>
|
95
95
|
|
96
96
|
# Changelog
|
97
|
+
- 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
|
98
|
+
- Performance Optimization
|
99
|
+
- Increased classification speed for PDF documents in auto mode.
|
100
|
+
- Parsing Optimization
|
101
|
+
- Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.
|
102
|
+
- Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.
|
103
|
+
- Bug Fixes
|
104
|
+
- Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.
|
105
|
+
- Resolved an issue where title blocks were empty in some cases.
|
97
106
|
- 2025/01/22 1.1.0 released. In this version we have focused on improving parsing accuracy and efficiency:
|
98
107
|
- Model capability upgrade (requires re-executing the [model download process](docs/how_to_download_models_en.md) to obtain incremental updates of model files)
|
99
108
|
- The layout recognition model has been upgraded to the latest `doclayout_yolo(2501)` model, improving layout recognition accuracy.
|
@@ -280,10 +289,9 @@ There are three different ways to experience MinerU:
|
|
280
289
|
|
281
290
|
### Online Demo
|
282
291
|
|
283
|
-
|
284
|
-
[](https://mineru.net/OpenSourceTools/Extractor?source=github)
|
292
|
+
Synced with dev branch updates:
|
285
293
|
|
286
|
-
|
294
|
+
[](https://mineru.net/OpenSourceTools/Extractor?source=github)
|
287
295
|
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
288
296
|
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
289
297
|
|
@@ -292,8 +300,8 @@ Test Version (Synced with dev branch updates, testing new features):
|
|
292
300
|
#### 1. Install magic-pdf
|
293
301
|
|
294
302
|
```bash
|
295
|
-
conda create -n
|
296
|
-
conda activate
|
303
|
+
conda create -n mineru python=3.10
|
304
|
+
conda activate mineru
|
297
305
|
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
|
298
306
|
```
|
299
307
|
|
@@ -353,7 +361,7 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
|
|
353
361
|
```bash
|
354
362
|
wget https://github.com/opendatalab/MinerU/raw/master/docker/global/Dockerfile -O Dockerfile
|
355
363
|
docker build -t mineru:latest .
|
356
|
-
docker run
|
364
|
+
docker run -it --name mineru --gpus=all mineru:latest /bin/bash -c "echo 'source /opt/mineru_venv/bin/activate' >> ~/.bashrc && exec bash"
|
357
365
|
magic-pdf --help
|
358
366
|
```
|
359
367
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
2
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=jIrXgU_gKL4toJ6GsCoDxByszaN8mAr5vrEy_c63ewk,38310
|
3
3
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
|
5
5
|
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
@@ -24,10 +24,10 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
24
24
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
25
25
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
26
26
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
28
|
-
magic_pdf/filter/__init__.py,sha256=
|
29
|
-
magic_pdf/filter/pdf_classify_by_type.py,sha256=
|
30
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
27
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ZZTaiIn18OWuWKGbDdpoOZ3VMhe_3_JKwrKCfzDiSk0,13715
|
28
|
+
magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
|
29
|
+
magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
|
30
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
|
31
31
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
32
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
33
33
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
@@ -47,23 +47,23 @@ magic_pdf/libs/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,14
|
|
47
47
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
48
48
|
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
49
49
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
50
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
50
|
+
magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3396
|
51
51
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
52
52
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
53
|
-
magic_pdf/libs/version.py,sha256=
|
53
|
+
magic_pdf/libs/version.py,sha256=MpAT5hgNoHnTtG1XRD_GV_A7QrHVU6vJjGSw_8qMGA4,22
|
54
54
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
55
55
|
magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
|
56
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
57
|
-
magic_pdf/model/magic_model.py,sha256=
|
56
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=wma0aq6RyxAepEqnaiTJ9_pWWKLVBj39c6xWA85dxzA,8068
|
57
|
+
magic_pdf/model/magic_model.py,sha256=OcKhSJ_PyAAldgpKPiPxi2uuvnj3Sf4SvXi_5Rv0a6Q,30667
|
58
58
|
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
59
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
59
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=Rd51VNZPKRA_tUbDss-b44d84K6WDG2S87a37Ax7HUA,12224
|
60
60
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
61
61
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
62
|
-
magic_pdf/model/sub_modules/model_init.py,sha256=
|
62
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=Ltwi3Nd5PdVVXRF9fto5nImFVg6w-twAMzOLV_F-c3g,7693
|
63
63
|
magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
|
64
64
|
magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
65
65
|
magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
|
66
|
-
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=
|
66
|
+
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=bl2i7kweoJNdj47FlE9h0B_-nNQrMcW9mCLQ1puMEH8,4893
|
67
67
|
magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
68
68
|
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
69
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=gy7rc8poO-Zr8511NJjuBV8Uryq5k3JKrstLtCONg0c,2237
|
@@ -115,14 +115,13 @@ magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1R
|
|
115
115
|
magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
|
116
116
|
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
117
117
|
magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
118
|
-
magic_pdf/post_proc/llm_aided.py,sha256=
|
119
|
-
magic_pdf/post_proc/llm_aided_ocr.py,sha256=89kxzEQVqNGSUtmvgcg2AVDDmgb43bamdRxXbwS2FxQ,33557
|
118
|
+
magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
|
120
119
|
magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
121
120
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
121
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
123
122
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
124
123
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
125
|
-
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=
|
124
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=vrbLIzNIjxrm7PonfHaFdY6qaicc0uIly62SJwgZ5UM,5496
|
126
125
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=xrgC9vR0poklZuY4Og41pZVdXzuaGFg3BnQ01X60dpo,3102
|
127
126
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
|
128
127
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
@@ -139,9 +138,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
|
|
139
138
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
140
139
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
141
140
|
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
142
|
-
magic_pdf-1.
|
143
|
-
magic_pdf-1.
|
144
|
-
magic_pdf-1.
|
145
|
-
magic_pdf-1.
|
146
|
-
magic_pdf-1.
|
147
|
-
magic_pdf-1.
|
141
|
+
magic_pdf-1.2.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
142
|
+
magic_pdf-1.2.0.dist-info/METADATA,sha256=7iel3MItxKhJc1Bbfh_NMbDp8a23k9G1vA8LYEw2k_U,40720
|
143
|
+
magic_pdf-1.2.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
144
|
+
magic_pdf-1.2.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
145
|
+
magic_pdf-1.2.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
146
|
+
magic_pdf-1.2.0.dist-info/RECORD,,
|