magic-pdf 1.2.0__py3-none-any.whl → 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/dict2md/ocr_mkcontent.py +2 -5
- magic_pdf/libs/performance_stats.py +54 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +1 -5
- magic_pdf/model/magic_model.py +1 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +74 -9
- magic_pdf/pdf_parse_union_core_v2.py +7 -3
- magic_pdf/post_proc/para_split_v3.py +16 -13
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -1
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/METADATA +6 -2
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/RECORD +15 -14
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/WHEEL +0 -0
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.0.dist-info → magic_pdf-1.2.2.dist-info}/top_level.txt +0 -0
@@ -138,12 +138,9 @@ def full_to_half(text: str) -> str:
|
|
138
138
|
result = []
|
139
139
|
for char in text:
|
140
140
|
code = ord(char)
|
141
|
-
# Full-width
|
142
|
-
if
|
141
|
+
# Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9)
|
142
|
+
if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19):
|
143
143
|
result.append(chr(code - 0xFEE0)) # Shift to ASCII range
|
144
|
-
# Full-width space
|
145
|
-
elif code == 0x3000:
|
146
|
-
result.append(' ')
|
147
144
|
else:
|
148
145
|
result.append(char)
|
149
146
|
return ''.join(result)
|
@@ -0,0 +1,54 @@
|
|
1
|
+
import time
|
2
|
+
import functools
|
3
|
+
from collections import defaultdict
|
4
|
+
from typing import Dict, List
|
5
|
+
|
6
|
+
|
7
|
+
class PerformanceStats:
|
8
|
+
"""性能统计类,用于收集和展示方法执行时间"""
|
9
|
+
|
10
|
+
_stats: Dict[str, List[float]] = defaultdict(list)
|
11
|
+
|
12
|
+
@classmethod
|
13
|
+
def add_execution_time(cls, func_name: str, execution_time: float):
|
14
|
+
"""添加执行时间记录"""
|
15
|
+
cls._stats[func_name].append(execution_time)
|
16
|
+
|
17
|
+
@classmethod
|
18
|
+
def get_stats(cls) -> Dict[str, dict]:
|
19
|
+
"""获取统计结果"""
|
20
|
+
results = {}
|
21
|
+
for func_name, times in cls._stats.items():
|
22
|
+
results[func_name] = {
|
23
|
+
'count': len(times),
|
24
|
+
'total_time': sum(times),
|
25
|
+
'avg_time': sum(times) / len(times),
|
26
|
+
'min_time': min(times),
|
27
|
+
'max_time': max(times)
|
28
|
+
}
|
29
|
+
return results
|
30
|
+
|
31
|
+
@classmethod
|
32
|
+
def print_stats(cls):
|
33
|
+
"""打印统计结果"""
|
34
|
+
stats = cls.get_stats()
|
35
|
+
print("\n性能统计结果:")
|
36
|
+
print("-" * 80)
|
37
|
+
print(f"{'方法名':<40} {'调用次数':>8} {'总时间(s)':>12} {'平均时间(s)':>12}")
|
38
|
+
print("-" * 80)
|
39
|
+
for func_name, data in stats.items():
|
40
|
+
print(f"{func_name:<40} {data['count']:8d} {data['total_time']:12.6f} {data['avg_time']:12.6f}")
|
41
|
+
|
42
|
+
|
43
|
+
def measure_time(func):
|
44
|
+
"""测量方法执行时间的装饰器"""
|
45
|
+
|
46
|
+
@functools.wraps(func)
|
47
|
+
def wrapper(*args, **kwargs):
|
48
|
+
start_time = time.time()
|
49
|
+
result = func(*args, **kwargs)
|
50
|
+
execution_time = time.time() - start_time
|
51
|
+
PerformanceStats.add_execution_time(func.__name__, execution_time)
|
52
|
+
return result
|
53
|
+
|
54
|
+
return wrapper
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.0"
|
1
|
+
__version__ = "1.2.2"
|
@@ -170,11 +170,7 @@ def doc_analyze(
|
|
170
170
|
gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
|
171
171
|
if gpu_memory is not None and gpu_memory >= 8:
|
172
172
|
|
173
|
-
if gpu_memory >=
|
174
|
-
batch_ratio = 32
|
175
|
-
elif gpu_memory >=20:
|
176
|
-
batch_ratio = 16
|
177
|
-
elif gpu_memory >= 16:
|
173
|
+
if gpu_memory >= 16:
|
178
174
|
batch_ratio = 8
|
179
175
|
elif gpu_memory >= 10:
|
180
176
|
batch_ratio = 4
|
magic_pdf/model/magic_model.py
CHANGED
@@ -528,14 +528,13 @@ class MagicModel:
|
|
528
528
|
pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox'])
|
529
529
|
nearest_dis = float('inf')
|
530
530
|
for i in range(N):
|
531
|
-
if i in seen_idx:continue
|
531
|
+
if i in seen_idx or i == sub_idx:continue
|
532
532
|
nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox']))
|
533
533
|
|
534
534
|
if pair_dis >= 3*nearest_dis:
|
535
535
|
seen_idx.add(sub_idx)
|
536
536
|
continue
|
537
537
|
|
538
|
-
|
539
538
|
seen_idx.add(sub_idx)
|
540
539
|
seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
|
541
540
|
seen_sub_idx.add(sub_idx)
|
@@ -100,20 +100,61 @@ class UnimernetModel(object):
|
|
100
100
|
res["latex"] = latex_rm_whitespace(latex)
|
101
101
|
return formula_list
|
102
102
|
|
103
|
-
def batch_predict(
|
104
|
-
|
105
|
-
) -> list:
|
103
|
+
# def batch_predict(
|
104
|
+
# self, images_mfd_res: list, images: list, batch_size: int = 64
|
105
|
+
# ) -> list:
|
106
|
+
# images_formula_list = []
|
107
|
+
# mf_image_list = []
|
108
|
+
# backfill_list = []
|
109
|
+
# for image_index in range(len(images_mfd_res)):
|
110
|
+
# mfd_res = images_mfd_res[image_index]
|
111
|
+
# pil_img = Image.fromarray(images[image_index])
|
112
|
+
# formula_list = []
|
113
|
+
#
|
114
|
+
# for xyxy, conf, cla in zip(
|
115
|
+
# mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
|
116
|
+
# ):
|
117
|
+
# xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
|
118
|
+
# new_item = {
|
119
|
+
# "category_id": 13 + int(cla.item()),
|
120
|
+
# "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
|
121
|
+
# "score": round(float(conf.item()), 2),
|
122
|
+
# "latex": "",
|
123
|
+
# }
|
124
|
+
# formula_list.append(new_item)
|
125
|
+
# bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
|
126
|
+
# mf_image_list.append(bbox_img)
|
127
|
+
#
|
128
|
+
# images_formula_list.append(formula_list)
|
129
|
+
# backfill_list += formula_list
|
130
|
+
#
|
131
|
+
# dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
|
132
|
+
# dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
|
133
|
+
# mfr_res = []
|
134
|
+
# for mf_img in dataloader:
|
135
|
+
# mf_img = mf_img.to(self.device)
|
136
|
+
# with torch.no_grad():
|
137
|
+
# output = self.model.generate({"image": mf_img})
|
138
|
+
# mfr_res.extend(output["pred_str"])
|
139
|
+
# for res, latex in zip(backfill_list, mfr_res):
|
140
|
+
# res["latex"] = latex_rm_whitespace(latex)
|
141
|
+
# return images_formula_list
|
142
|
+
|
143
|
+
def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
|
106
144
|
images_formula_list = []
|
107
145
|
mf_image_list = []
|
108
146
|
backfill_list = []
|
147
|
+
image_info = [] # Store (area, original_index, image) tuples
|
148
|
+
|
149
|
+
# Collect images with their original indices
|
109
150
|
for image_index in range(len(images_mfd_res)):
|
110
151
|
mfd_res = images_mfd_res[image_index]
|
111
152
|
pil_img = Image.fromarray(images[image_index])
|
112
153
|
formula_list = []
|
113
154
|
|
114
|
-
for xyxy, conf, cla in zip(
|
115
|
-
|
116
|
-
):
|
155
|
+
for idx, (xyxy, conf, cla) in enumerate(zip(
|
156
|
+
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
|
157
|
+
)):
|
117
158
|
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
|
118
159
|
new_item = {
|
119
160
|
"category_id": 13 + int(cla.item()),
|
@@ -123,19 +164,43 @@ class UnimernetModel(object):
|
|
123
164
|
}
|
124
165
|
formula_list.append(new_item)
|
125
166
|
bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
|
167
|
+
area = (xmax - xmin) * (ymax - ymin)
|
168
|
+
|
169
|
+
curr_idx = len(mf_image_list)
|
170
|
+
image_info.append((area, curr_idx, bbox_img))
|
126
171
|
mf_image_list.append(bbox_img)
|
127
172
|
|
128
173
|
images_formula_list.append(formula_list)
|
129
174
|
backfill_list += formula_list
|
130
175
|
|
131
|
-
|
176
|
+
# Stable sort by area
|
177
|
+
image_info.sort(key=lambda x: x[0]) # sort by area
|
178
|
+
sorted_indices = [x[1] for x in image_info]
|
179
|
+
sorted_images = [x[2] for x in image_info]
|
180
|
+
|
181
|
+
# Create mapping for results
|
182
|
+
index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
|
183
|
+
|
184
|
+
# Create dataset with sorted images
|
185
|
+
dataset = MathDataset(sorted_images, transform=self.mfr_transform)
|
132
186
|
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
|
187
|
+
|
188
|
+
# Process batches and store results
|
133
189
|
mfr_res = []
|
134
190
|
for mf_img in dataloader:
|
135
191
|
mf_img = mf_img.to(self.device)
|
136
192
|
with torch.no_grad():
|
137
193
|
output = self.model.generate({"image": mf_img})
|
138
194
|
mfr_res.extend(output["pred_str"])
|
139
|
-
|
140
|
-
|
195
|
+
|
196
|
+
# Restore original order
|
197
|
+
unsorted_results = [""] * len(mfr_res)
|
198
|
+
for new_idx, latex in enumerate(mfr_res):
|
199
|
+
original_idx = index_mapping[new_idx]
|
200
|
+
unsorted_results[original_idx] = latex_rm_whitespace(latex)
|
201
|
+
|
202
|
+
# Fill results back
|
203
|
+
for res, latex in zip(backfill_list, unsorted_results):
|
204
|
+
res["latex"] = latex
|
205
|
+
|
141
206
|
return images_formula_list
|
@@ -21,9 +21,12 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
|
|
21
21
|
from magic_pdf.libs.convert_utils import dict_to_list
|
22
22
|
from magic_pdf.libs.hash_utils import compute_md5
|
23
23
|
from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
|
24
|
+
from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
|
24
25
|
from magic_pdf.model.magic_model import MagicModel
|
25
26
|
from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
|
26
27
|
|
28
|
+
from concurrent.futures import ThreadPoolExecutor
|
29
|
+
|
27
30
|
try:
|
28
31
|
import torchtext
|
29
32
|
|
@@ -215,7 +218,7 @@ def calculate_contrast(img, img_mode) -> float:
|
|
215
218
|
# logger.info(f"contrast: {contrast}")
|
216
219
|
return round(contrast, 2)
|
217
220
|
|
218
|
-
|
221
|
+
# @measure_time
|
219
222
|
def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
|
220
223
|
# cid用0xfffd表示,连字符拆开
|
221
224
|
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
|
@@ -489,7 +492,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
|
|
489
492
|
else:
|
490
493
|
return [[x0, y0, x1, y1]]
|
491
494
|
|
492
|
-
|
495
|
+
# @measure_time
|
493
496
|
def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
|
494
497
|
page_line_list = []
|
495
498
|
|
@@ -923,7 +926,6 @@ def pdf_parse_union(
|
|
923
926
|
magic_model = MagicModel(model_list, dataset)
|
924
927
|
|
925
928
|
"""根据输入的起始范围解析pdf"""
|
926
|
-
# end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
|
927
929
|
end_page_id = (
|
928
930
|
end_page_id
|
929
931
|
if end_page_id is not None and end_page_id >= 0
|
@@ -960,6 +962,8 @@ def pdf_parse_union(
|
|
960
962
|
)
|
961
963
|
pdf_info_dict[f'page_{page_id}'] = page_info
|
962
964
|
|
965
|
+
# PerformanceStats.print_stats()
|
966
|
+
|
963
967
|
"""分段"""
|
964
968
|
para_split(pdf_info_dict)
|
965
969
|
|
@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
|
|
108
108
|
):
|
109
109
|
multiple_para_flag = True
|
110
110
|
|
111
|
-
|
112
|
-
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
113
|
-
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
114
|
-
if (
|
115
|
-
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
116
|
-
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
117
|
-
):
|
118
|
-
external_sides_not_close_num += 1
|
119
|
-
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
120
|
-
center_close_num += 1
|
111
|
+
block_text = ''
|
121
112
|
|
113
|
+
for line in block['lines']:
|
122
114
|
line_text = ''
|
123
115
|
|
124
116
|
for span in line['spans']:
|
125
117
|
span_type = span['type']
|
126
118
|
if span_type == ContentType.Text:
|
127
119
|
line_text += span['content'].strip()
|
128
|
-
|
129
120
|
# 添加所有文本,包括空行,保持与block['lines']长度一致
|
130
121
|
lines_text_list.append(line_text)
|
131
122
|
block_text = ''.join(lines_text_list)
|
132
|
-
|
133
|
-
|
123
|
+
|
124
|
+
block_lang = detect_lang(block_text)
|
125
|
+
# logger.info(f"block_lang: {block_lang}")
|
126
|
+
|
127
|
+
for line in block['lines']:
|
128
|
+
line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
|
129
|
+
block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
|
130
|
+
if (
|
131
|
+
line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
|
132
|
+
and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
|
133
|
+
):
|
134
|
+
external_sides_not_close_num += 1
|
135
|
+
if abs(line_mid_x - block_mid_x) < line_height / 2:
|
136
|
+
center_close_num += 1
|
134
137
|
|
135
138
|
# 计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
|
136
139
|
if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:
|
@@ -64,7 +64,7 @@ def span_block_type_compatible(span_type, block_type):
|
|
64
64
|
if span_type in [ContentType.Text, ContentType.InlineEquation]:
|
65
65
|
return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
|
66
66
|
elif span_type == ContentType.InterlineEquation:
|
67
|
-
return block_type in [BlockType.InterlineEquation]
|
67
|
+
return block_type in [BlockType.InterlineEquation, BlockType.Text]
|
68
68
|
elif span_type == ContentType.Image:
|
69
69
|
return block_type in [BlockType.ImageBody]
|
70
70
|
elif span_type == ContentType.Table:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.2.0
|
3
|
+
Version: 1.2.2
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -9,7 +9,7 @@ License-File: LICENSE.md
|
|
9
9
|
Requires-Dist: boto3>=1.28.43
|
10
10
|
Requires-Dist: Brotli>=1.1.0
|
11
11
|
Requires-Dist: click>=8.1.7
|
12
|
-
Requires-Dist: fast-langdetect
|
12
|
+
Requires-Dist: fast-langdetect<0.3.0,>=0.2.3
|
13
13
|
Requires-Dist: loguru>=0.6.0
|
14
14
|
Requires-Dist: numpy<2.0.0,>=1.21.6
|
15
15
|
Requires-Dist: pydantic>=2.7.2
|
@@ -94,6 +94,10 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
94
94
|
</div>
|
95
95
|
|
96
96
|
# Changelog
|
97
|
+
- 2025/03/03 1.2.1 released, fixed several bugs:
|
98
|
+
- Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers
|
99
|
+
- Fixed caption matching inaccuracies in certain scenarios
|
100
|
+
- Fixed formula span loss issues in certain scenarios
|
97
101
|
- 2025/02/24 1.2.0 released. This version includes several fixes and improvements to enhance parsing efficiency and accuracy:
|
98
102
|
- Performance Optimization
|
99
103
|
- Increased classification speed for PDF documents in auto mode.
|
@@ -1,5 +1,5 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=
|
2
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=Pt3UtPQgOrF2YudQqrwVVC767_271E-LRg2aUsiggXg,38435
|
3
3
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
4
|
magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
|
5
5
|
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
@@ -24,7 +24,7 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
24
24
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
25
25
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
26
26
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
27
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=12WeBVxnBzzruk8CfYqqsV2dpH-mDWmE4Osl1RlRoc8,13741
|
28
28
|
magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
|
29
29
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
|
30
30
|
magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
|
@@ -49,12 +49,13 @@ magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg
|
|
49
49
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
50
50
|
magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3396
|
51
51
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
52
|
+
magic_pdf/libs/performance_stats.py,sha256=BFi4NIsUYlanznYoTVq4hBpj4NOuShAlWBHzebBGVYM,1702
|
52
53
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
53
|
-
magic_pdf/libs/version.py,sha256=
|
54
|
+
magic_pdf/libs/version.py,sha256=uuf4VNtTNA93fMhoAur9YafzaKJFnczY-H1SSCSuRVQ,22
|
54
55
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
55
56
|
magic_pdf/model/batch_analyze.py,sha256=sbrgOJWycb1Ep6e62CPi6jEyG6VSeklIxc4PmrqaLhM,11933
|
56
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
57
|
-
magic_pdf/model/magic_model.py,sha256=
|
57
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=T0-h4QmSIDXRzgF5uWO4jQrwIot221l26PXU52xeKiA,7933
|
58
|
+
magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
|
58
59
|
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
59
60
|
magic_pdf/model/pdf_extract_kit.py,sha256=Rd51VNZPKRA_tUbDss-b44d84K6WDG2S87a37Ax7HUA,12224
|
60
61
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
@@ -92,7 +93,7 @@ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
92
93
|
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
|
93
94
|
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
94
95
|
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
95
|
-
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=
|
96
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=HPNetRfQeHoHfRTzFEaIjLSHfjrxRvS-EaApMUebZuQ,8020
|
96
97
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
97
98
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
98
99
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -116,12 +117,12 @@ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio
|
|
116
117
|
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
117
118
|
magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
118
119
|
magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
|
119
|
-
magic_pdf/post_proc/para_split_v3.py,sha256=
|
120
|
+
magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0cIImGH8,16975
|
120
121
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
121
122
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
122
123
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
123
124
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
124
|
-
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=
|
125
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=4Z3aHZ9sxzijkVpOCENslvUcpp7DXgNID4Gl3pxwIg4,5512
|
125
126
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=xrgC9vR0poklZuY4Og41pZVdXzuaGFg3BnQ01X60dpo,3102
|
126
127
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
|
127
128
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
@@ -138,9 +139,9 @@ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,838
|
|
138
139
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
139
140
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
140
141
|
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
141
|
-
magic_pdf-1.2.
|
142
|
-
magic_pdf-1.2.
|
143
|
-
magic_pdf-1.2.
|
144
|
-
magic_pdf-1.2.
|
145
|
-
magic_pdf-1.2.
|
146
|
-
magic_pdf-1.2.
|
142
|
+
magic_pdf-1.2.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-1.2.2.dist-info/METADATA,sha256=FYzj0yWzmFAG4mQ22DH9F4KZfqexNg7YuhgiXMHc9Ug,41001
|
144
|
+
magic_pdf-1.2.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-1.2.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-1.2.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-1.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|