magic-pdf 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/batch_build_dataset.py +59 -48
- magic_pdf/data/dataset.py +3 -3
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +10 -4
- magic_pdf/model/doc_analyze_by_custom_model.py +26 -22
- magic_pdf/model/sub_modules/model_utils.py +189 -7
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +5 -1
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +6 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +49 -12
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +2 -2
- magic_pdf/tools/common.py +21 -4
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/METADATA +160 -61
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/RECORD +17 -17
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/WHEEL +0 -0
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.3.1.dist-info → magic_pdf-1.3.3.dist-info}/top_level.txt +0 -0
@@ -103,54 +103,65 @@ def batch_build_dataset(pdf_paths, k, lang=None):
|
|
103
103
|
all_images : list
|
104
104
|
List of all processed images
|
105
105
|
"""
|
106
|
-
# Get page counts for each PDF
|
107
|
-
pdf_info = []
|
108
|
-
total_pages = 0
|
109
106
|
|
107
|
+
results = []
|
110
108
|
for pdf_path in pdf_paths:
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
total_pages += num_pages
|
116
|
-
doc.close()
|
117
|
-
except Exception as e:
|
118
|
-
print(f'Error opening {pdf_path}: {e}')
|
119
|
-
|
120
|
-
# Partition the jobs based on page countEach job has 1 page
|
121
|
-
partitions = partition_array_greedy(pdf_info, k)
|
122
|
-
|
123
|
-
# Process each partition in parallel
|
124
|
-
all_images_h = {}
|
125
|
-
|
126
|
-
with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
|
127
|
-
# Submit one task per partition
|
128
|
-
futures = []
|
129
|
-
for sn, partition in enumerate(partitions):
|
130
|
-
# Get the jobs for this partition
|
131
|
-
partition_jobs = [pdf_info[idx] for idx in partition]
|
132
|
-
|
133
|
-
# Submit the task
|
134
|
-
future = executor.submit(
|
135
|
-
process_pdf_batch,
|
136
|
-
partition_jobs,
|
137
|
-
sn
|
138
|
-
)
|
139
|
-
futures.append(future)
|
140
|
-
# Process results as they complete
|
141
|
-
for i, future in enumerate(concurrent.futures.as_completed(futures)):
|
142
|
-
try:
|
143
|
-
idx, images = future.result()
|
144
|
-
all_images_h[idx] = images
|
145
|
-
except Exception as e:
|
146
|
-
print(f'Error processing partition: {e}')
|
147
|
-
results = [None] * len(pdf_paths)
|
148
|
-
for i in range(len(partitions)):
|
149
|
-
partition = partitions[i]
|
150
|
-
for j in range(len(partition)):
|
151
|
-
with open(pdf_info[partition[j]][0], 'rb') as f:
|
152
|
-
pdf_bytes = f.read()
|
153
|
-
dataset = PymuDocDataset(pdf_bytes, lang=lang)
|
154
|
-
dataset.set_images(all_images_h[i][j])
|
155
|
-
results[partition[j]] = dataset
|
109
|
+
with open(pdf_path, 'rb') as f:
|
110
|
+
pdf_bytes = f.read()
|
111
|
+
dataset = PymuDocDataset(pdf_bytes, lang=lang)
|
112
|
+
results.append(dataset)
|
156
113
|
return results
|
114
|
+
|
115
|
+
|
116
|
+
#
|
117
|
+
# # Get page counts for each PDF
|
118
|
+
# pdf_info = []
|
119
|
+
# total_pages = 0
|
120
|
+
#
|
121
|
+
# for pdf_path in pdf_paths:
|
122
|
+
# try:
|
123
|
+
# doc = fitz.open(pdf_path)
|
124
|
+
# num_pages = len(doc)
|
125
|
+
# pdf_info.append((pdf_path, num_pages))
|
126
|
+
# total_pages += num_pages
|
127
|
+
# doc.close()
|
128
|
+
# except Exception as e:
|
129
|
+
# print(f'Error opening {pdf_path}: {e}')
|
130
|
+
#
|
131
|
+
# # Partition the jobs based on page countEach job has 1 page
|
132
|
+
# partitions = partition_array_greedy(pdf_info, k)
|
133
|
+
#
|
134
|
+
# # Process each partition in parallel
|
135
|
+
# all_images_h = {}
|
136
|
+
#
|
137
|
+
# with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
|
138
|
+
# # Submit one task per partition
|
139
|
+
# futures = []
|
140
|
+
# for sn, partition in enumerate(partitions):
|
141
|
+
# # Get the jobs for this partition
|
142
|
+
# partition_jobs = [pdf_info[idx] for idx in partition]
|
143
|
+
#
|
144
|
+
# # Submit the task
|
145
|
+
# future = executor.submit(
|
146
|
+
# process_pdf_batch,
|
147
|
+
# partition_jobs,
|
148
|
+
# sn
|
149
|
+
# )
|
150
|
+
# futures.append(future)
|
151
|
+
# # Process results as they complete
|
152
|
+
# for i, future in enumerate(concurrent.futures.as_completed(futures)):
|
153
|
+
# try:
|
154
|
+
# idx, images = future.result()
|
155
|
+
# all_images_h[idx] = images
|
156
|
+
# except Exception as e:
|
157
|
+
# print(f'Error processing partition: {e}')
|
158
|
+
# results = [None] * len(pdf_paths)
|
159
|
+
# for i in range(len(partitions)):
|
160
|
+
# partition = partitions[i]
|
161
|
+
# for j in range(len(partition)):
|
162
|
+
# with open(pdf_info[partition[j]][0], 'rb') as f:
|
163
|
+
# pdf_bytes = f.read()
|
164
|
+
# dataset = PymuDocDataset(pdf_bytes, lang=lang)
|
165
|
+
# dataset.set_images(all_images_h[i][j])
|
166
|
+
# results[partition[j]] = dataset
|
167
|
+
# return results
|
magic_pdf/data/dataset.py
CHANGED
@@ -150,7 +150,7 @@ class PymuDocDataset(Dataset):
|
|
150
150
|
elif lang == 'auto':
|
151
151
|
from magic_pdf.model.sub_modules.language_detection.utils import \
|
152
152
|
auto_detect_lang
|
153
|
-
self._lang = auto_detect_lang(
|
153
|
+
self._lang = auto_detect_lang(self._data_bits)
|
154
154
|
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
|
155
155
|
else:
|
156
156
|
self._lang = lang
|
@@ -249,7 +249,7 @@ class ImageDataset(Dataset):
|
|
249
249
|
elif lang == 'auto':
|
250
250
|
from magic_pdf.model.sub_modules.language_detection.utils import \
|
251
251
|
auto_detect_lang
|
252
|
-
self._lang = auto_detect_lang(
|
252
|
+
self._lang = auto_detect_lang(self._data_bits)
|
253
253
|
logger.info(f'lang: {lang}, detect_lang: {self._lang}')
|
254
254
|
else:
|
255
255
|
self._lang = lang
|
@@ -405,4 +405,4 @@ class Doc(PageableData):
|
|
405
405
|
fontsize (int): font size of the text
|
406
406
|
color (list[float] | None): three element tuple which describe the RGB of the board line, None will use the default font color!
|
407
407
|
"""
|
408
|
-
self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
|
408
|
+
self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.3.
|
1
|
+
__version__ = "1.3.3"
|
magic_pdf/model/batch_analyze.py
CHANGED
@@ -30,8 +30,14 @@ class BatchAnalyze:
|
|
30
30
|
|
31
31
|
images_layout_res = []
|
32
32
|
layout_start_time = time.time()
|
33
|
-
|
34
|
-
|
33
|
+
self.model = self.model_manager.get_model(
|
34
|
+
ocr=True,
|
35
|
+
show_log=self.show_log,
|
36
|
+
lang = None,
|
37
|
+
layout_model = self.layout_model,
|
38
|
+
formula_enable = self.formula_enable,
|
39
|
+
table_enable = self.table_enable,
|
40
|
+
)
|
35
41
|
|
36
42
|
images = [image for image, _, _ in images_with_extra_info]
|
37
43
|
|
@@ -143,14 +149,14 @@ class BatchAnalyze:
|
|
143
149
|
if ocr_res:
|
144
150
|
ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
|
145
151
|
ocr_res_list_dict['layout_res'].extend(ocr_result_list)
|
146
|
-
|
152
|
+
|
153
|
+
# det_count += len(ocr_res_list_dict['ocr_res_list'])
|
147
154
|
# logger.info(f'ocr-det time: {round(time.time()-det_start, 2)}, image num: {det_count}')
|
148
155
|
|
149
156
|
|
150
157
|
# 表格识别 table recognition
|
151
158
|
if self.model.apply_table:
|
152
159
|
table_start = time.time()
|
153
|
-
table_count = 0
|
154
160
|
# for table_res_list_dict in table_res_list_all_page:
|
155
161
|
for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
|
156
162
|
_lang = table_res_dict['lang']
|
@@ -146,10 +146,8 @@ def doc_analyze(
|
|
146
146
|
img_dict = page_data.get_image()
|
147
147
|
images.append(img_dict['img'])
|
148
148
|
page_wh_list.append((img_dict['width'], img_dict['height']))
|
149
|
-
|
150
|
-
|
151
|
-
else:
|
152
|
-
images_with_extra_info = [(images[index], ocr, lang) for index in range(len(dataset))]
|
149
|
+
|
150
|
+
images_with_extra_info = [(images[index], ocr, dataset._lang) for index in range(len(images))]
|
153
151
|
|
154
152
|
if len(images) >= MIN_BATCH_INFERENCE_SIZE:
|
155
153
|
batch_size = MIN_BATCH_INFERENCE_SIZE
|
@@ -158,8 +156,8 @@ def doc_analyze(
|
|
158
156
|
batch_images = [images_with_extra_info]
|
159
157
|
|
160
158
|
results = []
|
161
|
-
for
|
162
|
-
|
159
|
+
for batch_image in batch_images:
|
160
|
+
result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
|
163
161
|
results.extend(result)
|
164
162
|
|
165
163
|
model_json = []
|
@@ -181,7 +179,7 @@ def doc_analyze(
|
|
181
179
|
|
182
180
|
def batch_doc_analyze(
|
183
181
|
datasets: list[Dataset],
|
184
|
-
parse_method: str,
|
182
|
+
parse_method: str = 'auto',
|
185
183
|
show_log: bool = False,
|
186
184
|
lang=None,
|
187
185
|
layout_model=None,
|
@@ -190,30 +188,37 @@ def batch_doc_analyze(
|
|
190
188
|
):
|
191
189
|
MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
|
192
190
|
batch_size = MIN_BATCH_INFERENCE_SIZE
|
193
|
-
images = []
|
194
191
|
page_wh_list = []
|
195
192
|
|
196
193
|
images_with_extra_info = []
|
197
194
|
for dataset in datasets:
|
198
|
-
for index in range(len(dataset)):
|
199
|
-
if lang is None or lang == 'auto':
|
200
|
-
_lang = dataset._lang
|
201
|
-
else:
|
202
|
-
_lang = lang
|
203
195
|
|
196
|
+
ocr = False
|
197
|
+
if parse_method == 'auto':
|
198
|
+
if dataset.classify() == SupportedPdfParseMethod.TXT:
|
199
|
+
ocr = False
|
200
|
+
elif dataset.classify() == SupportedPdfParseMethod.OCR:
|
201
|
+
ocr = True
|
202
|
+
elif parse_method == 'ocr':
|
203
|
+
ocr = True
|
204
|
+
elif parse_method == 'txt':
|
205
|
+
ocr = False
|
206
|
+
|
207
|
+
_lang = dataset._lang
|
208
|
+
|
209
|
+
for index in range(len(dataset)):
|
204
210
|
page_data = dataset.get_page(index)
|
205
211
|
img_dict = page_data.get_image()
|
206
|
-
images.append(img_dict['img'])
|
207
212
|
page_wh_list.append((img_dict['width'], img_dict['height']))
|
208
|
-
|
209
|
-
images_with_extra_info.append((images[-1], dataset.classify() == SupportedPdfParseMethod.OCR, _lang))
|
210
|
-
else:
|
211
|
-
images_with_extra_info.append((images[-1], parse_method == 'ocr', _lang))
|
213
|
+
images_with_extra_info.append((img_dict['img'], ocr, _lang))
|
212
214
|
|
213
215
|
batch_images = [images_with_extra_info[i:i+batch_size] for i in range(0, len(images_with_extra_info), batch_size)]
|
214
216
|
results = []
|
215
|
-
|
216
|
-
|
217
|
+
processed_images_count = 0
|
218
|
+
for index, batch_image in enumerate(batch_images):
|
219
|
+
processed_images_count += len(batch_image)
|
220
|
+
logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
|
221
|
+
result = may_batch_image_analyze(batch_image, True, show_log, layout_model, formula_enable, table_enable)
|
217
222
|
results.extend(result)
|
218
223
|
|
219
224
|
infer_results = []
|
@@ -233,7 +238,6 @@ def batch_doc_analyze(
|
|
233
238
|
|
234
239
|
def may_batch_image_analyze(
|
235
240
|
images_with_extra_info: list[(np.ndarray, bool, str)],
|
236
|
-
idx: int,
|
237
241
|
ocr: bool,
|
238
242
|
show_log: bool = False,
|
239
243
|
layout_model=None,
|
@@ -291,4 +295,4 @@ def may_batch_image_analyze(
|
|
291
295
|
# f'doc analyze time: {round(time.time() - doc_analyze_start, 2)},'
|
292
296
|
# f' speed: {doc_analyze_speed} pages/second'
|
293
297
|
# )
|
294
|
-
return
|
298
|
+
return results
|
@@ -29,22 +29,204 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
|
|
29
29
|
return return_image, return_list
|
30
30
|
|
31
31
|
|
32
|
-
|
33
|
-
|
32
|
+
def get_coords_and_area(table):
|
33
|
+
"""Extract coordinates and area from a table."""
|
34
|
+
xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
|
35
|
+
xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
|
36
|
+
area = (xmax - xmin) * (ymax - ymin)
|
37
|
+
return xmin, ymin, xmax, ymax, area
|
38
|
+
|
39
|
+
|
40
|
+
def calculate_intersection(box1, box2):
|
41
|
+
"""Calculate intersection coordinates between two boxes."""
|
42
|
+
intersection_xmin = max(box1[0], box2[0])
|
43
|
+
intersection_ymin = max(box1[1], box2[1])
|
44
|
+
intersection_xmax = min(box1[2], box2[2])
|
45
|
+
intersection_ymax = min(box1[3], box2[3])
|
46
|
+
|
47
|
+
# Check if intersection is valid
|
48
|
+
if intersection_xmax <= intersection_xmin or intersection_ymax <= intersection_ymin:
|
49
|
+
return None
|
50
|
+
|
51
|
+
return intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax
|
52
|
+
|
53
|
+
|
54
|
+
def calculate_iou(box1, box2):
|
55
|
+
"""Calculate IoU between two boxes."""
|
56
|
+
intersection = calculate_intersection(box1[:4], box2[:4])
|
57
|
+
|
58
|
+
if not intersection:
|
59
|
+
return 0
|
60
|
+
|
61
|
+
intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
|
62
|
+
intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
|
63
|
+
|
64
|
+
area1, area2 = box1[4], box2[4]
|
65
|
+
union_area = area1 + area2 - intersection_area
|
66
|
+
|
67
|
+
return intersection_area / union_area if union_area > 0 else 0
|
68
|
+
|
69
|
+
|
70
|
+
def is_inside(small_box, big_box, overlap_threshold=0.8):
|
71
|
+
"""Check if small_box is inside big_box by at least overlap_threshold."""
|
72
|
+
intersection = calculate_intersection(small_box[:4], big_box[:4])
|
73
|
+
|
74
|
+
if not intersection:
|
75
|
+
return False
|
76
|
+
|
77
|
+
intersection_xmin, intersection_ymin, intersection_xmax, intersection_ymax = intersection
|
78
|
+
intersection_area = (intersection_xmax - intersection_xmin) * (intersection_ymax - intersection_ymin)
|
79
|
+
|
80
|
+
# Check if overlap exceeds threshold
|
81
|
+
return intersection_area >= overlap_threshold * small_box[4]
|
82
|
+
|
83
|
+
|
84
|
+
def do_overlap(box1, box2):
|
85
|
+
"""Check if two boxes overlap."""
|
86
|
+
return calculate_intersection(box1[:4], box2[:4]) is not None
|
87
|
+
|
88
|
+
|
89
|
+
def merge_high_iou_tables(table_res_list, layout_res, table_indices, iou_threshold=0.7):
|
90
|
+
"""Merge tables with IoU > threshold."""
|
91
|
+
if len(table_res_list) < 2:
|
92
|
+
return table_res_list, table_indices
|
93
|
+
|
94
|
+
table_info = [get_coords_and_area(table) for table in table_res_list]
|
95
|
+
merged = True
|
96
|
+
|
97
|
+
while merged:
|
98
|
+
merged = False
|
99
|
+
i = 0
|
100
|
+
while i < len(table_res_list) - 1:
|
101
|
+
j = i + 1
|
102
|
+
while j < len(table_res_list):
|
103
|
+
iou = calculate_iou(table_info[i], table_info[j])
|
104
|
+
|
105
|
+
if iou > iou_threshold:
|
106
|
+
# Merge tables by taking their union
|
107
|
+
x1_min, y1_min, x1_max, y1_max, _ = table_info[i]
|
108
|
+
x2_min, y2_min, x2_max, y2_max, _ = table_info[j]
|
109
|
+
|
110
|
+
union_xmin = min(x1_min, x2_min)
|
111
|
+
union_ymin = min(y1_min, y2_min)
|
112
|
+
union_xmax = max(x1_max, x2_max)
|
113
|
+
union_ymax = max(y1_max, y2_max)
|
114
|
+
|
115
|
+
# Create merged table
|
116
|
+
merged_table = table_res_list[i].copy()
|
117
|
+
merged_table['poly'][0] = union_xmin
|
118
|
+
merged_table['poly'][1] = union_ymin
|
119
|
+
merged_table['poly'][2] = union_xmax
|
120
|
+
merged_table['poly'][3] = union_ymin
|
121
|
+
merged_table['poly'][4] = union_xmax
|
122
|
+
merged_table['poly'][5] = union_ymax
|
123
|
+
merged_table['poly'][6] = union_xmin
|
124
|
+
merged_table['poly'][7] = union_ymax
|
125
|
+
|
126
|
+
# Update layout_res
|
127
|
+
to_remove = [table_indices[j], table_indices[i]]
|
128
|
+
for idx in sorted(to_remove, reverse=True):
|
129
|
+
del layout_res[idx]
|
130
|
+
layout_res.append(merged_table)
|
131
|
+
|
132
|
+
# Update tracking lists
|
133
|
+
table_indices = [k if k < min(to_remove) else
|
134
|
+
k - 1 if k < max(to_remove) else
|
135
|
+
k - 2 if k > max(to_remove) else
|
136
|
+
len(layout_res) - 1
|
137
|
+
for k in table_indices
|
138
|
+
if k not in to_remove]
|
139
|
+
table_indices.append(len(layout_res) - 1)
|
140
|
+
|
141
|
+
# Update table lists
|
142
|
+
table_res_list.pop(j)
|
143
|
+
table_res_list.pop(i)
|
144
|
+
table_res_list.append(merged_table)
|
145
|
+
|
146
|
+
# Update table_info
|
147
|
+
table_info = [get_coords_and_area(table) for table in table_res_list]
|
148
|
+
|
149
|
+
merged = True
|
150
|
+
break
|
151
|
+
j += 1
|
152
|
+
|
153
|
+
if merged:
|
154
|
+
break
|
155
|
+
i += 1
|
156
|
+
|
157
|
+
return table_res_list, table_indices
|
158
|
+
|
159
|
+
|
160
|
+
def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0.8):
|
161
|
+
"""Remove big tables containing multiple smaller tables within them."""
|
162
|
+
if len(table_res_list) < 3:
|
163
|
+
return table_res_list
|
164
|
+
|
165
|
+
table_info = [get_coords_and_area(table) for table in table_res_list]
|
166
|
+
big_tables_idx = []
|
167
|
+
|
168
|
+
for i in range(len(table_res_list)):
|
169
|
+
# Find tables inside this one
|
170
|
+
tables_inside = [j for j in range(len(table_res_list))
|
171
|
+
if i != j and is_inside(table_info[j], table_info[i], overlap_threshold)]
|
172
|
+
|
173
|
+
# Continue if there are at least 2 tables inside
|
174
|
+
if len(tables_inside) >= 2:
|
175
|
+
# Check if inside tables overlap with each other
|
176
|
+
tables_overlap = any(do_overlap(table_info[tables_inside[idx1]], table_info[tables_inside[idx2]])
|
177
|
+
for idx1 in range(len(tables_inside))
|
178
|
+
for idx2 in range(idx1 + 1, len(tables_inside)))
|
179
|
+
|
180
|
+
# If no overlaps, check area condition
|
181
|
+
if not tables_overlap:
|
182
|
+
total_inside_area = sum(table_info[j][4] for j in tables_inside)
|
183
|
+
big_table_area = table_info[i][4]
|
184
|
+
|
185
|
+
if total_inside_area > area_threshold * big_table_area:
|
186
|
+
big_tables_idx.append(i)
|
187
|
+
|
188
|
+
return [table for i, table in enumerate(table_res_list) if i not in big_tables_idx]
|
189
|
+
|
190
|
+
|
191
|
+
def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshold=0.8, area_threshold=0.8):
|
192
|
+
"""Extract OCR, table and other regions from layout results."""
|
34
193
|
ocr_res_list = []
|
35
194
|
table_res_list = []
|
195
|
+
table_indices = []
|
36
196
|
single_page_mfdetrec_res = []
|
37
|
-
|
38
|
-
|
197
|
+
|
198
|
+
# Categorize regions
|
199
|
+
for i, res in enumerate(layout_res):
|
200
|
+
category_id = int(res['category_id'])
|
201
|
+
|
202
|
+
if category_id in [13, 14]: # Formula regions
|
39
203
|
single_page_mfdetrec_res.append({
|
40
204
|
"bbox": [int(res['poly'][0]), int(res['poly'][1]),
|
41
205
|
int(res['poly'][4]), int(res['poly'][5])],
|
42
206
|
})
|
43
|
-
elif
|
207
|
+
elif category_id in [0, 1, 2, 4, 6, 7]: # OCR regions
|
44
208
|
ocr_res_list.append(res)
|
45
|
-
elif
|
209
|
+
elif category_id == 5: # Table regions
|
46
210
|
table_res_list.append(res)
|
47
|
-
|
211
|
+
table_indices.append(i)
|
212
|
+
|
213
|
+
# Process tables: merge high IoU tables first, then filter nested tables
|
214
|
+
table_res_list, table_indices = merge_high_iou_tables(
|
215
|
+
table_res_list, layout_res, table_indices, iou_threshold)
|
216
|
+
|
217
|
+
filtered_table_res_list = filter_nested_tables(
|
218
|
+
table_res_list, overlap_threshold, area_threshold)
|
219
|
+
|
220
|
+
# Remove filtered out tables from layout_res
|
221
|
+
if len(filtered_table_res_list) < len(table_res_list):
|
222
|
+
kept_tables = set(id(table) for table in filtered_table_res_list)
|
223
|
+
to_remove = [table_indices[i] for i, table in enumerate(table_res_list)
|
224
|
+
if id(table) not in kept_tables]
|
225
|
+
|
226
|
+
for idx in sorted(to_remove, reverse=True):
|
227
|
+
del layout_res[idx]
|
228
|
+
|
229
|
+
return ocr_res_list, filtered_table_res_list, single_page_mfdetrec_res
|
48
230
|
|
49
231
|
|
50
232
|
def clean_vram(device, vram_threshold=8):
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
lang:
|
2
|
-
|
2
|
+
ch_lite:
|
3
3
|
det: ch_PP-OCRv3_det_infer.pth
|
4
4
|
rec: ch_PP-OCRv4_rec_infer.pth
|
5
5
|
dict: ppocr_keys_v1.txt
|
6
|
+
ch:
|
7
|
+
det: ch_PP-OCRv3_det_infer.pth
|
8
|
+
rec: ch_PP-OCRv4_rec_server_infer.pth
|
9
|
+
dict: ppocr_keys_v1.txt
|
6
10
|
en:
|
7
11
|
det: en_PP-OCRv3_det_infer.pth
|
8
12
|
rec: en_PP-OCRv4_rec_infer.pth
|
@@ -437,4 +437,10 @@ class TextRecognizer(BaseOCRV20):
|
|
437
437
|
index += 1
|
438
438
|
pbar.update(current_batch_size)
|
439
439
|
|
440
|
+
# Fix NaN values in recognition results
|
441
|
+
for i in range(len(rec_res)):
|
442
|
+
text, score = rec_res[i]
|
443
|
+
if isinstance(score, float) and math.isnan(score):
|
444
|
+
rec_res[i] = (text, 0.0)
|
445
|
+
|
440
446
|
return rec_res, elapse
|
@@ -35,26 +35,63 @@ class RapidTableModel(object):
|
|
35
35
|
# from rapidocr_onnxruntime import RapidOCR
|
36
36
|
# self.ocr_engine = RapidOCR()
|
37
37
|
|
38
|
-
self.ocr_model_name = "PaddleOCR"
|
38
|
+
# self.ocr_model_name = "PaddleOCR"
|
39
39
|
self.ocr_engine = ocr_engine
|
40
40
|
|
41
41
|
|
42
42
|
def predict(self, image):
|
43
|
+
bgr_image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
|
43
44
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
45
|
+
# First check the overall image aspect ratio (height/width)
|
46
|
+
img_height, img_width = bgr_image.shape[:2]
|
47
|
+
img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0
|
48
|
+
img_is_portrait = img_aspect_ratio > 1.2
|
49
|
+
|
50
|
+
if img_is_portrait:
|
51
|
+
|
52
|
+
det_res = self.ocr_engine.ocr(bgr_image, rec=False)[0]
|
53
|
+
# Check if table is rotated by analyzing text box aspect ratios
|
54
|
+
is_rotated = False
|
55
|
+
if det_res:
|
56
|
+
vertical_count = 0
|
57
|
+
|
58
|
+
for box_ocr_res in det_res:
|
59
|
+
p1, p2, p3, p4 = box_ocr_res
|
60
|
+
|
61
|
+
# Calculate width and height
|
62
|
+
width = p3[0] - p1[0]
|
63
|
+
height = p3[1] - p1[1]
|
64
|
+
|
65
|
+
aspect_ratio = width / height if height > 0 else 1.0
|
66
|
+
|
67
|
+
# Count vertical vs horizontal text boxes
|
68
|
+
if aspect_ratio < 0.8: # Taller than wide - vertical text
|
69
|
+
vertical_count += 1
|
70
|
+
# elif aspect_ratio > 1.2: # Wider than tall - horizontal text
|
71
|
+
# horizontal_count += 1
|
72
|
+
|
73
|
+
# If we have more vertical text boxes than horizontal ones,
|
74
|
+
# and vertical ones are significant, table might be rotated
|
75
|
+
if vertical_count >= len(det_res) * 0.3:
|
76
|
+
is_rotated = True
|
77
|
+
|
78
|
+
# logger.debug(f"Text orientation analysis: vertical={vertical_count}, det_res={len(det_res)}, rotated={is_rotated}")
|
79
|
+
|
80
|
+
# Rotate image if necessary
|
81
|
+
if is_rotated:
|
82
|
+
# logger.debug("Table appears to be in portrait orientation, rotating 90 degrees clockwise")
|
83
|
+
image = cv2.rotate(np.asarray(image), cv2.ROTATE_90_CLOCKWISE)
|
84
|
+
bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
|
85
|
+
|
86
|
+
# Continue with OCR on potentially rotated image
|
87
|
+
ocr_result = self.ocr_engine.ocr(bgr_image)[0]
|
88
|
+
if ocr_result:
|
89
|
+
ocr_result = [[item[0], item[1][0], item[1][1]] for item in ocr_result if
|
90
|
+
len(item) == 2 and isinstance(item[1], tuple)]
|
54
91
|
else:
|
55
|
-
logger.error("OCR model not supported")
|
56
92
|
ocr_result = None
|
57
93
|
|
94
|
+
|
58
95
|
if ocr_result:
|
59
96
|
table_results = self.table_model(np.asarray(image), ocr_result)
|
60
97
|
html_code = table_results.pred_html
|
@@ -99,11 +99,11 @@ def ocr_prepare_bboxes_for_layout_split_v2(
|
|
99
99
|
all_discarded_blocks = []
|
100
100
|
add_bboxes(discarded_blocks, BlockType.Discarded, all_discarded_blocks)
|
101
101
|
|
102
|
-
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半
|
102
|
+
"""footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的"""
|
103
103
|
footnote_blocks = []
|
104
104
|
for discarded in discarded_blocks:
|
105
105
|
x0, y0, x1, y1 = discarded['bbox']
|
106
|
-
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h
|
106
|
+
if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7):
|
107
107
|
footnote_blocks.append([x0, y0, x1, y1])
|
108
108
|
|
109
109
|
"""移除在footnote下面的任何框"""
|
magic_pdf/tools/common.py
CHANGED
@@ -109,9 +109,7 @@ def _do_parse(
|
|
109
109
|
pdf_bytes = ds._raw_data
|
110
110
|
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
|
111
111
|
|
112
|
-
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(
|
113
|
-
local_md_dir
|
114
|
-
)
|
112
|
+
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
115
113
|
image_dir = str(os.path.basename(local_image_dir))
|
116
114
|
|
117
115
|
if len(model_list) == 0:
|
@@ -317,7 +315,26 @@ def batch_do_parse(
|
|
317
315
|
|
318
316
|
infer_results = batch_doc_analyze(dss, parse_method, lang=lang, layout_model=layout_model, formula_enable=formula_enable, table_enable=table_enable)
|
319
317
|
for idx, infer_result in enumerate(infer_results):
|
320
|
-
_do_parse(
|
318
|
+
_do_parse(
|
319
|
+
output_dir = output_dir,
|
320
|
+
pdf_file_name = pdf_file_names[idx],
|
321
|
+
pdf_bytes_or_dataset = dss[idx],
|
322
|
+
model_list = infer_result.get_infer_res(),
|
323
|
+
parse_method = parse_method,
|
324
|
+
debug_able = debug_able,
|
325
|
+
f_draw_span_bbox = f_draw_span_bbox,
|
326
|
+
f_draw_layout_bbox = f_draw_layout_bbox,
|
327
|
+
f_dump_md=f_dump_md,
|
328
|
+
f_dump_middle_json=f_dump_middle_json,
|
329
|
+
f_dump_model_json=f_dump_model_json,
|
330
|
+
f_dump_orig_pdf=f_dump_orig_pdf,
|
331
|
+
f_dump_content_list=f_dump_content_list,
|
332
|
+
f_make_md_mode=MakeMode.MM_MD,
|
333
|
+
f_draw_model_bbox=f_draw_model_bbox,
|
334
|
+
f_draw_line_sort_bbox=f_draw_line_sort_bbox,
|
335
|
+
f_draw_char_bbox=f_draw_char_bbox,
|
336
|
+
lang=lang,
|
337
|
+
)
|
321
338
|
|
322
339
|
|
323
340
|
parse_pdf_methods = click.Choice(['ocr', 'txt', 'auto'])
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 1.3.
|
3
|
+
Version: 1.3.3
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
License: AGPL-3.0
|
6
6
|
Project-URL: Home, https://mineru.net/
|
@@ -23,27 +23,26 @@ Requires-Dist: numpy >=1.21.6
|
|
23
23
|
Requires-Dist: pdfminer.six ==20231228
|
24
24
|
Requires-Dist: pydantic <2.11,>=2.7.2
|
25
25
|
Requires-Dist: scikit-learn >=1.0.2
|
26
|
-
Requires-Dist: torch !=2.5.0,!=2.5.1
|
26
|
+
Requires-Dist: torch !=2.5.0,!=2.5.1,>=2.2.2
|
27
27
|
Requires-Dist: torchvision
|
28
28
|
Requires-Dist: tqdm >=4.67.1
|
29
29
|
Requires-Dist: transformers !=4.51.0,<5.0.0,>=4.49.0
|
30
30
|
Provides-Extra: full
|
31
31
|
Requires-Dist: PyYAML <7,>=6.0.2 ; extra == 'full'
|
32
|
-
Requires-Dist: dill <1,>=0.3.
|
32
|
+
Requires-Dist: dill <1,>=0.3.8 ; extra == 'full'
|
33
33
|
Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full'
|
34
34
|
Requires-Dist: ftfy <7,>=6.3.1 ; extra == 'full'
|
35
|
+
Requires-Dist: matplotlib <4,>=3.10 ; extra == 'full'
|
35
36
|
Requires-Dist: omegaconf <3,>=2.3.0 ; extra == 'full'
|
36
37
|
Requires-Dist: openai <2,>=1.70.0 ; extra == 'full'
|
37
38
|
Requires-Dist: pyclipper <2,>=1.3.0 ; extra == 'full'
|
38
39
|
Requires-Dist: rapid-table <2.0.0,>=1.0.5 ; extra == 'full'
|
39
40
|
Requires-Dist: shapely <3,>=2.0.7 ; extra == 'full'
|
40
|
-
Requires-Dist: ultralytics
|
41
|
-
Requires-Dist: matplotlib >=3.10 ; (platform_system == "Linux" or platform_system == "Darwin") and extra == 'full'
|
42
|
-
Requires-Dist: matplotlib <=3.9.0 ; (platform_system == "Windows") and extra == 'full'
|
41
|
+
Requires-Dist: ultralytics <9,>=8.3.48 ; extra == 'full'
|
43
42
|
Provides-Extra: full_old_linux
|
44
43
|
Requires-Dist: PyYAML ==6.0.2 ; extra == 'full_old_linux'
|
45
44
|
Requires-Dist: albumentations ==1.4.20 ; extra == 'full_old_linux'
|
46
|
-
Requires-Dist: dill ==0.3.
|
45
|
+
Requires-Dist: dill ==0.3.8 ; extra == 'full_old_linux'
|
47
46
|
Requires-Dist: doclayout-yolo ==0.0.2b1 ; extra == 'full_old_linux'
|
48
47
|
Requires-Dist: ftfy ==6.3.1 ; extra == 'full_old_linux'
|
49
48
|
Requires-Dist: matplotlib <=3.10.1,>=3.10 ; extra == 'full_old_linux'
|
@@ -108,9 +107,14 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
108
107
|
</div>
|
109
108
|
|
110
109
|
# Changelog
|
110
|
+
- 2025/04/12 1.3.2 released
|
111
|
+
- Fixed the issue of incompatible dependency package versions when installing in Python 3.13 environment on Windows systems.
|
112
|
+
- Optimized memory usage during batch inference.
|
113
|
+
- Improved the parsing effect of tables rotated by 90 degrees.
|
114
|
+
- Enhanced the parsing accuracy for large tables in financial report samples.
|
115
|
+
- Fixed the occasional word concatenation issue in English text areas when OCR language is not specified.(The model needs to be updated)
|
111
116
|
- 2025/04/08 1.3.1 released, fixed some compatibility issues
|
112
117
|
- Supported Python 3.13
|
113
|
-
- Resolved errors caused by `transformers 4.51.0`
|
114
118
|
- Made the final adaptation for some outdated Linux systems (e.g., CentOS 7), and no further support will be guaranteed for subsequent versions. [Installation Instructions](https://github.com/opendatalab/MinerU/issues/1004)
|
115
119
|
- 2025/04/03 1.3.0 released, in this version we made many optimizations and improvements:
|
116
120
|
- Installation and compatibility optimization
|
@@ -129,59 +133,154 @@ Easier to use: Just grab MinerU Desktop. No coding, no login, just a simple inte
|
|
129
133
|
- Usability Optimization
|
130
134
|
- By using `paddleocr2torch`, completely replaced the use of the `paddle` framework and `paddleocr` in the project, resolving conflicts between `paddle` and `torch`, as well as thread safety issues caused by the `paddle` framework.
|
131
135
|
- Added a real-time progress bar during the parsing process to accurately track progress, making the wait less painful.
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
136
|
+
<details>
|
137
|
+
<summary>2025/03/03 1.2.1 released</summary>
|
138
|
+
<ul>
|
139
|
+
<li>Fixed the impact on punctuation marks during full-width to half-width conversion of letters and numbers</li>
|
140
|
+
<li>Fixed caption matching inaccuracies in certain scenarios</li>
|
141
|
+
<li>Fixed formula span loss issues in certain scenarios</li>
|
142
|
+
</ul>
|
143
|
+
</details>
|
144
|
+
|
145
|
+
<details>
|
146
|
+
<summary>2025/02/24 1.2.0 released</summary>
|
147
|
+
<p>This version includes several fixes and improvements to enhance parsing efficiency and accuracy:</p>
|
148
|
+
<ul>
|
149
|
+
<li><strong>Performance Optimization</strong>
|
150
|
+
<ul>
|
151
|
+
<li>Increased classification speed for PDF documents in auto mode.</li>
|
152
|
+
</ul>
|
153
|
+
</li>
|
154
|
+
<li><strong>Parsing Optimization</strong>
|
155
|
+
<ul>
|
156
|
+
<li>Improved parsing logic for documents containing watermarks, significantly enhancing the parsing results for such documents.</li>
|
157
|
+
<li>Enhanced the matching logic for multiple images/tables and captions within a single page, improving the accuracy of image-text matching in complex layouts.</li>
|
158
|
+
</ul>
|
159
|
+
</li>
|
160
|
+
<li><strong>Bug Fixes</strong>
|
161
|
+
<ul>
|
162
|
+
<li>Fixed an issue where image/table spans were incorrectly filled into text blocks under certain conditions.</li>
|
163
|
+
<li>Resolved an issue where title blocks were empty in some cases.</li>
|
164
|
+
</ul>
|
165
|
+
</li>
|
166
|
+
</ul>
|
167
|
+
</details>
|
168
|
+
|
169
|
+
<details>
|
170
|
+
<summary>2025/01/22 1.1.0 released</summary>
|
171
|
+
<p>In this version we have focused on improving parsing accuracy and efficiency:</p>
|
172
|
+
<ul>
|
173
|
+
<li><strong>Model capability upgrade</strong> (requires re-executing the <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">model download process</a> to obtain incremental updates of model files)
|
174
|
+
<ul>
|
175
|
+
<li>The layout recognition model has been upgraded to the latest <code>doclayout_yolo(2501)</code> model, improving layout recognition accuracy.</li>
|
176
|
+
<li>The formula parsing model has been upgraded to the latest <code>unimernet(2501)</code> model, improving formula recognition accuracy.</li>
|
177
|
+
</ul>
|
178
|
+
</li>
|
179
|
+
<li><strong>Performance optimization</strong>
|
180
|
+
<ul>
|
181
|
+
<li>On devices that meet certain configuration requirements (16GB+ VRAM), by optimizing resource usage and restructuring the processing pipeline, overall parsing speed has been increased by more than 50%.</li>
|
182
|
+
</ul>
|
183
|
+
</li>
|
184
|
+
<li><strong>Parsing effect optimization</strong>
|
185
|
+
<ul>
|
186
|
+
<li>Added a new heading classification feature (testing version, enabled by default) to the online demo (<a href="https://mineru.net/OpenSourceTools/Extractor">mineru.net</a>/<a href="https://huggingface.co/spaces/opendatalab/MinerU">huggingface</a>/<a href="https://www.modelscope.cn/studios/OpenDataLab/MinerU">modelscope</a>), which supports hierarchical classification of headings, thereby enhancing document structuring.</li>
|
187
|
+
</ul>
|
188
|
+
</li>
|
189
|
+
</ul>
|
190
|
+
</details>
|
191
|
+
|
192
|
+
<details>
|
193
|
+
<summary>2025/01/10 1.0.1 released</summary>
|
194
|
+
<p>This is our first official release, where we have introduced a completely new API interface and enhanced compatibility through extensive refactoring, as well as a brand new automatic language identification feature:</p>
|
195
|
+
<ul>
|
196
|
+
<li><strong>New API Interface</strong>
|
197
|
+
<ul>
|
198
|
+
<li>For the data-side API, we have introduced the Dataset class, designed to provide a robust and flexible data processing framework. This framework currently supports a variety of document formats, including images (.jpg and .png), PDFs, Word documents (.doc and .docx), and PowerPoint presentations (.ppt and .pptx). It ensures effective support for data processing tasks ranging from simple to complex.</li>
|
199
|
+
<li>For the user-side API, we have meticulously designed the MinerU processing workflow as a series of composable Stages. Each Stage represents a specific processing step, allowing users to define new Stages according to their needs and creatively combine these stages to customize their data processing workflows.</li>
|
200
|
+
</ul>
|
201
|
+
</li>
|
202
|
+
<li><strong>Enhanced Compatibility</strong>
|
203
|
+
<ul>
|
204
|
+
<li>By optimizing the dependency environment and configuration items, we ensure stable and efficient operation on ARM architecture Linux systems.</li>
|
205
|
+
<li>We have deeply integrated with Huawei Ascend NPU acceleration, providing autonomous and controllable high-performance computing capabilities. This supports the localization and development of AI application platforms in China. <a href="https://github.com/opendatalab/MinerU/blob/master/docs/README_Ascend_NPU_Acceleration_zh_CN.md">Ascend NPU Acceleration</a></li>
|
206
|
+
</ul>
|
207
|
+
</li>
|
208
|
+
<li><strong>Automatic Language Identification</strong>
|
209
|
+
<ul>
|
210
|
+
<li>By introducing a new language recognition model, setting the <code>lang</code> configuration to <code>auto</code> during document parsing will automatically select the appropriate OCR language model, improving the accuracy of scanned document parsing.</li>
|
211
|
+
</ul>
|
212
|
+
</li>
|
213
|
+
</ul>
|
214
|
+
</details>
|
215
|
+
|
216
|
+
<details>
|
217
|
+
<summary>2024/11/22 0.10.0 released</summary>
|
218
|
+
<p>Introducing hybrid OCR text extraction capabilities:</p>
|
219
|
+
<ul>
|
220
|
+
<li>Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.</li>
|
221
|
+
<li>Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.</li>
|
222
|
+
</ul>
|
223
|
+
</details>
|
224
|
+
|
225
|
+
<details>
|
226
|
+
<summary>2024/11/15 0.9.3 released</summary>
|
227
|
+
<p>Integrated <a href="https://github.com/RapidAI/RapidTable">RapidTable</a> for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.</p>
|
228
|
+
</details>
|
229
|
+
|
230
|
+
<details>
|
231
|
+
<summary>2024/11/06 0.9.2 released</summary>
|
232
|
+
<p>Integrated the <a href="https://huggingface.co/U4R/StructTable-InternVL2-1B">StructTable-InternVL2-1B</a> model for table recognition functionality.</p>
|
233
|
+
</details>
|
234
|
+
|
235
|
+
<details>
|
236
|
+
<summary>2024/10/31 0.9.0 released</summary>
|
237
|
+
<p>This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:</p>
|
238
|
+
<ul>
|
239
|
+
<li>Refactored the sorting module code to use <a href="https://github.com/ppaanngggg/layoutreader">layoutreader</a> for reading order sorting, ensuring high accuracy in various layouts.</li>
|
240
|
+
<li>Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.</li>
|
241
|
+
<li>Refactored the list and table of contents recognition functions, significantly improving the accuracy of list blocks and table of contents blocks, as well as the parsing of corresponding text paragraphs.</li>
|
242
|
+
<li>Refactored the matching logic for figures, tables, and descriptive text, greatly enhancing the accuracy of matching captions and footnotes to figures and tables, and reducing the loss rate of descriptive text to near zero.</li>
|
243
|
+
<li>Added multi-language support for OCR, supporting detection and recognition of 84 languages. For the list of supported languages, see <a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/ppocr/blog/multi_languages.html#5-support-languages-and-abbreviations">OCR Language Support List</a>.</li>
|
244
|
+
<li>Added memory recycling logic and other memory optimization measures, significantly reducing memory usage. The memory requirement for enabling all acceleration features except table acceleration (layout/formula/OCR) has been reduced from 16GB to 8GB, and the memory requirement for enabling all acceleration features has been reduced from 24GB to 10GB.</li>
|
245
|
+
<li>Optimized configuration file feature switches, adding an independent formula detection switch to significantly improve speed and parsing results when formula detection is not needed.</li>
|
246
|
+
<li>Integrated <a href="https://github.com/opendatalab/PDF-Extract-Kit">PDF-Extract-Kit 1.0</a>:
|
247
|
+
<ul>
|
248
|
+
<li>Added the self-developed <code>doclayout_yolo</code> model, which speeds up processing by more than 10 times compared to the original solution while maintaining similar parsing effects, and can be freely switched with <code>layoutlmv3</code> via the configuration file.</li>
|
249
|
+
<li>Upgraded formula parsing to <code>unimernet 0.2.1</code>, improving formula parsing accuracy while significantly reducing memory usage.</li>
|
250
|
+
<li>Due to the repository change for <code>PDF-Extract-Kit 1.0</code>, you need to re-download the model. Please refer to <a href="https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_en.md">How to Download Models</a> for detailed steps.</li>
|
251
|
+
</ul>
|
252
|
+
</li>
|
253
|
+
</ul>
|
254
|
+
</details>
|
255
|
+
|
256
|
+
<details>
|
257
|
+
<summary>2024/09/27 Version 0.8.1 released</summary>
|
258
|
+
<p>Fixed some bugs and provided a <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web_demo/README.md">localized deployment version</a> of the <a href="https://opendatalab.com/OpenSourceTools/Extractor/PDF/">online demo</a> and the <a href="https://github.com/opendatalab/MinerU/blob/master/projects/web/README.md">front-end interface</a>.</p>
|
259
|
+
</details>
|
260
|
+
|
261
|
+
<details>
|
262
|
+
<summary>2024/09/09 Version 0.8.0 released</summary>
|
263
|
+
<p>Added support for fast deployment with a Dockerfile and launched demos on Huggingface and Modelscope.</p>
|
264
|
+
</details>
|
265
|
+
|
266
|
+
<details>
|
267
|
+
<summary>2024/08/30 Version 0.7.1 released</summary>
|
268
|
+
<p>Added the paddle tablemaster table recognition option.</p>
|
269
|
+
</details>
|
270
|
+
|
271
|
+
<details>
|
272
|
+
<summary>2024/08/09 Version 0.7.0b1 released</summary>
|
273
|
+
<p>Simplified installation process, added table recognition functionality</p>
|
274
|
+
</details>
|
275
|
+
|
276
|
+
<details>
|
277
|
+
<summary>2024/08/01 Version 0.6.2b1 released</summary>
|
278
|
+
<p>Optimized dependency conflict issues and installation documentation</p>
|
279
|
+
</details>
|
280
|
+
|
281
|
+
<details>
|
282
|
+
<summary>2024/07/05 Initial open-source release</summary>
|
283
|
+
</details>
|
185
284
|
|
186
285
|
<!-- TABLE OF CONTENT -->
|
187
286
|
|
@@ -10,8 +10,8 @@ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLx
|
|
10
10
|
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
11
11
|
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
12
12
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
13
|
-
magic_pdf/data/batch_build_dataset.py,sha256=
|
14
|
-
magic_pdf/data/dataset.py,sha256=
|
13
|
+
magic_pdf/data/batch_build_dataset.py,sha256=KQoWFJDqCwRQug8-fTuciSwff58AYRjCNP6GdiDhxLI,4953
|
14
|
+
magic_pdf/data/dataset.py,sha256=2v-a7kA6dRUDQpjlAVE5We1tMATR-MYKzQCcBhNci5g,12258
|
15
15
|
magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
|
16
16
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
17
17
|
magic_pdf/data/utils.py,sha256=dNWIJECPXaakKocI4z5Tq6vhDDSnR-bVWQV7DO2w_A8,5335
|
@@ -52,17 +52,17 @@ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3
|
|
52
52
|
magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
|
53
53
|
magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
|
54
54
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
55
|
-
magic_pdf/libs/version.py,sha256
|
55
|
+
magic_pdf/libs/version.py,sha256=Vi6om3KImlKsS_Wg5CjUgYffoi2zx7T-SRPnnGL0G7M,22
|
56
56
|
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
57
|
-
magic_pdf/model/batch_analyze.py,sha256=
|
58
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256
|
57
|
+
magic_pdf/model/batch_analyze.py,sha256=yKhKQuZTh9GG83p61bw2BRqKMbnsjsmX73gfuTRk8xE,11272
|
58
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=-cjn7DQi6kZCqVZ0IxbXuL2kmeGhSVLzLaezIHPFzMU,10317
|
59
59
|
magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
|
60
60
|
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
61
61
|
magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
|
62
62
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
63
63
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
64
|
magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
|
65
|
-
magic_pdf/model/sub_modules/model_utils.py,sha256=
|
65
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=iNC-zuDLWkwUAwMZ0YcGxAwHn5SAAFRdZBQgTy9nmgY,9880
|
66
66
|
magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
67
67
|
magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
|
68
68
|
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
|
@@ -141,7 +141,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_post
|
|
141
141
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
|
142
142
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
143
143
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=eEzg5D5L3MHFL4H02gZnxdDiqtSCUzZDnt5pqDAmgCI,6980
|
144
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=
|
144
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=M0vyAENxKIaPaSdRBDhH8ik5V71vcY1STkZoq-3iqD8,1504
|
145
145
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
|
146
146
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
|
147
147
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
|
@@ -158,7 +158,7 @@ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR6
|
|
158
158
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
159
159
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py,sha256=8RmKl1vejnZl65caHZNV2ta6hMsg5B_LE-FuqCO8T8A,4225
|
160
160
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py,sha256=cRBKE0blzryj3Ar6yM0FKKgxmZdgMc44NDNl1S2wiRs,9136
|
161
|
-
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=
|
161
|
+
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=GZ1PhVZ6GCPedgzU02e4pC52jHPf7uNI1GTID2CkMHA,19444
|
162
162
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-fkTS8swRYSbZeoqmSI8,3822
|
163
163
|
magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
|
164
164
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -168,7 +168,7 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
|
|
168
168
|
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
169
169
|
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
170
170
|
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
171
|
-
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=
|
171
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=6W6qBNFZ_ETyk7B7Figk2ekPT3YgM_CUGWlAbdJC6dQ,4399
|
172
172
|
magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
|
173
173
|
magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
|
174
174
|
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
@@ -178,7 +178,7 @@ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0
|
|
178
178
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
179
179
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
180
180
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
181
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=
|
181
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=3_bEbZC_BDwbuaBLPdCIbkxz93-g9oCtvjuXD8qbklo,9330
|
182
182
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=PscKGF0uJIjMxZRM69FLUs1SZO_wOswDQQV1f0M2xAo,5627
|
183
183
|
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=bs5RLvk4kIyx9_Hqq0FU3AGPPxE8Sxs97Uwlf1sBryM,4725
|
184
184
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
|
@@ -191,13 +191,13 @@ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,
|
|
191
191
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
192
192
|
magic_pdf/tools/cli.py,sha256=_oa-M5Hcopa5RZudVzrEip2W8pa9422Lmat7tMBJO5M,5171
|
193
193
|
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
194
|
-
magic_pdf/tools/common.py,sha256
|
194
|
+
magic_pdf/tools/common.py,sha256=-x0RSFr7SNbdYq7DntaLYmQmaxyF-xKSf4xMpSUTzA0,12623
|
195
195
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
196
196
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
197
197
|
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
198
|
-
magic_pdf-1.3.
|
199
|
-
magic_pdf-1.3.
|
200
|
-
magic_pdf-1.3.
|
201
|
-
magic_pdf-1.3.
|
202
|
-
magic_pdf-1.3.
|
203
|
-
magic_pdf-1.3.
|
198
|
+
magic_pdf-1.3.3.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
199
|
+
magic_pdf-1.3.3.dist-info/METADATA,sha256=1Y-a4UouLQRhsldrhz6UZLlx4KUFOdjSk5R1gK_oYjs,45615
|
200
|
+
magic_pdf-1.3.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
201
|
+
magic_pdf-1.3.3.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
202
|
+
magic_pdf-1.3.3.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
203
|
+
magic_pdf-1.3.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|