magic-pdf 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/libs/boxbase.py +19 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/magic_model.py +96 -45
- magic_pdf/tools/common.py +1 -1
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +7 -4
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +10 -10
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.8.0.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/libs/boxbase.py
CHANGED
@@ -426,3 +426,22 @@ def bbox_distance(bbox1, bbox2):
|
|
426
426
|
elif top:
|
427
427
|
return y2 - y1b
|
428
428
|
return 0.0
|
429
|
+
|
430
|
+
|
431
|
+
def box_area(bbox):
|
432
|
+
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
433
|
+
|
434
|
+
|
435
|
+
def get_overlap_area(bbox1, bbox2):
|
436
|
+
"""计算box1和box2的重叠面积占bbox1的比例."""
|
437
|
+
# Determine the coordinates of the intersection rectangle
|
438
|
+
x_left = max(bbox1[0], bbox2[0])
|
439
|
+
y_top = max(bbox1[1], bbox2[1])
|
440
|
+
x_right = min(bbox1[2], bbox2[2])
|
441
|
+
y_bottom = min(bbox1[3], bbox2[3])
|
442
|
+
|
443
|
+
if x_right < x_left or y_bottom < y_top:
|
444
|
+
return 0.0
|
445
|
+
|
446
|
+
# The area of overlap area
|
447
|
+
return (x_right - x_left) * (y_bottom - y_top)
|
magic_pdf/libs/version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.8.0"
|
1
|
+
__version__ = "0.8.1"
|
magic_pdf/model/magic_model.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
import json
|
2
2
|
|
3
3
|
from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
|
4
|
-
bbox_relative_pos, calculate_iou,
|
5
|
-
calculate_overlap_area_in_bbox1_area_ratio
|
4
|
+
bbox_relative_pos, box_area, calculate_iou,
|
5
|
+
calculate_overlap_area_in_bbox1_area_ratio,
|
6
|
+
get_overlap_area)
|
6
7
|
from magic_pdf.libs.commons import fitz, join_path
|
7
8
|
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
8
9
|
from magic_pdf.libs.local_math import float_gt
|
@@ -12,6 +13,7 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
|
12
13
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
13
14
|
|
14
15
|
CAPATION_OVERLAP_AREA_RATIO = 0.6
|
16
|
+
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
|
15
17
|
|
16
18
|
|
17
19
|
class MagicModel:
|
@@ -124,49 +126,51 @@ class MagicModel:
|
|
124
126
|
tables.append(obj)
|
125
127
|
if len(footnotes) * len(figures) == 0:
|
126
128
|
continue
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
)
|
129
|
+
dis_figure_footnote = {}
|
130
|
+
dis_table_footnote = {}
|
131
|
+
|
132
|
+
for i in range(len(footnotes)):
|
133
|
+
for j in range(len(figures)):
|
134
|
+
pos_flag_count = sum(
|
135
|
+
list(
|
136
|
+
map(
|
137
|
+
lambda x: 1 if x else 0,
|
138
|
+
bbox_relative_pos(
|
139
|
+
footnotes[i]['bbox'], figures[j]['bbox']
|
140
|
+
),
|
140
141
|
)
|
141
142
|
)
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
)
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
)
|
143
|
+
)
|
144
|
+
if pos_flag_count > 1:
|
145
|
+
continue
|
146
|
+
dis_figure_footnote[i] = min(
|
147
|
+
bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
|
148
|
+
dis_figure_footnote.get(i, float('inf')),
|
149
|
+
)
|
150
|
+
for i in range(len(footnotes)):
|
151
|
+
for j in range(len(tables)):
|
152
|
+
pos_flag_count = sum(
|
153
|
+
list(
|
154
|
+
map(
|
155
|
+
lambda x: 1 if x else 0,
|
156
|
+
bbox_relative_pos(
|
157
|
+
footnotes[i]['bbox'], tables[j]['bbox']
|
158
|
+
),
|
158
159
|
)
|
159
160
|
)
|
160
|
-
|
161
|
-
|
161
|
+
)
|
162
|
+
if pos_flag_count > 1:
|
163
|
+
continue
|
162
164
|
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
165
|
+
dis_table_footnote[i] = min(
|
166
|
+
bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
|
167
|
+
dis_table_footnote.get(i, float('inf')),
|
168
|
+
)
|
169
|
+
for i in range(len(footnotes)):
|
170
|
+
if i not in dis_figure_footnote:
|
171
|
+
continue
|
172
|
+
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
|
173
|
+
footnotes[i]['category_id'] = CategoryId.ImageFootnote
|
170
174
|
|
171
175
|
def __reduct_overlap(self, bboxes):
|
172
176
|
N = len(bboxes)
|
@@ -191,6 +195,44 @@ class MagicModel:
|
|
191
195
|
筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
|
192
196
|
再求出筛选出的 subjects 和 object 的最短距离
|
193
197
|
"""
|
198
|
+
def search_overlap_between_boxes(
|
199
|
+
subject_idx, object_idx
|
200
|
+
):
|
201
|
+
idxes = [subject_idx, object_idx]
|
202
|
+
x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
|
203
|
+
y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
|
204
|
+
x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
|
205
|
+
y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
|
206
|
+
|
207
|
+
merged_bbox = [
|
208
|
+
min(x0s),
|
209
|
+
min(y0s),
|
210
|
+
max(x1s),
|
211
|
+
max(y1s),
|
212
|
+
]
|
213
|
+
ratio = 0
|
214
|
+
|
215
|
+
other_objects = list(
|
216
|
+
map(
|
217
|
+
lambda x: {'bbox': x['bbox'], 'score': x['score']},
|
218
|
+
filter(
|
219
|
+
lambda x: x['category_id']
|
220
|
+
not in (object_category_id, subject_category_id),
|
221
|
+
self.__model_list[page_no]['layout_dets'],
|
222
|
+
),
|
223
|
+
)
|
224
|
+
)
|
225
|
+
for other_object in other_objects:
|
226
|
+
ratio = max(
|
227
|
+
ratio,
|
228
|
+
get_overlap_area(
|
229
|
+
merged_bbox, other_object['bbox']
|
230
|
+
) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
|
231
|
+
)
|
232
|
+
if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
233
|
+
break
|
234
|
+
|
235
|
+
return ratio
|
194
236
|
|
195
237
|
def may_find_other_nearest_bbox(subject_idx, object_idx):
|
196
238
|
ret = float('inf')
|
@@ -299,6 +341,15 @@ class MagicModel:
|
|
299
341
|
):
|
300
342
|
continue
|
301
343
|
|
344
|
+
subject_idx, object_idx = i, j
|
345
|
+
if all_bboxes[j]['category_id'] == subject_category_id:
|
346
|
+
subject_idx, object_idx = j, i
|
347
|
+
|
348
|
+
if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
|
349
|
+
dis[i][j] = float('inf')
|
350
|
+
dis[j][i] = dis[i][j]
|
351
|
+
continue
|
352
|
+
|
302
353
|
dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
|
303
354
|
dis[j][i] = dis[i][j]
|
304
355
|
|
@@ -627,13 +678,13 @@ class MagicModel:
|
|
627
678
|
span['type'] = ContentType.Image
|
628
679
|
elif category_id == 5:
|
629
680
|
# 获取table模型结果
|
630
|
-
latex = layout_det.get(
|
631
|
-
html = layout_det.get(
|
681
|
+
latex = layout_det.get('latex', None)
|
682
|
+
html = layout_det.get('html', None)
|
632
683
|
if latex:
|
633
|
-
span[
|
684
|
+
span['latex'] = latex
|
634
685
|
elif html:
|
635
|
-
span[
|
636
|
-
span[
|
686
|
+
span['html'] = html
|
687
|
+
span['type'] = ContentType.Table
|
637
688
|
elif category_id == 13:
|
638
689
|
span['content'] = layout_det['latex']
|
639
690
|
span['type'] = ContentType.InlineEquation
|
magic_pdf/tools/common.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: magic-pdf
|
3
|
-
Version: 0.8.0
|
3
|
+
Version: 0.8.1
|
4
4
|
Summary: A practical tool for converting PDF to Markdown
|
5
5
|
Home-page: https://github.com/opendatalab/MinerU
|
6
6
|
Requires-Python: >=3.9
|
@@ -49,8 +49,9 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
|
|
49
49
|
[](https://pepy.tech/project/magic-pdf)
|
50
50
|
[](https://pepy.tech/project/magic-pdf)
|
51
51
|
|
52
|
-
[](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
|
53
|
+
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
54
|
+
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
54
55
|
[](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
|
55
56
|
[](#)
|
56
57
|
|
@@ -214,7 +215,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
|
|
214
215
|
|
215
216
|
### Online Demo
|
216
217
|
|
217
|
-
[
|
218
|
+
[](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
|
219
|
+
[](https://huggingface.co/spaces/opendatalab/MinerU)
|
220
|
+
[](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
|
218
221
|
|
219
222
|
### Quick CPU Demo
|
220
223
|
|
@@ -24,7 +24,7 @@ magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,7
|
|
24
24
|
magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
|
25
25
|
magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
|
26
26
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
magic_pdf/libs/boxbase.py,sha256=
|
27
|
+
magic_pdf/libs/boxbase.py,sha256=YGIVYWBHyBSopcTxd5e3FVw6QQUnRMzyYgFJmcEl4Hc,15842
|
28
28
|
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
29
29
|
magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
|
30
30
|
magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-NsE,2505
|
@@ -46,11 +46,11 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
|
|
46
46
|
magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
|
47
47
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
48
48
|
magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
|
49
|
-
magic_pdf/libs/version.py,sha256=
|
49
|
+
magic_pdf/libs/version.py,sha256=Ocl79hbbH8_jdr5dGC90VR1cAvZc05Rc0tkZttUnMjo,22
|
50
50
|
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
51
51
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
52
52
|
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=Ht1zZAB4WV3b-AWdRV5m5fuidlD6ZhNkvJM0j9i1l_E,4809
|
53
|
-
magic_pdf/model/magic_model.py,sha256=
|
53
|
+
magic_pdf/model/magic_model.py,sha256=afSd9D0S31uEAseY98rJrL9BybBMeSMm5jbeWoWuWWo,30694
|
54
54
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
55
55
|
magic_pdf/model/pdf_extract_kit.py,sha256=Bdxqo3AGXs0VByFVj6ZEOm4T6wXTZwkZsRRFtxasNQM,17901
|
56
56
|
magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
|
@@ -145,10 +145,10 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
|
|
145
145
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
146
146
|
magic_pdf/tools/cli.py,sha256=tUeJhGudJIrCDMNQDRTWtNGE_4E0TWpKTuEkf5y_2uk,2734
|
147
147
|
magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
|
148
|
-
magic_pdf/tools/common.py,sha256=
|
149
|
-
magic_pdf-0.8.
|
150
|
-
magic_pdf-0.8.
|
151
|
-
magic_pdf-0.8.
|
152
|
-
magic_pdf-0.8.
|
153
|
-
magic_pdf-0.8.
|
154
|
-
magic_pdf-0.8.
|
148
|
+
magic_pdf/tools/common.py,sha256=2KsqN0rNcuyt9B3vAoF-HeeiwdJbO3iO5VE1zBZ1VCw,4859
|
149
|
+
magic_pdf-0.8.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
|
150
|
+
magic_pdf-0.8.1.dist-info/METADATA,sha256=fWU35cJWoz62IWXDGIC3PsTDIz_vgV_orr1eEjvjjaQ,34142
|
151
|
+
magic_pdf-0.8.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
|
152
|
+
magic_pdf-0.8.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
153
|
+
magic_pdf-0.8.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
154
|
+
magic_pdf-0.8.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|