magic-pdf 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
magic_pdf/libs/boxbase.py CHANGED
@@ -426,3 +426,22 @@ def bbox_distance(bbox1, bbox2):
426
426
  elif top:
427
427
  return y2 - y1b
428
428
  return 0.0
429
+
430
+
431
def box_area(bbox):
    """Return the area of an axis-aligned box.

    Args:
        bbox: Indexable of at least four numbers read as
            ``[x0, y0, x1, y1]``.

    Returns:
        The product ``(x1 - x0) * (y1 - y0)``. No ordering of the corners is
        enforced, so the value is simply the product of the signed extents.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
433
+
434
+
435
def get_overlap_area(bbox1, bbox2):
    """Return the area of the intersection of ``bbox1`` and ``bbox2``.

    Both boxes are indexed as ``[x0, y0, x1, y1]``. The result is an absolute
    area, not a ratio; callers that need a ratio must divide it themselves.

    Returns:
        ``0.0`` when the boxes do not intersect, otherwise the width times
        the height of the intersection rectangle (which is zero when the
        boxes merely touch along an edge or corner).
    """
    # Extents of the intersection rectangle along each axis; a negative
    # extent means the boxes are disjoint on that axis.
    overlap_width = min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0])
    overlap_height = min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1])

    if overlap_width < 0 or overlap_height < 0:
        return 0.0

    return overlap_width * overlap_height
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.8.0"
1
+ __version__ = "0.8.1"
@@ -1,8 +1,9 @@
1
1
  import json
2
2
 
3
3
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
4
- bbox_relative_pos, calculate_iou,
5
- calculate_overlap_area_in_bbox1_area_ratio)
4
+ bbox_relative_pos, box_area, calculate_iou,
5
+ calculate_overlap_area_in_bbox1_area_ratio,
6
+ get_overlap_area)
6
7
  from magic_pdf.libs.commons import fitz, join_path
7
8
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
8
9
  from magic_pdf.libs.local_math import float_gt
@@ -12,6 +13,7 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
12
13
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
13
14
 
14
15
  CAPATION_OVERLAP_AREA_RATIO = 0.6
16
+ MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
15
17
 
16
18
 
17
19
  class MagicModel:
@@ -124,49 +126,51 @@ class MagicModel:
124
126
  tables.append(obj)
125
127
  if len(footnotes) * len(figures) == 0:
126
128
  continue
127
- dis_figure_footnote = {}
128
- dis_table_footnote = {}
129
-
130
- for i in range(len(footnotes)):
131
- for j in range(len(figures)):
132
- pos_flag_count = sum(
133
- list(
134
- map(
135
- lambda x: 1 if x else 0,
136
- bbox_relative_pos(
137
- footnotes[i]['bbox'], figures[j]['bbox']
138
- ),
139
- )
129
+ dis_figure_footnote = {}
130
+ dis_table_footnote = {}
131
+
132
+ for i in range(len(footnotes)):
133
+ for j in range(len(figures)):
134
+ pos_flag_count = sum(
135
+ list(
136
+ map(
137
+ lambda x: 1 if x else 0,
138
+ bbox_relative_pos(
139
+ footnotes[i]['bbox'], figures[j]['bbox']
140
+ ),
140
141
  )
141
142
  )
142
- if pos_flag_count > 1:
143
- continue
144
- dis_figure_footnote[i] = min(
145
- bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
146
- dis_figure_footnote.get(i, float('inf')),
147
- )
148
- for i in range(len(footnotes)):
149
- for j in range(len(tables)):
150
- pos_flag_count = sum(
151
- list(
152
- map(
153
- lambda x: 1 if x else 0,
154
- bbox_relative_pos(
155
- footnotes[i]['bbox'], tables[j]['bbox']
156
- ),
157
- )
143
+ )
144
+ if pos_flag_count > 1:
145
+ continue
146
+ dis_figure_footnote[i] = min(
147
+ bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
148
+ dis_figure_footnote.get(i, float('inf')),
149
+ )
150
+ for i in range(len(footnotes)):
151
+ for j in range(len(tables)):
152
+ pos_flag_count = sum(
153
+ list(
154
+ map(
155
+ lambda x: 1 if x else 0,
156
+ bbox_relative_pos(
157
+ footnotes[i]['bbox'], tables[j]['bbox']
158
+ ),
158
159
  )
159
160
  )
160
- if pos_flag_count > 1:
161
- continue
161
+ )
162
+ if pos_flag_count > 1:
163
+ continue
162
164
 
163
- dis_table_footnote[i] = min(
164
- bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
165
- dis_table_footnote.get(i, float('inf')),
166
- )
167
- for i in range(len(footnotes)):
168
- if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
169
- footnotes[i]['category_id'] = CategoryId.ImageFootnote
165
+ dis_table_footnote[i] = min(
166
+ bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
167
+ dis_table_footnote.get(i, float('inf')),
168
+ )
169
+ for i in range(len(footnotes)):
170
+ if i not in dis_figure_footnote:
171
+ continue
172
+ if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
173
+ footnotes[i]['category_id'] = CategoryId.ImageFootnote
170
174
 
171
175
  def __reduct_overlap(self, bboxes):
172
176
  N = len(bboxes)
@@ -191,6 +195,44 @@ class MagicModel:
191
195
  筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
192
196
  再求出筛选出的 subjects 和 object 的最短距离
193
197
  """
198
def search_overlap_between_boxes(subject_idx, object_idx):
    """Measure how much unrelated layout content the merged box covers.

    The subject and object boxes (looked up in ``all_bboxes``) are merged
    into their common bounding rectangle. Every layout detection on the
    current page whose category is neither the subject nor the object
    category is intersected with that rectangle, and the intersection
    area is divided by the area of the object box.

    Returns:
        The largest such ratio seen (``0`` when there are no other
        detections). The scan stops as soon as the ratio reaches
        ``MERGE_BOX_OVERLAP_AREA_RATIO``.
    """
    subject_bbox = all_bboxes[subject_idx]['bbox']
    object_bbox = all_bboxes[object_idx]['bbox']

    # Smallest rectangle that encloses both the subject and the object.
    merged_bbox = [
        min(subject_bbox[0], object_bbox[0]),
        min(subject_bbox[1], object_bbox[1]),
        max(subject_bbox[2], object_bbox[2]),
        max(subject_bbox[3], object_bbox[3]),
    ]

    # Detections of any category other than the two being paired.
    paired_categories = (object_category_id, subject_category_id)
    other_objects = [
        {'bbox': det['bbox'], 'score': det['score']}
        for det in self.__model_list[page_no]['layout_dets']
        if det['category_id'] not in paired_categories
    ]

    ratio = 0
    for other_object in other_objects:
        overlap = get_overlap_area(merged_bbox, other_object['bbox'])
        ratio = max(
            ratio,
            overlap * 1.0 / box_area(all_bboxes[object_idx]['bbox']),
        )
        # Threshold reached: the caller only compares against it, so
        # further scanning cannot change the outcome.
        if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
            break

    return ratio
194
236
 
195
237
  def may_find_other_nearest_bbox(subject_idx, object_idx):
196
238
  ret = float('inf')
@@ -299,6 +341,15 @@ class MagicModel:
299
341
  ):
300
342
  continue
301
343
 
344
+ subject_idx, object_idx = i, j
345
+ if all_bboxes[j]['category_id'] == subject_category_id:
346
+ subject_idx, object_idx = j, i
347
+
348
+ if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
349
+ dis[i][j] = float('inf')
350
+ dis[j][i] = dis[i][j]
351
+ continue
352
+
302
353
  dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
303
354
  dis[j][i] = dis[i][j]
304
355
 
@@ -627,13 +678,13 @@ class MagicModel:
627
678
  span['type'] = ContentType.Image
628
679
  elif category_id == 5:
629
680
  # 获取table模型结果
630
- latex = layout_det.get("latex", None)
631
- html = layout_det.get("html", None)
681
+ latex = layout_det.get('latex', None)
682
+ html = layout_det.get('html', None)
632
683
  if latex:
633
- span["latex"] = latex
684
+ span['latex'] = latex
634
685
  elif html:
635
- span["html"] = html
636
- span["type"] = ContentType.Table
686
+ span['html'] = html
687
+ span['type'] = ContentType.Table
637
688
  elif category_id == 13:
638
689
  span['content'] = layout_det['latex']
639
690
  span['type'] = ContentType.InlineEquation
magic_pdf/tools/common.py CHANGED
@@ -46,7 +46,7 @@ def do_parse(
46
46
  end_page_id=None,
47
47
  ):
48
48
  if debug_able:
49
- logger.warning("debug mode is on")
49
+ logger.warning('debug mode is on')
50
50
  f_dump_content_list = True
51
51
  f_draw_model_bbox = True
52
52
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: magic-pdf
3
- Version: 0.8.0
3
+ Version: 0.8.1
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  Home-page: https://github.com/opendatalab/MinerU
6
6
  Requires-Python: >=3.9
@@ -49,8 +49,9 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
49
49
  [![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf)
50
50
  [![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf)
51
51
 
52
- [![HuggingFace](https://img.shields.io/badge/HuggingFace-Demo-yellow.svg?logo=)](https://huggingface.co/spaces/opendatalab/MinerU)
53
- [![ModelScope](https://img.shields.io/badge/ModelScope-Demo-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
52
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
53
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
54
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
54
55
  [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
55
56
  [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](#)
56
57
 
@@ -214,7 +215,9 @@ In non-mainline environments, due to the diversity of hardware and software conf
214
215
 
215
216
  ### Online Demo
216
217
 
217
- [Click here for the online demo](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
218
+ [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
219
+ [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
220
+ [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
218
221
 
219
222
  ### Quick CPU Demo
220
223
 
@@ -24,7 +24,7 @@ magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,7
24
24
  magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
25
25
  magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
26
26
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- magic_pdf/libs/boxbase.py,sha256=h6TUcVZPvfyMm3nfnSNIb9cdAZhgfhAXCHz4SxrI2L4,15308
27
+ magic_pdf/libs/boxbase.py,sha256=YGIVYWBHyBSopcTxd5e3FVw6QQUnRMzyYgFJmcEl4Hc,15842
28
28
  magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
29
29
  magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
30
30
  magic_pdf/libs/config_reader.py,sha256=dPx6JJJuCw9AzNgKtrTG1elmfdeN6gDhgFK9r15-NsE,2505
@@ -46,11 +46,11 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
46
46
  magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
47
47
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
48
48
  magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
49
- magic_pdf/libs/version.py,sha256=iPlYCcIzuzW7T2HKDkmYlMkRI51dBLfNRxPPiWrfw9U,22
49
+ magic_pdf/libs/version.py,sha256=Ocl79hbbH8_jdr5dGC90VR1cAvZc05Rc0tkZttUnMjo,22
50
50
  magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
51
51
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
52
52
  magic_pdf/model/doc_analyze_by_custom_model.py,sha256=Ht1zZAB4WV3b-AWdRV5m5fuidlD6ZhNkvJM0j9i1l_E,4809
53
- magic_pdf/model/magic_model.py,sha256=Ze_rb4TQBI53_9-n37F6q6f965CT6r9qemi6rreicLU,28926
53
+ magic_pdf/model/magic_model.py,sha256=afSd9D0S31uEAseY98rJrL9BybBMeSMm5jbeWoWuWWo,30694
54
54
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
55
55
  magic_pdf/model/pdf_extract_kit.py,sha256=Bdxqo3AGXs0VByFVj6ZEOm4T6wXTZwkZsRRFtxasNQM,17901
56
56
  magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
@@ -145,10 +145,10 @@ magic_pdf/spark/spark_api.py,sha256=eSLXTjMYW5Ya41VMIApRVfji1ZxEZXdH9ZdsL6fy5Kw,
145
145
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  magic_pdf/tools/cli.py,sha256=tUeJhGudJIrCDMNQDRTWtNGE_4E0TWpKTuEkf5y_2uk,2734
147
147
  magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,4122
148
- magic_pdf/tools/common.py,sha256=uJkRb2T6pouALitPCXVD2FVEeHwq2NWTsGGeFigdb74,4859
149
- magic_pdf-0.8.0.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
150
- magic_pdf-0.8.0.dist-info/METADATA,sha256=od0Sz7-Uq3vCjbkVGMiJem6pMWVW_S_1SCzgkeWTyFU,23839
151
- magic_pdf-0.8.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
152
- magic_pdf-0.8.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
153
- magic_pdf-0.8.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
154
- magic_pdf-0.8.0.dist-info/RECORD,,
148
+ magic_pdf/tools/common.py,sha256=2KsqN0rNcuyt9B3vAoF-HeeiwdJbO3iO5VE1zBZ1VCw,4859
149
+ magic_pdf-0.8.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
150
+ magic_pdf-0.8.1.dist-info/METADATA,sha256=fWU35cJWoz62IWXDGIC3PsTDIz_vgV_orr1eEjvjjaQ,34142
151
+ magic_pdf-0.8.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
152
+ magic_pdf-0.8.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
153
+ magic_pdf-0.8.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
154
+ magic_pdf-0.8.1.dist-info/RECORD,,