mineru 2.7.2__py3-none-any.whl → 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -424,7 +424,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
424
424
  # 调用通用方法
425
425
  return tie_up_category_by_index(
426
426
  get_subjects,
427
- get_objects
427
+ get_objects,
428
+ object_block_type=object_block_type
428
429
  )
429
430
 
430
431
 
@@ -349,7 +349,8 @@ def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
349
349
  # 调用通用方法
350
350
  return tie_up_category_by_index(
351
351
  get_subjects,
352
- get_objects
352
+ get_objects,
353
+ object_block_type=object_block_type
353
354
  )
354
355
 
355
356
 
@@ -2,6 +2,8 @@
2
2
  包含两个MagicModel类中重复使用的方法和逻辑
3
3
  """
4
4
  from typing import List, Dict, Any, Callable
5
+
6
+ from loguru import logger
5
7
  from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
6
8
 
7
9
 
@@ -172,11 +174,15 @@ def tie_up_category_by_index(
172
174
  get_subjects_func: Callable,
173
175
  get_objects_func: Callable,
174
176
  extract_subject_func: Callable = None,
175
- extract_object_func: Callable = None
177
+ extract_object_func: Callable = None,
178
+ object_block_type: str = "object",
176
179
  ):
177
180
  """
178
181
  基于index的类别关联方法,用于将主体对象与客体对象进行关联
179
- 客体优先匹配给index最接近的主体,index差值相同时使用bbox中心点距离作为tiebreaker
182
+ 客体优先匹配给index最接近的主体,匹配优先级为:
183
+ 1. index差值(最高优先级)
184
+ 2. bbox边缘距离(相邻边距离)
185
+ 3. bbox中心点距离(最低优先级,作为最终tiebreaker)
180
186
 
181
187
  参数:
182
188
  get_subjects_func: 函数,提取主体对象
@@ -207,6 +213,29 @@ def tie_up_category_by_index(
207
213
  "sub_idx": i,
208
214
  }
209
215
 
216
+ # 提取所有客体的index集合,用于计算有效index差值
217
+ object_indices = set(obj["index"] for obj in objects)
218
+
219
+ def calc_effective_index_diff(obj_index: int, sub_index: int) -> int:
220
+ """
221
+ 计算有效的index差值
222
+ 有效差值 = 绝对差值 - 区间内其他客体的数量
223
+ 即:如果obj_index和sub_index之间的差值是由其他客体造成的,则应该扣除这部分差值
224
+ """
225
+ if obj_index == sub_index:
226
+ return 0
227
+
228
+ start, end = min(obj_index, sub_index), max(obj_index, sub_index)
229
+ abs_diff = end - start
230
+
231
+ # 计算区间(start, end)内有多少个其他客体的index
232
+ other_objects_count = 0
233
+ for idx in range(start + 1, end):
234
+ if idx in object_indices:
235
+ other_objects_count += 1
236
+
237
+ return abs_diff - other_objects_count
238
+
210
239
  # 为每个客体找到最匹配的主体
211
240
  for obj in objects:
212
241
  if len(subjects) == 0:
@@ -217,10 +246,10 @@ def tie_up_category_by_index(
217
246
  min_index_diff = float("inf")
218
247
  best_subject_indices = []
219
248
 
220
- # 找出index差值最小的所有主体
249
+ # 找出有效index差值最小的所有主体
221
250
  for i, subject in enumerate(subjects):
222
251
  sub_index = subject["index"]
223
- index_diff = abs(obj_index - sub_index)
252
+ index_diff = calc_effective_index_diff(obj_index, sub_index)
224
253
 
225
254
  if index_diff < min_index_diff:
226
255
  min_index_diff = index_diff
@@ -228,18 +257,37 @@ def tie_up_category_by_index(
228
257
  elif index_diff == min_index_diff:
229
258
  best_subject_indices.append(i)
230
259
 
231
- # 如果有多个主体的index差值相同,使用中心点距离作为tiebreaker
232
- if len(best_subject_indices) > 1:
233
- min_center_dist = float("inf")
260
+ if len(best_subject_indices) == 1:
234
261
  best_subject_idx = best_subject_indices[0]
235
-
236
- for idx in best_subject_indices:
237
- center_dist = bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])
238
- if center_dist < min_center_dist:
239
- min_center_dist = center_dist
240
- best_subject_idx = idx
262
+ # 如果有多个主体的index差值相同(最多两个),根据边缘距离进行筛选
263
+ elif len(best_subject_indices) == 2:
264
+ # 计算所有候选主体的边缘距离
265
+ edge_distances = [(idx, bbox_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
266
+ edge_dist_diff = abs(edge_distances[0][1] - edge_distances[1][1])
267
+
268
+ for idx, edge_dist in edge_distances:
269
+ logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Edge distance: {edge_dist}")
270
+
271
+ if edge_dist_diff > 2:
272
+ # 边缘距离差值大于2,匹配边缘距离更小的主体
273
+ best_subject_idx = min(edge_distances, key=lambda x: x[1])[0]
274
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff > 2, matching to subject with min edge distance, index: {subjects[best_subject_idx]['index']}")
275
+ elif object_block_type == "table_caption":
276
+ # 边缘距离差值<=2且为table_caption,匹配index更大的主体
277
+ best_subject_idx = max(best_subject_indices, key=lambda idx: subjects[idx]["index"])
278
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and table_caption, matching to later subject with index: {subjects[best_subject_idx]['index']}")
279
+ elif object_block_type.endswith("footnote"):
280
+ # 边缘距离差值<=2且为footnote,匹配index更小的主体
281
+ best_subject_idx = min(best_subject_indices, key=lambda idx: subjects[idx]["index"])
282
+ logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and footnote, matching to earlier subject with index: {subjects[best_subject_idx]['index']}")
283
+ else:
284
+ # 边缘距离差值<=2 且不适用特殊匹配规则,使用中心点距离匹配
285
+ center_distances = [(idx, bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
286
+ for idx, center_dist in center_distances:
287
+ logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Center distance: {center_dist}")
288
+ best_subject_idx = min(center_distances, key=lambda x: x[1])[0]
241
289
  else:
242
- best_subject_idx = best_subject_indices[0]
290
+ raise ValueError("More than two subjects have the same minimal index difference, which is unexpected.")
243
291
 
244
292
  # 将客体添加到最佳主体的obj_bboxes中
245
293
  result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.7.2"
1
+ __version__ = "2.7.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.7.2
3
+ Version: 2.7.3
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -1,10 +1,10 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=H1WLrviWKvrPzDle8EWdCYYkzljxs0mtbXigYc-xaKA,22
2
+ mineru/version.py,sha256=uf6cgtzZWaYn5QApMyykHXMzWM_oEqWLhYTsWSWu2_k,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
6
6
  mineru/backend/hybrid/hybrid_analyze.py,sha256=Sckw6T-pvMv3V_nqZkBeW8kY4zNIBlWxqeS2vXqNqtY,20939
7
- mineru/backend/hybrid/hybrid_magic_model.py,sha256=39ByeZh54KBbPe77bzGCqZrZ5RNwNxGYttcoisgDOrc,24668
7
+ mineru/backend/hybrid/hybrid_magic_model.py,sha256=_DvBq5WP_UZvmHfhZloxqv-MKoWWe_ye1kNLv6RA5rU,24713
8
8
  mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
9
9
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
10
10
  mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
@@ -19,7 +19,7 @@ mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5
19
19
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
20
20
  mineru/backend/vlm/utils.py,sha256=1qma_KmDjRfOckcPbriGgRhS1XMk_johsyACfwcmDr4,3844
21
21
  mineru/backend/vlm/vlm_analyze.py,sha256=ttnQBUy1PEm9JZoF2G1_z-7gA3MgUUUBhz6OypCb4_g,14765
22
- mineru/backend/vlm/vlm_magic_model.py,sha256=mD-irxboo2DmMu4QF1wnvbti2xdNyBmNflbB4a-TmsU,21402
22
+ mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
23
23
  mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
24
24
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
25
25
  mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
@@ -171,7 +171,7 @@ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6o
171
171
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
172
172
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
173
173
  mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
174
- mineru/utils/magic_model_utils.py,sha256=I6vdN56aqhQBGOasoWHiJbjnXsBwUojw6xFjbWZSHaU,8656
174
+ mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
175
175
  mineru/utils/model_utils.py,sha256=w-jSN7Ilh27FlMjPpKNO6MPbo_dT5Ln7zCQcXaREl_k,19605
176
176
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
177
177
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
185
185
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
186
186
  mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
187
187
  mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
188
- mineru-2.7.2.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
- mineru-2.7.2.dist-info/METADATA,sha256=w3qS7X-Wjvqz8Ra5fp0QH-Wvq_RbZHGyaVOL8WIrerw,36621
190
- mineru-2.7.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
191
- mineru-2.7.2.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
- mineru-2.7.2.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
- mineru-2.7.2.dist-info/RECORD,,
188
+ mineru-2.7.3.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
189
+ mineru-2.7.3.dist-info/METADATA,sha256=XDUBoY78vVkmR2TFpXk_frncPD6D_Ev067KuoRUJR2U,36621
190
+ mineru-2.7.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
191
+ mineru-2.7.3.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
192
+ mineru-2.7.3.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
193
+ mineru-2.7.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5