mineru 2.7.2__py3-none-any.whl → 2.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/hybrid_magic_model.py +2 -1
- mineru/backend/vlm/vlm_magic_model.py +2 -1
- mineru/utils/magic_model_utils.py +62 -14
- mineru/version.py +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/METADATA +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/RECORD +10 -10
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/WHEEL +1 -1
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/entry_points.txt +0 -0
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.7.2.dist-info → mineru-2.7.3.dist-info}/top_level.txt +0 -0
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
包含两个MagicModel类中重复使用的方法和逻辑
|
|
3
3
|
"""
|
|
4
4
|
from typing import List, Dict, Any, Callable
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
5
7
|
from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
|
|
6
8
|
|
|
7
9
|
|
|
@@ -172,11 +174,15 @@ def tie_up_category_by_index(
|
|
|
172
174
|
get_subjects_func: Callable,
|
|
173
175
|
get_objects_func: Callable,
|
|
174
176
|
extract_subject_func: Callable = None,
|
|
175
|
-
extract_object_func: Callable = None
|
|
177
|
+
extract_object_func: Callable = None,
|
|
178
|
+
object_block_type: str = "object",
|
|
176
179
|
):
|
|
177
180
|
"""
|
|
178
181
|
基于index的类别关联方法,用于将主体对象与客体对象进行关联
|
|
179
|
-
客体优先匹配给index
|
|
182
|
+
客体优先匹配给index最接近的主体,匹配优先级为:
|
|
183
|
+
1. index差值(最高优先级)
|
|
184
|
+
2. bbox边缘距离(相邻边距离)
|
|
185
|
+
3. bbox中心点距离(最低优先级,作为最终tiebreaker)
|
|
180
186
|
|
|
181
187
|
参数:
|
|
182
188
|
get_subjects_func: 函数,提取主体对象
|
|
@@ -207,6 +213,29 @@ def tie_up_category_by_index(
|
|
|
207
213
|
"sub_idx": i,
|
|
208
214
|
}
|
|
209
215
|
|
|
216
|
+
# 提取所有客体的index集合,用于计算有效index差值
|
|
217
|
+
object_indices = set(obj["index"] for obj in objects)
|
|
218
|
+
|
|
219
|
+
def calc_effective_index_diff(obj_index: int, sub_index: int) -> int:
|
|
220
|
+
"""
|
|
221
|
+
计算有效的index差值
|
|
222
|
+
有效差值 = 绝对差值 - 区间内其他客体的数量
|
|
223
|
+
即:如果obj_index和sub_index之间的差值是由其他客体造成的,则应该扣除这部分差值
|
|
224
|
+
"""
|
|
225
|
+
if obj_index == sub_index:
|
|
226
|
+
return 0
|
|
227
|
+
|
|
228
|
+
start, end = min(obj_index, sub_index), max(obj_index, sub_index)
|
|
229
|
+
abs_diff = end - start
|
|
230
|
+
|
|
231
|
+
# 计算区间(start, end)内有多少个其他客体的index
|
|
232
|
+
other_objects_count = 0
|
|
233
|
+
for idx in range(start + 1, end):
|
|
234
|
+
if idx in object_indices:
|
|
235
|
+
other_objects_count += 1
|
|
236
|
+
|
|
237
|
+
return abs_diff - other_objects_count
|
|
238
|
+
|
|
210
239
|
# 为每个客体找到最匹配的主体
|
|
211
240
|
for obj in objects:
|
|
212
241
|
if len(subjects) == 0:
|
|
@@ -217,10 +246,10 @@ def tie_up_category_by_index(
|
|
|
217
246
|
min_index_diff = float("inf")
|
|
218
247
|
best_subject_indices = []
|
|
219
248
|
|
|
220
|
-
#
|
|
249
|
+
# 找出有效index差值最小的所有主体
|
|
221
250
|
for i, subject in enumerate(subjects):
|
|
222
251
|
sub_index = subject["index"]
|
|
223
|
-
index_diff =
|
|
252
|
+
index_diff = calc_effective_index_diff(obj_index, sub_index)
|
|
224
253
|
|
|
225
254
|
if index_diff < min_index_diff:
|
|
226
255
|
min_index_diff = index_diff
|
|
@@ -228,18 +257,37 @@ def tie_up_category_by_index(
|
|
|
228
257
|
elif index_diff == min_index_diff:
|
|
229
258
|
best_subject_indices.append(i)
|
|
230
259
|
|
|
231
|
-
|
|
232
|
-
if len(best_subject_indices) > 1:
|
|
233
|
-
min_center_dist = float("inf")
|
|
260
|
+
if len(best_subject_indices) == 1:
|
|
234
261
|
best_subject_idx = best_subject_indices[0]
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
262
|
+
# 如果有多个主体的index差值相同(最多两个),根据边缘距离进行筛选
|
|
263
|
+
elif len(best_subject_indices) == 2:
|
|
264
|
+
# 计算所有候选主体的边缘距离
|
|
265
|
+
edge_distances = [(idx, bbox_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
|
266
|
+
edge_dist_diff = abs(edge_distances[0][1] - edge_distances[1][1])
|
|
267
|
+
|
|
268
|
+
for idx, edge_dist in edge_distances:
|
|
269
|
+
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Edge distance: {edge_dist}")
|
|
270
|
+
|
|
271
|
+
if edge_dist_diff > 2:
|
|
272
|
+
# 边缘距离差值大于2,匹配边缘距离更小的主体
|
|
273
|
+
best_subject_idx = min(edge_distances, key=lambda x: x[1])[0]
|
|
274
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff > 2, matching to subject with min edge distance, index: {subjects[best_subject_idx]['index']}")
|
|
275
|
+
elif object_block_type == "table_caption":
|
|
276
|
+
# 边缘距离差值<=2且为table_caption,匹配index更大的主体
|
|
277
|
+
best_subject_idx = max(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
|
278
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and table_caption, matching to later subject with index: {subjects[best_subject_idx]['index']}")
|
|
279
|
+
elif object_block_type.endswith("footnote"):
|
|
280
|
+
# 边缘距离差值<=2且为footnote,匹配index更小的主体
|
|
281
|
+
best_subject_idx = min(best_subject_indices, key=lambda idx: subjects[idx]["index"])
|
|
282
|
+
logger.debug(f"Obj index: {obj_index}, edge_dist_diff <= 2 and footnote, matching to earlier subject with index: {subjects[best_subject_idx]['index']}")
|
|
283
|
+
else:
|
|
284
|
+
# 边缘距离差值<=2 且不适用特殊匹配规则,使用中心点距离匹配
|
|
285
|
+
center_distances = [(idx, bbox_center_distance(obj["bbox"], subjects[idx]["bbox"])) for idx in best_subject_indices]
|
|
286
|
+
for idx, center_dist in center_distances:
|
|
287
|
+
logger.debug(f"Obj index: {obj_index}, Sub index: {subjects[idx]['index']}, Center distance: {center_dist}")
|
|
288
|
+
best_subject_idx = min(center_distances, key=lambda x: x[1])[0]
|
|
241
289
|
else:
|
|
242
|
-
|
|
290
|
+
raise ValueError("More than two subjects have the same minimal index difference, which is unexpected.")
|
|
243
291
|
|
|
244
292
|
# 将客体添加到最佳主体的obj_bboxes中
|
|
245
293
|
result_dict[best_subject_idx]["obj_bboxes"].append(extract_object_func(obj))
|
mineru/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "2.7.2"
|
|
1
|
+
__version__ = "2.7.3"
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
2
|
-
mineru/version.py,sha256=
|
|
2
|
+
mineru/version.py,sha256=uf6cgtzZWaYn5QApMyykHXMzWM_oEqWLhYTsWSWu2_k,22
|
|
3
3
|
mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
4
4
|
mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
|
|
5
5
|
mineru/backend/hybrid/__init__.py,sha256=IFgr2C8NfSAj8q7JF7QOqMvCiJ6Fc8TIuU3Uh2DaFZU,51
|
|
6
6
|
mineru/backend/hybrid/hybrid_analyze.py,sha256=Sckw6T-pvMv3V_nqZkBeW8kY4zNIBlWxqeS2vXqNqtY,20939
|
|
7
|
-
mineru/backend/hybrid/hybrid_magic_model.py,sha256=
|
|
7
|
+
mineru/backend/hybrid/hybrid_magic_model.py,sha256=_DvBq5WP_UZvmHfhZloxqv-MKoWWe_ye1kNLv6RA5rU,24713
|
|
8
8
|
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py,sha256=yE-c1eGa5LzPqLfKfvBON_SJRljqyz2B7LiglFcE7FQ,8468
|
|
9
9
|
mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
10
10
|
mineru/backend/pipeline/batch_analyze.py,sha256=3UBs2WOwcI-mfGAlxZt437OqSOleXPLnpYbrD9h5D54,21303
|
|
@@ -19,7 +19,7 @@ mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5
|
|
|
19
19
|
mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
|
|
20
20
|
mineru/backend/vlm/utils.py,sha256=1qma_KmDjRfOckcPbriGgRhS1XMk_johsyACfwcmDr4,3844
|
|
21
21
|
mineru/backend/vlm/vlm_analyze.py,sha256=ttnQBUy1PEm9JZoF2G1_z-7gA3MgUUUBhz6OypCb4_g,14765
|
|
22
|
-
mineru/backend/vlm/vlm_magic_model.py,sha256=
|
|
22
|
+
mineru/backend/vlm/vlm_magic_model.py,sha256=RodoVwNJhzjyuRLn5Io5gFMIX1NxCuuLzCbUxGaKV80,21447
|
|
23
23
|
mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=w-Szbm4HitR7MY4pinSCZZdXtPSqmtlU9cjNh4IOQyg,29499
|
|
24
24
|
mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
|
25
25
|
mineru/cli/client.py,sha256=mPNfMEShVG-ithmlJQ5nGRIad2gCZgUjBGHN7zAmLhQ,6978
|
|
@@ -171,7 +171,7 @@ mineru/utils/guess_suffix_or_lang.py,sha256=aUC2wAJwa5LH0SHxwTbOEJqVVgvpdUCWFF6o
|
|
|
171
171
|
mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
|
|
172
172
|
mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
|
|
173
173
|
mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
|
|
174
|
-
mineru/utils/magic_model_utils.py,sha256=
|
|
174
|
+
mineru/utils/magic_model_utils.py,sha256=8Hv-BDk9Ez4TUx6hrVJ_675yZZggPj6Uib81lSpm0ig,11683
|
|
175
175
|
mineru/utils/model_utils.py,sha256=w-jSN7Ilh27FlMjPpKNO6MPbo_dT5Ln7zCQcXaREl_k,19605
|
|
176
176
|
mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
|
|
177
177
|
mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
|
|
@@ -185,9 +185,9 @@ mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,128
|
|
|
185
185
|
mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
|
|
186
186
|
mineru/utils/span_pre_proc.py,sha256=nu6Bh5TWPKFzHuFfbEs0Asr04M4xOL5IONz_8GJHn44,13862
|
|
187
187
|
mineru/utils/table_merge.py,sha256=LORxz0THemCqH746FMViqEuLzM088M4HgIkEuwDIfNU,21393
|
|
188
|
-
mineru-2.7.
|
|
189
|
-
mineru-2.7.
|
|
190
|
-
mineru-2.7.
|
|
191
|
-
mineru-2.7.
|
|
192
|
-
mineru-2.7.
|
|
193
|
-
mineru-2.7.
|
|
188
|
+
mineru-2.7.3.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
|
189
|
+
mineru-2.7.3.dist-info/METADATA,sha256=XDUBoY78vVkmR2TFpXk_frncPD6D_Ev067KuoRUJR2U,36621
|
|
190
|
+
mineru-2.7.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
191
|
+
mineru-2.7.3.dist-info/entry_points.txt,sha256=JbtrCPhx1T32s7TONUsteKg-24ZwRT1HSiFtW5jypVw,376
|
|
192
|
+
mineru-2.7.3.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
|
|
193
|
+
mineru-2.7.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|