mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/__init__.py +1 -0
- mineru/backend/hybrid/hybrid_analyze.py +526 -0
- mineru/backend/hybrid/hybrid_magic_model.py +617 -0
- mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
- mineru/backend/pipeline/batch_analyze.py +9 -1
- mineru/backend/pipeline/model_init.py +96 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -4
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
- mineru/backend/vlm/utils.py +3 -1
- mineru/backend/vlm/vlm_analyze.py +12 -12
- mineru/backend/vlm/vlm_magic_model.py +24 -89
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
- mineru/cli/client.py +17 -17
- mineru/cli/common.py +169 -20
- mineru/cli/fast_api.py +39 -13
- mineru/cli/gradio_app.py +232 -206
- mineru/model/mfd/yolo_v8.py +12 -6
- mineru/model/mfr/unimernet/Unimernet.py +71 -3
- mineru/resources/header.html +5 -1
- mineru/utils/boxbase.py +23 -0
- mineru/utils/char_utils.py +55 -0
- mineru/utils/engine_utils.py +74 -0
- mineru/utils/enum_class.py +18 -1
- mineru/utils/magic_model_utils.py +85 -2
- mineru/utils/pdf_image_tools.py +37 -17
- mineru/utils/span_pre_proc.py +5 -3
- mineru/utils/table_merge.py +13 -22
- mineru/version.py +1 -1
- mineru-2.7.1.dist-info/METADATA +438 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/RECORD +34 -28
- mineru-2.6.8.dist-info/METADATA +0 -954
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0
|
@@ -6,7 +6,7 @@ from loguru import logger
|
|
|
6
6
|
from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
|
|
7
7
|
from mineru.utils.enum_class import ContentType, BlockType
|
|
8
8
|
from mineru.utils.guess_suffix_or_lang import guess_language_by_text
|
|
9
|
-
from mineru.utils.magic_model_utils import reduct_overlap,
|
|
9
|
+
from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class MagicModel:
|
|
@@ -317,13 +317,14 @@ def clean_content(content):
|
|
|
317
317
|
return content
|
|
318
318
|
|
|
319
319
|
|
|
320
|
-
def
|
|
320
|
+
def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
|
|
321
|
+
"""基于index的主客体关联包装函数"""
|
|
321
322
|
# 定义获取主体和客体对象的函数
|
|
322
323
|
def get_subjects():
|
|
323
324
|
return reduct_overlap(
|
|
324
325
|
list(
|
|
325
326
|
map(
|
|
326
|
-
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
|
|
327
|
+
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
|
|
327
328
|
filter(
|
|
328
329
|
lambda x: x["type"] == subject_block_type,
|
|
329
330
|
blocks,
|
|
@@ -336,7 +337,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
|
|
|
336
337
|
return reduct_overlap(
|
|
337
338
|
list(
|
|
338
339
|
map(
|
|
339
|
-
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
|
|
340
|
+
lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
|
|
340
341
|
filter(
|
|
341
342
|
lambda x: x["type"] == object_block_type,
|
|
342
343
|
blocks,
|
|
@@ -346,15 +347,15 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
|
|
|
346
347
|
)
|
|
347
348
|
|
|
348
349
|
# 调用通用方法
|
|
349
|
-
return
|
|
350
|
+
return tie_up_category_by_index(
|
|
350
351
|
get_subjects,
|
|
351
352
|
get_objects
|
|
352
353
|
)
|
|
353
354
|
|
|
354
355
|
|
|
355
356
|
def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
|
|
356
|
-
with_captions =
|
|
357
|
-
with_footnotes =
|
|
357
|
+
with_captions = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_caption")
|
|
358
|
+
with_footnotes = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_footnote")
|
|
358
359
|
ret = []
|
|
359
360
|
for v in with_captions:
|
|
360
361
|
record = {
|
|
@@ -368,49 +369,6 @@ def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
|
|
|
368
369
|
return ret
|
|
369
370
|
|
|
370
371
|
|
|
371
|
-
def fix_two_layer_blocks_back(blocks, fix_type: Literal["image", "table", "code"]):
|
|
372
|
-
need_fix_blocks = get_type_blocks(blocks, fix_type)
|
|
373
|
-
fixed_blocks = []
|
|
374
|
-
not_include_blocks = []
|
|
375
|
-
processed_indices = set()
|
|
376
|
-
|
|
377
|
-
# 处理需要组织成two_layer结构的blocks
|
|
378
|
-
for block in need_fix_blocks:
|
|
379
|
-
body = block[f"{fix_type}_body"]
|
|
380
|
-
caption_list = block[f"{fix_type}_caption_list"]
|
|
381
|
-
footnote_list = block[f"{fix_type}_footnote_list"]
|
|
382
|
-
|
|
383
|
-
body["type"] = f"{fix_type}_body"
|
|
384
|
-
for caption in caption_list:
|
|
385
|
-
caption["type"] = f"{fix_type}_caption"
|
|
386
|
-
processed_indices.add(caption["index"])
|
|
387
|
-
for footnote in footnote_list:
|
|
388
|
-
footnote["type"] = f"{fix_type}_footnote"
|
|
389
|
-
processed_indices.add(footnote["index"])
|
|
390
|
-
|
|
391
|
-
processed_indices.add(body["index"])
|
|
392
|
-
|
|
393
|
-
two_layer_block = {
|
|
394
|
-
"type": fix_type,
|
|
395
|
-
"bbox": body["bbox"],
|
|
396
|
-
"blocks": [
|
|
397
|
-
body,
|
|
398
|
-
],
|
|
399
|
-
"index": body["index"],
|
|
400
|
-
}
|
|
401
|
-
two_layer_block["blocks"].extend([*caption_list, *footnote_list])
|
|
402
|
-
|
|
403
|
-
fixed_blocks.append(two_layer_block)
|
|
404
|
-
|
|
405
|
-
# 添加未处理的blocks
|
|
406
|
-
for block in blocks:
|
|
407
|
-
if block["index"] not in processed_indices:
|
|
408
|
-
# 直接添加未处理的block
|
|
409
|
-
not_include_blocks.append(block)
|
|
410
|
-
|
|
411
|
-
return fixed_blocks, not_include_blocks
|
|
412
|
-
|
|
413
|
-
|
|
414
372
|
def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
|
|
415
373
|
need_fix_blocks = get_type_blocks(blocks, fix_type)
|
|
416
374
|
fixed_blocks = []
|
|
@@ -418,25 +376,16 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
|
|
|
418
376
|
processed_indices = set()
|
|
419
377
|
|
|
420
378
|
# 特殊处理表格类型,确保标题在表格前,注脚在表格后
|
|
421
|
-
if fix_type
|
|
379
|
+
if fix_type in ["table", "image"]:
|
|
422
380
|
# 收集所有不合适的caption和footnote
|
|
423
381
|
misplaced_captions = [] # 存储(caption, 原始block索引)
|
|
424
382
|
misplaced_footnotes = [] # 存储(footnote, 原始block索引)
|
|
425
383
|
|
|
426
|
-
# 第一步:移除不符合位置要求的
|
|
384
|
+
# 第一步:移除不符合位置要求的footnote
|
|
427
385
|
for block_idx, block in enumerate(need_fix_blocks):
|
|
428
386
|
body = block[f"{fix_type}_body"]
|
|
429
387
|
body_index = body["index"]
|
|
430
388
|
|
|
431
|
-
# 检查caption应在body前或同位置
|
|
432
|
-
valid_captions = []
|
|
433
|
-
for caption in block[f"{fix_type}_caption_list"]:
|
|
434
|
-
if caption["index"] <= body_index:
|
|
435
|
-
valid_captions.append(caption)
|
|
436
|
-
else:
|
|
437
|
-
misplaced_captions.append((caption, block_idx))
|
|
438
|
-
block[f"{fix_type}_caption_list"] = valid_captions
|
|
439
|
-
|
|
440
389
|
# 检查footnote应在body后或同位置
|
|
441
390
|
valid_footnotes = []
|
|
442
391
|
for footnote in block[f"{fix_type}_footnote_list"]:
|
|
@@ -446,28 +395,6 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
|
|
|
446
395
|
misplaced_footnotes.append((footnote, block_idx))
|
|
447
396
|
block[f"{fix_type}_footnote_list"] = valid_footnotes
|
|
448
397
|
|
|
449
|
-
# 第二步:重新分配不合规的caption到合适的body
|
|
450
|
-
for caption, original_block_idx in misplaced_captions:
|
|
451
|
-
caption_index = caption["index"]
|
|
452
|
-
best_block_idx = None
|
|
453
|
-
min_distance = float('inf')
|
|
454
|
-
|
|
455
|
-
# 寻找索引大于等于caption_index的最近body
|
|
456
|
-
for idx, block in enumerate(need_fix_blocks):
|
|
457
|
-
body_index = block[f"{fix_type}_body"]["index"]
|
|
458
|
-
if body_index >= caption_index and idx != original_block_idx:
|
|
459
|
-
distance = body_index - caption_index
|
|
460
|
-
if distance < min_distance:
|
|
461
|
-
min_distance = distance
|
|
462
|
-
best_block_idx = idx
|
|
463
|
-
|
|
464
|
-
if best_block_idx is not None:
|
|
465
|
-
# 找到合适的body,添加到对应block的caption_list
|
|
466
|
-
need_fix_blocks[best_block_idx][f"{fix_type}_caption_list"].append(caption)
|
|
467
|
-
else:
|
|
468
|
-
# 没找到合适的body,作为普通block处理
|
|
469
|
-
not_include_blocks.append(caption)
|
|
470
|
-
|
|
471
398
|
# 第三步:重新分配不合规的footnote到合适的body
|
|
472
399
|
for footnote, original_block_idx in misplaced_footnotes:
|
|
473
400
|
footnote_index = footnote["index"]
|
|
@@ -502,13 +429,22 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
|
|
|
502
429
|
caption_list.sort(key=lambda x: x["index"], reverse=True)
|
|
503
430
|
filtered_captions = [caption_list[0]]
|
|
504
431
|
for i in range(1, len(caption_list)):
|
|
505
|
-
|
|
506
|
-
|
|
432
|
+
prev_index = caption_list[i - 1]["index"]
|
|
433
|
+
curr_index = caption_list[i]["index"]
|
|
434
|
+
|
|
435
|
+
# 检查是否连续
|
|
436
|
+
if curr_index == prev_index - 1:
|
|
507
437
|
filtered_captions.append(caption_list[i])
|
|
508
438
|
else:
|
|
509
|
-
#
|
|
510
|
-
|
|
511
|
-
|
|
439
|
+
# 检查gap中是否只有body_index
|
|
440
|
+
gap_indices = set(range(curr_index + 1, prev_index))
|
|
441
|
+
if gap_indices == {body_index}:
|
|
442
|
+
# gap中只有body_index,不算真正的gap
|
|
443
|
+
filtered_captions.append(caption_list[i])
|
|
444
|
+
else:
|
|
445
|
+
# 出现真正的gap,后续所有caption都作为普通block
|
|
446
|
+
not_include_blocks.extend(caption_list[i:])
|
|
447
|
+
break
|
|
512
448
|
# 恢复升序
|
|
513
449
|
filtered_captions.reverse()
|
|
514
450
|
block[f"{fix_type}_caption_list"] = filtered_captions
|
|
@@ -592,7 +528,6 @@ def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
|
|
|
592
528
|
for list_block in list_blocks:
|
|
593
529
|
# 统计list_block["blocks"]中所有block的type,用众数作为list_block的sub_type
|
|
594
530
|
type_count = {}
|
|
595
|
-
line_content = []
|
|
596
531
|
for sub_block in list_block["blocks"]:
|
|
597
532
|
sub_block_type = sub_block["type"]
|
|
598
533
|
if sub_block_type not in type_count:
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
3
|
from loguru import logger
|
|
4
|
+
|
|
5
|
+
from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
|
|
4
6
|
from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
|
|
5
7
|
from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
|
|
8
|
+
from mineru.utils.language import detect_lang
|
|
6
9
|
|
|
7
10
|
latex_delimiters_config = get_latex_delimiter_config()
|
|
8
11
|
|
|
@@ -18,9 +21,18 @@ display_right_delimiter = delimiters['display']['right']
|
|
|
18
21
|
inline_left_delimiter = delimiters['inline']['left']
|
|
19
22
|
inline_right_delimiter = delimiters['inline']['right']
|
|
20
23
|
|
|
24
|
+
|
|
21
25
|
def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
|
|
22
|
-
|
|
26
|
+
block_text = ''
|
|
23
27
|
for line in para_block['lines']:
|
|
28
|
+
for span in line['spans']:
|
|
29
|
+
if span['type'] in [ContentType.TEXT]:
|
|
30
|
+
span['content'] = full_to_half_exclude_marks(span['content'])
|
|
31
|
+
block_text += span['content']
|
|
32
|
+
block_lang = detect_lang(block_text)
|
|
33
|
+
|
|
34
|
+
para_text = ''
|
|
35
|
+
for i, line in enumerate(para_block['lines']):
|
|
24
36
|
for j, span in enumerate(line['spans']):
|
|
25
37
|
span_type = span['type']
|
|
26
38
|
content = ''
|
|
@@ -34,17 +46,51 @@ def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
|
|
|
34
46
|
else:
|
|
35
47
|
if span.get('image_path', ''):
|
|
36
48
|
content = f""
|
|
37
|
-
|
|
49
|
+
|
|
50
|
+
content = content.strip()
|
|
38
51
|
if content:
|
|
39
|
-
|
|
40
|
-
|
|
52
|
+
|
|
53
|
+
if span_type == ContentType.INTERLINE_EQUATION:
|
|
54
|
+
para_text += content
|
|
55
|
+
continue
|
|
56
|
+
|
|
57
|
+
# 定义CJK语言集合(中日韩)
|
|
58
|
+
cjk_langs = {'zh', 'ja', 'ko'}
|
|
59
|
+
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
|
60
|
+
|
|
61
|
+
# 判断是否为行末span
|
|
62
|
+
is_last_span = j == len(line['spans']) - 1
|
|
63
|
+
|
|
64
|
+
if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
|
65
|
+
if is_last_span and span_type != ContentType.INLINE_EQUATION:
|
|
41
66
|
para_text += content
|
|
42
67
|
else:
|
|
43
68
|
para_text += f'{content} '
|
|
44
|
-
|
|
45
|
-
|
|
69
|
+
else:
|
|
70
|
+
# 西方文本语境下 每行的最后一个span判断是否要去除连字符
|
|
71
|
+
if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
|
|
72
|
+
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
|
73
|
+
if (
|
|
74
|
+
is_last_span
|
|
75
|
+
and span_type == ContentType.TEXT
|
|
76
|
+
and is_hyphen_at_line_end(content)
|
|
77
|
+
):
|
|
78
|
+
# 如果下一行的第一个span是小写字母开头,删除连字符
|
|
79
|
+
if (
|
|
80
|
+
i+1 < len(para_block['lines'])
|
|
81
|
+
and para_block['lines'][i + 1].get('spans')
|
|
82
|
+
and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
|
|
83
|
+
and para_block['lines'][i + 1]['spans'][0].get('content', '')
|
|
84
|
+
and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
|
|
85
|
+
):
|
|
86
|
+
para_text += content[:-1]
|
|
87
|
+
else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
|
|
88
|
+
para_text += content
|
|
89
|
+
else: # 西方文本语境下 content间需要空格分隔
|
|
90
|
+
para_text += f'{content} '
|
|
46
91
|
return para_text
|
|
47
92
|
|
|
93
|
+
|
|
48
94
|
def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
|
|
49
95
|
page_markdown = []
|
|
50
96
|
for para_block in para_blocks:
|
|
@@ -479,22 +525,76 @@ def get_body_data(para_block):
|
|
|
479
525
|
|
|
480
526
|
|
|
481
527
|
def merge_para_with_text_v2(para_block):
|
|
482
|
-
|
|
483
|
-
para_type = para_block['type']
|
|
528
|
+
block_text = ''
|
|
484
529
|
for line in para_block['lines']:
|
|
485
530
|
for span in line['spans']:
|
|
531
|
+
if span['type'] in [ContentType.TEXT]:
|
|
532
|
+
span['content'] = full_to_half_exclude_marks(span['content'])
|
|
533
|
+
block_text += span['content']
|
|
534
|
+
block_lang = detect_lang(block_text)
|
|
535
|
+
|
|
536
|
+
para_content = []
|
|
537
|
+
para_type = para_block['type']
|
|
538
|
+
for i, line in enumerate(para_block['lines']):
|
|
539
|
+
for j, span in enumerate(line['spans']):
|
|
486
540
|
span_type = span['type']
|
|
487
541
|
if span.get("content", '').strip():
|
|
488
|
-
if
|
|
489
|
-
|
|
542
|
+
if span_type == ContentType.TEXT:
|
|
543
|
+
if para_type == BlockType.PHONETIC:
|
|
544
|
+
span_type = ContentTypeV2.SPAN_PHONETIC
|
|
545
|
+
else:
|
|
546
|
+
span_type = ContentTypeV2.SPAN_TEXT
|
|
490
547
|
if span_type == ContentType.INLINE_EQUATION:
|
|
491
548
|
span_type = ContentTypeV2.SPAN_EQUATION_INLINE
|
|
492
549
|
if span_type in [
|
|
493
550
|
ContentTypeV2.SPAN_TEXT,
|
|
551
|
+
]:
|
|
552
|
+
# 定义CJK语言集合(中日韩)
|
|
553
|
+
cjk_langs = {'zh', 'ja', 'ko'}
|
|
554
|
+
# logger.info(f'block_lang: {block_lang}, content: {content}')
|
|
555
|
+
|
|
556
|
+
# 判断是否为行末span
|
|
557
|
+
is_last_span = j == len(line['spans']) - 1
|
|
558
|
+
|
|
559
|
+
if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
|
|
560
|
+
if is_last_span:
|
|
561
|
+
span_content = span['content']
|
|
562
|
+
else:
|
|
563
|
+
span_content = f"{span['content']} "
|
|
564
|
+
else:
|
|
565
|
+
# 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
|
|
566
|
+
if (
|
|
567
|
+
is_last_span
|
|
568
|
+
and is_hyphen_at_line_end(span['content'])
|
|
569
|
+
):
|
|
570
|
+
# 如果下一行的第一个span是小写字母开头,删除连字符
|
|
571
|
+
if (
|
|
572
|
+
i + 1 < len(para_block['lines'])
|
|
573
|
+
and para_block['lines'][i + 1].get('spans')
|
|
574
|
+
and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
|
|
575
|
+
and para_block['lines'][i + 1]['spans'][0].get('content', '')
|
|
576
|
+
and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
|
|
577
|
+
):
|
|
578
|
+
span_content = span['content'][:-1]
|
|
579
|
+
else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
|
|
580
|
+
span_content = span['content']
|
|
581
|
+
else:
|
|
582
|
+
# 西方文本语境下content间需要空格分隔
|
|
583
|
+
span_content = f"{span['content']} "
|
|
584
|
+
|
|
585
|
+
if para_content and para_content[-1]['type'] == span_type:
|
|
586
|
+
# 合并相同类型的span
|
|
587
|
+
para_content[-1]['content'] += span_content
|
|
588
|
+
else:
|
|
589
|
+
span_content = {
|
|
590
|
+
'type': span_type,
|
|
591
|
+
'content': span_content,
|
|
592
|
+
}
|
|
593
|
+
para_content.append(span_content)
|
|
594
|
+
|
|
595
|
+
elif span_type in [
|
|
494
596
|
ContentTypeV2.SPAN_PHONETIC,
|
|
495
597
|
ContentTypeV2.SPAN_EQUATION_INLINE,
|
|
496
|
-
ContentTypeV2.SPAN_MD,
|
|
497
|
-
ContentTypeV2.SPAN_CODE_INLINE,
|
|
498
598
|
]:
|
|
499
599
|
span_content = {
|
|
500
600
|
'type': span_type,
|
mineru/cli/client.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
# Copyright (c) Opendatalab. All rights reserved.
|
|
2
2
|
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
3
5
|
import click
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
from loguru import logger
|
|
6
8
|
|
|
7
|
-
|
|
9
|
+
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
|
|
10
|
+
logger.remove() # 移除默认handler
|
|
11
|
+
logger.add(sys.stderr, level=log_level) # 添加新handler
|
|
12
|
+
|
|
8
13
|
from mineru.utils.cli_parser import arg_parse
|
|
9
14
|
from mineru.utils.config_reader import get_device
|
|
10
15
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
|
|
@@ -13,10 +18,6 @@ from ..version import __version__
|
|
|
13
18
|
from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
14
19
|
|
|
15
20
|
|
|
16
|
-
backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-lmdeploy-engine', 'vlm-http-client']
|
|
17
|
-
if is_mac_os_version_supported():
|
|
18
|
-
backends.append("vlm-mlx-engine")
|
|
19
|
-
|
|
20
21
|
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
|
|
21
22
|
@click.pass_context
|
|
22
23
|
@click.version_option(__version__,
|
|
@@ -50,24 +51,23 @@ if is_mac_os_version_supported():
|
|
|
50
51
|
txt: Use text extraction method.
|
|
51
52
|
ocr: Use OCR method for image-based PDFs.
|
|
52
53
|
Without method specified, 'auto' will be used by default.
|
|
53
|
-
Adapted only for the case where the backend is set to 'pipeline'.""",
|
|
54
|
+
Adapted only for the case where the backend is set to 'pipeline' and 'hybrid-*'.""",
|
|
54
55
|
default='auto',
|
|
55
56
|
)
|
|
56
57
|
@click.option(
|
|
57
58
|
'-b',
|
|
58
59
|
'--backend',
|
|
59
60
|
'backend',
|
|
60
|
-
type=click.Choice(
|
|
61
|
+
type=click.Choice(['pipeline', 'vlm-http-client', 'hybrid-http-client', 'vlm-auto-engine', 'hybrid-auto-engine',]),
|
|
61
62
|
help="""\b
|
|
62
63
|
the backend for parsing pdf:
|
|
63
64
|
pipeline: More general.
|
|
64
|
-
vlm-
|
|
65
|
-
vlm-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
default='pipeline',
|
|
65
|
+
vlm-auto-engine: High accuracy via local computing power.
|
|
66
|
+
vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers).
|
|
67
|
+
hybrid-auto-engine: Next-generation high accuracy solution via local computing power.
|
|
68
|
+
hybrid-http-client: High accuracy but requires a little local computing power(client suitable for openai-compatible servers).
|
|
69
|
+
Without method specified, hybrid-auto-engine will be used by default.""",
|
|
70
|
+
default='hybrid-auto-engine',
|
|
71
71
|
)
|
|
72
72
|
@click.option(
|
|
73
73
|
'-l',
|
|
@@ -78,7 +78,7 @@ if is_mac_os_version_supported():
|
|
|
78
78
|
help="""
|
|
79
79
|
Input the languages in the pdf (if known) to improve OCR accuracy.
|
|
80
80
|
Without languages specified, 'ch' will be used by default.
|
|
81
|
-
Adapted only for the case where the backend is set to
|
|
81
|
+
Adapted only for the case where the backend is set to 'pipeline' and 'hybrid-*'.
|
|
82
82
|
""",
|
|
83
83
|
default='ch',
|
|
84
84
|
)
|
|
@@ -88,7 +88,7 @@ if is_mac_os_version_supported():
|
|
|
88
88
|
'server_url',
|
|
89
89
|
type=str,
|
|
90
90
|
help="""
|
|
91
|
-
When the backend is
|
|
91
|
+
When the backend is `<vlm/hybrid>-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
|
|
92
92
|
""",
|
|
93
93
|
default=None,
|
|
94
94
|
)
|
|
@@ -130,7 +130,7 @@ if is_mac_os_version_supported():
|
|
|
130
130
|
'device_mode',
|
|
131
131
|
type=str,
|
|
132
132
|
help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
|
|
133
|
-
Adapted only for the case where the backend is set to "pipeline"
|
|
133
|
+
Adapted only for the case where the backend is set to "pipeline". """,
|
|
134
134
|
default=None,
|
|
135
135
|
)
|
|
136
136
|
@click.option(
|
mineru/cli/common.py
CHANGED
|
@@ -10,6 +10,7 @@ import pypdfium2 as pdfium
|
|
|
10
10
|
|
|
11
11
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
12
12
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
|
13
|
+
from mineru.utils.engine_utils import get_vlm_engine
|
|
13
14
|
from mineru.utils.enum_class import MakeMode
|
|
14
15
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
|
|
15
16
|
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
@@ -304,6 +305,112 @@ def _process_vlm(
|
|
|
304
305
|
)
|
|
305
306
|
|
|
306
307
|
|
|
308
|
+
def _process_hybrid(
|
|
309
|
+
output_dir,
|
|
310
|
+
pdf_file_names,
|
|
311
|
+
pdf_bytes_list,
|
|
312
|
+
h_lang_list,
|
|
313
|
+
parse_method,
|
|
314
|
+
inline_formula_enable,
|
|
315
|
+
backend,
|
|
316
|
+
f_draw_layout_bbox,
|
|
317
|
+
f_draw_span_bbox,
|
|
318
|
+
f_dump_md,
|
|
319
|
+
f_dump_middle_json,
|
|
320
|
+
f_dump_model_output,
|
|
321
|
+
f_dump_orig_pdf,
|
|
322
|
+
f_dump_content_list,
|
|
323
|
+
f_make_md_mode,
|
|
324
|
+
server_url=None,
|
|
325
|
+
**kwargs,
|
|
326
|
+
):
|
|
327
|
+
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
|
328
|
+
"""同步处理hybrid后端逻辑"""
|
|
329
|
+
if not backend.endswith("client"):
|
|
330
|
+
server_url = None
|
|
331
|
+
|
|
332
|
+
for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
|
|
333
|
+
pdf_file_name = pdf_file_names[idx]
|
|
334
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
|
|
335
|
+
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
|
336
|
+
|
|
337
|
+
middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
|
|
338
|
+
pdf_bytes,
|
|
339
|
+
image_writer=image_writer,
|
|
340
|
+
backend=backend,
|
|
341
|
+
parse_method=parse_method,
|
|
342
|
+
language=lang,
|
|
343
|
+
inline_formula_enable=inline_formula_enable,
|
|
344
|
+
server_url=server_url,
|
|
345
|
+
**kwargs,
|
|
346
|
+
)
|
|
347
|
+
|
|
348
|
+
pdf_info = middle_json["pdf_info"]
|
|
349
|
+
|
|
350
|
+
# f_draw_span_bbox = not _vlm_ocr_enable
|
|
351
|
+
f_draw_span_bbox = False
|
|
352
|
+
|
|
353
|
+
_process_output(
|
|
354
|
+
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
|
355
|
+
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
|
356
|
+
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
|
357
|
+
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
async def _async_process_hybrid(
|
|
362
|
+
output_dir,
|
|
363
|
+
pdf_file_names,
|
|
364
|
+
pdf_bytes_list,
|
|
365
|
+
h_lang_list,
|
|
366
|
+
parse_method,
|
|
367
|
+
inline_formula_enable,
|
|
368
|
+
backend,
|
|
369
|
+
f_draw_layout_bbox,
|
|
370
|
+
f_draw_span_bbox,
|
|
371
|
+
f_dump_md,
|
|
372
|
+
f_dump_middle_json,
|
|
373
|
+
f_dump_model_output,
|
|
374
|
+
f_dump_orig_pdf,
|
|
375
|
+
f_dump_content_list,
|
|
376
|
+
f_make_md_mode,
|
|
377
|
+
server_url=None,
|
|
378
|
+
**kwargs,
|
|
379
|
+
):
|
|
380
|
+
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
|
381
|
+
"""异步处理hybrid后端逻辑"""
|
|
382
|
+
if not backend.endswith("client"):
|
|
383
|
+
server_url = None
|
|
384
|
+
|
|
385
|
+
for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
|
|
386
|
+
pdf_file_name = pdf_file_names[idx]
|
|
387
|
+
local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
|
|
388
|
+
image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
|
|
389
|
+
|
|
390
|
+
middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
|
|
391
|
+
pdf_bytes,
|
|
392
|
+
image_writer=image_writer,
|
|
393
|
+
backend=backend,
|
|
394
|
+
parse_method=parse_method,
|
|
395
|
+
language=lang,
|
|
396
|
+
inline_formula_enable=inline_formula_enable,
|
|
397
|
+
server_url=server_url,
|
|
398
|
+
**kwargs,
|
|
399
|
+
)
|
|
400
|
+
|
|
401
|
+
pdf_info = middle_json["pdf_info"]
|
|
402
|
+
|
|
403
|
+
# f_draw_span_bbox = not _vlm_ocr_enable
|
|
404
|
+
f_draw_span_bbox = False
|
|
405
|
+
|
|
406
|
+
_process_output(
|
|
407
|
+
pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
|
|
408
|
+
md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
|
|
409
|
+
f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
|
|
410
|
+
f_make_md_mode, middle_json, infer_result, is_pipeline=False
|
|
411
|
+
)
|
|
412
|
+
|
|
413
|
+
|
|
307
414
|
def do_parse(
|
|
308
415
|
output_dir,
|
|
309
416
|
pdf_file_names: list[str],
|
|
@@ -340,18 +447,40 @@ def do_parse(
|
|
|
340
447
|
if backend.startswith("vlm-"):
|
|
341
448
|
backend = backend[4:]
|
|
342
449
|
|
|
343
|
-
|
|
344
|
-
|
|
450
|
+
if backend == "vllm-async-engine":
|
|
451
|
+
raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
|
|
345
452
|
|
|
346
|
-
|
|
347
|
-
|
|
453
|
+
if backend == "auto-engine":
|
|
454
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=False)
|
|
348
455
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
456
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
457
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
458
|
+
|
|
459
|
+
_process_vlm(
|
|
460
|
+
output_dir, pdf_file_names, pdf_bytes_list, backend,
|
|
461
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
462
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
463
|
+
server_url, **kwargs,
|
|
464
|
+
)
|
|
465
|
+
elif backend.startswith("hybrid-"):
|
|
466
|
+
backend = backend[7:]
|
|
467
|
+
|
|
468
|
+
if backend == "vllm-async-engine":
|
|
469
|
+
raise Exception(
|
|
470
|
+
"hybrid-vllm-async-engine backend is not supported in sync mode, please use hybrid-vllm-engine backend")
|
|
471
|
+
|
|
472
|
+
if backend == "auto-engine":
|
|
473
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=False)
|
|
474
|
+
|
|
475
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
476
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
|
|
477
|
+
|
|
478
|
+
_process_hybrid(
|
|
479
|
+
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
|
|
480
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
481
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
482
|
+
server_url, **kwargs,
|
|
483
|
+
)
|
|
355
484
|
|
|
356
485
|
|
|
357
486
|
async def aio_do_parse(
|
|
@@ -391,19 +520,39 @@ async def aio_do_parse(
|
|
|
391
520
|
if backend.startswith("vlm-"):
|
|
392
521
|
backend = backend[4:]
|
|
393
522
|
|
|
394
|
-
|
|
395
|
-
|
|
523
|
+
if backend == "vllm-engine":
|
|
524
|
+
raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
|
|
396
525
|
|
|
397
|
-
|
|
398
|
-
|
|
526
|
+
if backend == "auto-engine":
|
|
527
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=True)
|
|
399
528
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
403
|
-
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
404
|
-
server_url, **kwargs,
|
|
405
|
-
)
|
|
529
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
530
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
406
531
|
|
|
532
|
+
await _async_process_vlm(
|
|
533
|
+
output_dir, pdf_file_names, pdf_bytes_list, backend,
|
|
534
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
535
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
536
|
+
server_url, **kwargs,
|
|
537
|
+
)
|
|
538
|
+
elif backend.startswith("hybrid-"):
|
|
539
|
+
backend = backend[7:]
|
|
540
|
+
|
|
541
|
+
if backend == "vllm-engine":
|
|
542
|
+
raise Exception("hybrid-vllm-engine backend is not supported in async mode, please use hybrid-vllm-async-engine backend")
|
|
543
|
+
|
|
544
|
+
if backend == "auto-engine":
|
|
545
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=True)
|
|
546
|
+
|
|
547
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
548
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
|
|
549
|
+
|
|
550
|
+
await _async_process_hybrid(
|
|
551
|
+
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
|
|
552
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
553
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
554
|
+
server_url, **kwargs,
|
|
555
|
+
)
|
|
407
556
|
|
|
408
557
|
|
|
409
558
|
if __name__ == "__main__":
|