mineru 2.5.4-py3-none-any.whl → 2.6.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/pipeline/model_init.py +25 -3
- mineru/backend/pipeline/model_json_to_middle_json.py +2 -2
- mineru/backend/pipeline/model_list.py +0 -1
- mineru/backend/utils.py +24 -0
- mineru/backend/vlm/model_output_to_middle_json.py +2 -2
- mineru/backend/vlm/{custom_logits_processors.py → utils.py} +36 -2
- mineru/backend/vlm/vlm_analyze.py +45 -50
- mineru/backend/vlm/vlm_magic_model.py +155 -1
- mineru/cli/common.py +25 -22
- mineru/cli/fast_api.py +2 -8
- mineru/cli/gradio_app.py +96 -9
- mineru/cli/models_download.py +1 -0
- mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py +152 -0
- mineru/model/mfr/pp_formulanet_plus_m/processors.py +657 -0
- mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py +1 -326
- mineru/model/mfr/utils.py +338 -0
- mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py +103 -16
- mineru/model/table/rec/unet_table/main.py +1 -1
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/operators.py +5 -5
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/__init__.py +2 -1
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_lcnetv3.py +7 -7
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_pphgnetv2.py +2 -2
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/__init__.py +2 -0
- mineru/model/utils/pytorchocr/modeling/heads/rec_ppformulanet_head.py +1383 -0
- mineru/model/utils/pytorchocr/modeling/heads/rec_unimernet_head.py +2631 -0
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/rec_postprocess.py +25 -28
- mineru/model/utils/pytorchocr/utils/__init__.py +0 -0
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/arch_config.yaml +130 -0
- mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_arabic_dict.txt +747 -0
- mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_cyrillic_dict.txt +850 -0
- mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_devanagari_dict.txt +568 -0
- mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_ta_dict.txt +513 -0
- mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_te_dict.txt +540 -0
- mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/models_config.yml +15 -15
- mineru/model/utils/pytorchocr/utils/resources/pp_formulanet_arch_config.yaml +24 -0
- mineru/model/utils/tools/infer/__init__.py +1 -0
- mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_det.py +6 -3
- mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_rec.py +16 -25
- mineru/model/vlm_vllm_model/server.py +7 -2
- mineru/resources/header.html +2 -2
- mineru/utils/enum_class.py +1 -0
- mineru/utils/llm_aided.py +4 -2
- mineru/utils/ocr_utils.py +16 -0
- mineru/utils/table_merge.py +102 -13
- mineru/version.py +1 -1
- {mineru-2.5.4.dist-info → mineru-2.6.1.dist-info}/METADATA +32 -8
- mineru-2.6.1.dist-info/RECORD +195 -0
- mineru-2.5.4.dist-info/RECORD +0 -181
- /mineru/model/{ocr/paddleocr2pytorch/pytorchocr → mfr/pp_formulanet_plus_m}/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch/tools/infer → utils}/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/modeling → utils/pytorchocr}/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/base_ocr_v20.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/utils → utils/pytorchocr/modeling}/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/base_model.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/det_mobilenet_v3.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_donut_swin.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_hgnet.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mv1_enhance.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_svtrnet.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/common.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/cls_head.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/det_db_head.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_ctc_head.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_multi_head.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/db_fpn.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/intracl.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/rnn.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/cls_postprocess.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/db_postprocess.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/arabic_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/cyrillic_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/devanagari_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/en_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/japan_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ka_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/korean_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/latin_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_el_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_en_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_th_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ta_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/te_dict.txt +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/__init__.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_cls.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_system.py +0 -0
- /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/pytorchocr_utility.py +0 -0
- {mineru-2.5.4.dist-info → mineru-2.6.1.dist-info}/WHEEL +0 -0
- {mineru-2.5.4.dist-info → mineru-2.6.1.dist-info}/entry_points.txt +0 -0
- {mineru-2.5.4.dist-info → mineru-2.6.1.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.5.4.dist-info → mineru-2.6.1.dist-info}/top_level.txt +0 -0
mineru/backend/pipeline/model_init.py
CHANGED

@@ -7,6 +7,7 @@ from .model_list import AtomicModel
 from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
 from ...model.mfd.yolo_v8 import YOLOv8MFDModel
 from ...model.mfr.unimernet.Unimernet import UnimernetModel
+from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
 from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
 from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
 from ...model.table.cls.paddle_table_cls import PaddleTableClsModel

@@ -16,6 +17,15 @@ from ...model.table.rec.unet_table.main import UnetTableModel
 from ...utils.enum_class import ModelPath
 from ...utils.models_download_utils import auto_download_and_get_model_root_path

+MFR_MODEL = os.getenv('MINERU_FORMULA_CH_SUPPORT', 'False')
+if MFR_MODEL.lower() in ['true', '1', 'yes']:
+    MFR_MODEL = "pp_formulanet_plus_m"
+elif MFR_MODEL.lower() in ['false', '0', 'no']:
+    MFR_MODEL = "unimernet_small"
+else:
+    logger.warning(f"Invalid MINERU_FORMULA_CH_SUPPORT value: {MFR_MODEL}, set to default 'False'")
+    MFR_MODEL = "unimernet_small"
+

 def img_orientation_cls_model_init():
     atom_model_manager = AtomModelSingleton()

@@ -68,7 +78,13 @@ def mfd_model_init(weight, device='cpu'):


 def mfr_model_init(weight_dir, device='cpu'):
-    mfr_model = UnimernetModel(weight_dir, device)
+    if MFR_MODEL == "unimernet_small":
+        mfr_model = UnimernetModel(weight_dir, device)
+    elif MFR_MODEL == "pp_formulanet_plus_m":
+        mfr_model = FormulaRecognizer(weight_dir, device)
+    else:
+        logger.error('MFR model name not allow')
+        exit(1)
     return mfr_model


@@ -205,11 +221,17 @@ class MineruPipelineModel:
             )

             # Initialize the formula recognition model
-
+            if MFR_MODEL == "unimernet_small":
+                mfr_model_path = ModelPath.unimernet_small
+            elif MFR_MODEL == "pp_formulanet_plus_m":
+                mfr_model_path = ModelPath.pp_formulanet_plus_m
+            else:
+                logger.error('MFR model name not allow')
+                exit(1)

             self.mfr_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.MFR,
-                mfr_weight_dir=
+                mfr_weight_dir=str(os.path.join(auto_download_and_get_model_root_path(mfr_model_path), mfr_model_path)),
                 device=self.device,
             )
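The formula-recognition backend is now chosen once, at import time of model_init.py, from the MINERU_FORMULA_CH_SUPPORT environment variable. A minimal standalone sketch of that selection rule follows; resolve_mfr_model is an illustrative name, not part of the package, and in MinerU itself the variable has to be exported before mineru.backend.pipeline.model_init is imported.

import os


def resolve_mfr_model() -> str:
    # Mirrors the module-level logic added in model_init.py:
    # truthy values select PP-FormulaNet Plus M, falsy or invalid values fall back to UniMERNet small.
    raw = os.getenv("MINERU_FORMULA_CH_SUPPORT", "False")
    if raw.lower() in ("true", "1", "yes"):
        return "pp_formulanet_plus_m"
    if raw.lower() in ("false", "0", "no"):
        return "unimernet_small"
    print(f"Invalid MINERU_FORMULA_CH_SUPPORT value: {raw!r}, falling back to unimernet_small")
    return "unimernet_small"


if __name__ == "__main__":
    os.environ["MINERU_FORMULA_CH_SUPPORT"] = "1"  # e.g. `export MINERU_FORMULA_CH_SUPPORT=1` in the shell
    print(resolve_mfr_model())  # -> pp_formulanet_plus_m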
mineru/backend/pipeline/model_json_to_middle_json.py
CHANGED

@@ -5,6 +5,7 @@ import time
 from loguru import logger
 from tqdm import tqdm

+from mineru.backend.utils import cross_page_table_merge
 from mineru.utils.config_reader import get_device, get_llm_aided_config, get_formula_enable
 from mineru.backend.pipeline.model_init import AtomModelSingleton
 from mineru.backend.pipeline.para_split import para_split

@@ -20,7 +21,6 @@ from mineru.utils.ocr_utils import OcrConfidence
 from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
 from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
     remove_overlaps_min_spans, txt_spans_extract
-from mineru.utils.table_merge import merge_table
 from mineru.version import __version__
 from mineru.utils.hash_utils import bytes_md5

@@ -231,7 +231,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
     para_split(middle_json["pdf_info"])

     """Cross-page table merging"""
-
+    cross_page_table_merge(middle_json["pdf_info"])

     """LLM optimization"""
     llm_aided_config = get_llm_aided_config()
mineru/backend/utils.py
ADDED
@@ -0,0 +1,24 @@
+import os
+
+from loguru import logger
+
+from mineru.utils.table_merge import merge_table
+
+
+def cross_page_table_merge(pdf_info: list[dict]):
+    """Merge tables that span across multiple pages in a PDF document.
+
+    Args:
+        pdf_info (list[dict]): A list of dictionaries containing information about each page in the PDF.
+
+    Returns:
+        None
+    """
+    is_merge_table = os.getenv('MINERU_TABLE_MERGE_ENABLE', 'true')
+    if is_merge_table.lower() in ['true', '1', 'yes']:
+        merge_table(pdf_info)
+    elif is_merge_table.lower() in ['false', '0', 'no']:
+        pass
+    else:
+        logger.warning(f'unknown MINERU_TABLE_MERGE_ENABLE config: {is_merge_table}, pass')
+        pass
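Both the pipeline and VLM backends now route cross-page table merging through this helper, so the feature can be switched per run with MINERU_TABLE_MERGE_ENABLE (default 'true'). A hedged usage sketch, assuming the wheel is installed; pdf_info would normally be the middle_json["pdf_info"] list produced by the backend, not an empty placeholder.

import os

# Any of "false", "0" or "no" disables the merge; unknown values only trigger a warning.
os.environ["MINERU_TABLE_MERGE_ENABLE"] = "false"

from mineru.backend.utils import cross_page_table_merge

pdf_info: list[dict] = []          # placeholder; normally middle_json["pdf_info"]
cross_page_table_merge(pdf_info)   # no-op here because merging is disabled above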
mineru/backend/vlm/model_output_to_middle_json.py
CHANGED

@@ -5,13 +5,13 @@ import cv2
 import numpy as np
 from loguru import logger

+from mineru.backend.utils import cross_page_table_merge
 from mineru.backend.vlm.vlm_magic_model import MagicModel
 from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
 from mineru.utils.cut_image import cut_image_and_table
 from mineru.utils.enum_class import ContentType
 from mineru.utils.hash_utils import bytes_md5
 from mineru.utils.pdf_image_tools import get_crop_img
-from mineru.utils.table_merge import merge_table
 from mineru.version import __version__


@@ -110,7 +110,7 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_
     """Cross-page table merging"""
     table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
     if table_enable:
-
+        cross_page_table_merge(middle_json["pdf_info"])

     """LLM heading-level optimization"""
     if heading_level_import_success:
mineru/backend/vlm/{custom_logits_processors.py → utils.py}
CHANGED

@@ -3,8 +3,11 @@ import os
 from loguru import logger
 from packaging import version

+from mineru.utils.config_reader import get_device
+from mineru.utils.model_utils import get_vram

-def enable_custom_logits_processors():
+
+def enable_custom_logits_processors() -> bool:
     import torch
     from vllm import __version__ as vllm_version

@@ -38,4 +41,35 @@ def enable_custom_logits_processors():
         return False
     else:
         logger.info(f"compute_capability: {compute_capability} >= 8.0 and vllm version: {vllm_version} >= 0.10.1, enable custom_logits_processors")
-        return True
+        return True
+
+
+def set_default_gpu_memory_utilization() -> float:
+    from vllm import __version__ as vllm_version
+    if version.parse(vllm_version) >= version.parse("0.11.0"):
+        return 0.7
+    else:
+        return 0.5
+
+
+def set_default_batch_size() -> int:
+    try:
+        device = get_device()
+        vram = get_vram(device)
+        if vram is not None:
+            gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
+            if gpu_memory >= 16:
+                batch_size = 8
+            elif gpu_memory >= 8:
+                batch_size = 4
+            else:
+                batch_size = 1
+            logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
+        else:
+            # Default batch_ratio when VRAM can't be determined
+            batch_size = 1
+            logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
+    except Exception as e:
+        logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
+        batch_size = 1
+    return batch_size
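The new defaults are plain functions of the installed vllm version and the detected VRAM, so they can be inspected directly. A sketch assuming the VLM extras (torch and vllm) are installed; otherwise the imports inside these helpers fail.

from mineru.backend.vlm.utils import (
    enable_custom_logits_processors,
    set_default_batch_size,
    set_default_gpu_memory_utilization,
)

# 0.7 with vllm >= 0.11.0, 0.5 with older versions.
print("gpu_memory_utilization:", set_default_gpu_memory_utilization())

# 8 for >= 16 GB of VRAM, 4 for >= 8 GB, otherwise 1;
# MINERU_VIRTUAL_VRAM_SIZE overrides the detected value.
print("batch_size:", set_default_batch_size())

# True only for compute capability >= 8.0 and vllm >= 0.10.1.
print("custom logits processors:", enable_custom_logits_processors())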
mineru/backend/vlm/vlm_analyze.py
CHANGED

@@ -4,14 +4,13 @@ import time

 from loguru import logger

-from .
+from .utils import enable_custom_logits_processors, set_default_gpu_memory_utilization, set_default_batch_size
 from .model_output_to_middle_json import result_to_middle_json
 from ...data.data_reader_writer import DataWriter
 from mineru.utils.pdf_image_tools import load_images_from_pdf
 from ...utils.config_reader import get_device

 from ...utils.enum_class import ImageType
-from ...utils.model_utils import get_vram
 from ...utils.models_download_utils import auto_download_and_get_model_root_path

 from mineru_vl_utils import MinerUClient

@@ -41,7 +40,13 @@ class ModelSingleton:
         processor = None
         vllm_llm = None
         vllm_async_llm = None
-        batch_size = 0
+        batch_size = kwargs.get("batch_size", 0)  # for transformers backend only
+        max_concurrency = kwargs.get("max_concurrency", 100)  # for http-client backend only
+        http_timeout = kwargs.get("http_timeout", 600)  # for http-client backend only
+        # Remove these parameters from kwargs so they are not passed to unrelated init functions
+        for param in ["batch_size", "max_concurrency", "http_timeout"]:
+            if param in kwargs:
+                del kwargs[param]
         if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
             model_path = auto_download_and_get_model_root_path("/","vlm")
         if backend == "transformers":

@@ -68,53 +73,41 @@ class ModelSingleton:
                     model_path,
                     use_fast=True,
                 )
-… (old lines 71-105 are not shown in the source diff)
-                from vllm.v1.engine.async_llm import AsyncLLM
-                from mineru_vl_utils import MinerULogitsProcessor
-            except ImportError:
-                raise ImportError("Please install vllm to use the vllm-async-engine backend.")
-            if "gpu_memory_utilization" not in kwargs:
-                kwargs["gpu_memory_utilization"] = 0.5
-            if "model" not in kwargs:
-                kwargs["model"] = model_path
-            if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
-                kwargs["logits_processors"] = [MinerULogitsProcessor]
-            # Use kwargs as the vllm initialization parameters
-            vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
+                if batch_size == 0:
+                    batch_size = set_default_batch_size()
+        else:
+            if os.getenv('OMP_NUM_THREADS') is None:
+                os.environ["OMP_NUM_THREADS"] = "1"
+
+            if backend == "vllm-engine":
+                try:
+                    import vllm
+                    from mineru_vl_utils import MinerULogitsProcessor
+                except ImportError:
+                    raise ImportError("Please install vllm to use the vllm-engine backend.")
+                if "gpu_memory_utilization" not in kwargs:
+                    kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
+                if "model" not in kwargs:
+                    kwargs["model"] = model_path
+                if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
+                    kwargs["logits_processors"] = [MinerULogitsProcessor]
+                # Use kwargs as the vllm initialization parameters
+                vllm_llm = vllm.LLM(**kwargs)
+            elif backend == "vllm-async-engine":
+                try:
+                    from vllm.engine.arg_utils import AsyncEngineArgs
+                    from vllm.v1.engine.async_llm import AsyncLLM
+                    from mineru_vl_utils import MinerULogitsProcessor
+                except ImportError:
+                    raise ImportError("Please install vllm to use the vllm-async-engine backend.")
+                if "gpu_memory_utilization" not in kwargs:
+                    kwargs["gpu_memory_utilization"] = set_default_gpu_memory_utilization()
+                if "model" not in kwargs:
+                    kwargs["model"] = model_path
+                if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
+                    kwargs["logits_processors"] = [MinerULogitsProcessor]
+                # Use kwargs as the vllm initialization parameters
+                vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
             self._models[key] = MinerUClient(
                 backend=backend,
                 model=model,

@@ -123,6 +116,8 @@ class ModelSingleton:
                 vllm_async_llm=vllm_async_llm,
                 server_url=server_url,
                 batch_size=batch_size,
+                max_concurrency=max_concurrency,
+                http_timeout=http_timeout,
             )
         elapsed = round(time.time() - start_time, 2)
         logger.info(f"get {backend} predictor cost: {elapsed}s")
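The singleton now peels batch_size, max_concurrency and http_timeout off kwargs before anything backend-specific runs, so the remaining keyword arguments can be forwarded to the vllm engine untouched. A standalone sketch of that splitting pattern; build_client is an illustrative name, not a MinerU API.

def build_client(backend: str, **kwargs):
    # Read the client-level knobs first (same defaults as in the diff above)...
    batch_size = kwargs.get("batch_size", 0)               # transformers backend only
    max_concurrency = kwargs.get("max_concurrency", 100)   # http-client backend only
    http_timeout = kwargs.get("http_timeout", 600)         # http-client backend only
    # ...then drop them so the rest of kwargs can go straight to the engine constructor.
    for param in ("batch_size", "max_concurrency", "http_timeout"):
        kwargs.pop(param, None)
    return backend, batch_size, max_concurrency, http_timeout, kwargs


print(build_client("http-client", max_concurrency=32, http_timeout=120, temperature=0.0))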
mineru/backend/vlm/vlm_magic_model.py
CHANGED

@@ -361,7 +361,7 @@ def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
     return ret


-def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
+def fix_two_layer_blocks_back(blocks, fix_type: Literal["image", "table", "code"]):
     need_fix_blocks = get_type_blocks(blocks, fix_type)
     fixed_blocks = []
     not_include_blocks = []

@@ -404,6 +404,160 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
     return fixed_blocks, not_include_blocks


+def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
+    need_fix_blocks = get_type_blocks(blocks, fix_type)
+    fixed_blocks = []
+    not_include_blocks = []
+    processed_indices = set()
+
+    # Special handling for tables: captions must come before the table body and footnotes after it
+    if fix_type == "table":
+        # Collect all misplaced captions and footnotes
+        misplaced_captions = []  # stores (caption, original block index)
+        misplaced_footnotes = []  # stores (footnote, original block index)
+
+        # Step 1: remove captions and footnotes that violate the position requirement
+        for block_idx, block in enumerate(need_fix_blocks):
+            body = block[f"{fix_type}_body"]
+            body_index = body["index"]
+
+            # A caption must come before the body or share its index
+            valid_captions = []
+            for caption in block[f"{fix_type}_caption_list"]:
+                if caption["index"] <= body_index:
+                    valid_captions.append(caption)
+                else:
+                    misplaced_captions.append((caption, block_idx))
+            block[f"{fix_type}_caption_list"] = valid_captions
+
+            # A footnote must come after the body or share its index
+            valid_footnotes = []
+            for footnote in block[f"{fix_type}_footnote_list"]:
+                if footnote["index"] >= body_index:
+                    valid_footnotes.append(footnote)
+                else:
+                    misplaced_footnotes.append((footnote, block_idx))
+            block[f"{fix_type}_footnote_list"] = valid_footnotes
+
+        # Step 2: reassign misplaced captions to a suitable body
+        for caption, original_block_idx in misplaced_captions:
+            caption_index = caption["index"]
+            best_block_idx = None
+            min_distance = float('inf')
+
+            # Find the nearest body whose index is >= caption_index
+            for idx, block in enumerate(need_fix_blocks):
+                body_index = block[f"{fix_type}_body"]["index"]
+                if body_index >= caption_index and idx != original_block_idx:
+                    distance = body_index - caption_index
+                    if distance < min_distance:
+                        min_distance = distance
+                        best_block_idx = idx
+
+            if best_block_idx is not None:
+                # Suitable body found: append to that block's caption_list
+                need_fix_blocks[best_block_idx][f"{fix_type}_caption_list"].append(caption)
+            else:
+                # No suitable body found: treat it as an ordinary block
+                not_include_blocks.append(caption)
+
+        # Step 3: reassign misplaced footnotes to a suitable body
+        for footnote, original_block_idx in misplaced_footnotes:
+            footnote_index = footnote["index"]
+            best_block_idx = None
+            min_distance = float('inf')
+
+            # Find the nearest body whose index is <= footnote_index
+            for idx, block in enumerate(need_fix_blocks):
+                body_index = block[f"{fix_type}_body"]["index"]
+                if body_index <= footnote_index and idx != original_block_idx:
+                    distance = footnote_index - body_index
+                    if distance < min_distance:
+                        min_distance = distance
+                        best_block_idx = idx
+
+            if best_block_idx is not None:
+                # Suitable body found: append to that block's footnote_list
+                need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
+            else:
+                # No suitable body found: treat it as an ordinary block
+                not_include_blocks.append(footnote)
+
+        # Step 4: pull entries with non-contiguous indexes out of each block's caption_list and footnote_list and treat them as ordinary blocks
+        for block in need_fix_blocks:
+            caption_list = block[f"{fix_type}_caption_list"]
+            footnote_list = block[f"{fix_type}_footnote_list"]
+            body_index = block[f"{fix_type}_body"]["index"]
+
+            # Handle caption_list (looking backwards from the body; captions precede the body)
+            if caption_list:
+                # Sort by index in descending order, checking from the caption closest to the body
+                caption_list.sort(key=lambda x: x["index"], reverse=True)
+                filtered_captions = [caption_list[0]]
+                for i in range(1, len(caption_list)):
+                    # Check continuity with the previous caption (descending order, hence -1)
+                    if caption_list[i]["index"] == caption_list[i - 1]["index"] - 1:
+                        filtered_captions.append(caption_list[i])
+                    else:
+                        # Gap found: all remaining captions become ordinary blocks
+                        not_include_blocks.extend(caption_list[i:])
+                        break
+                # Restore ascending order
+                filtered_captions.reverse()
+                block[f"{fix_type}_caption_list"] = filtered_captions
+
+            # Handle footnote_list (looking forwards from the body; footnotes follow the body)
+            if footnote_list:
+                # Sort by index in ascending order, checking from the footnote closest to the body
+                footnote_list.sort(key=lambda x: x["index"])
+                filtered_footnotes = [footnote_list[0]]
+                for i in range(1, len(footnote_list)):
+                    # Check continuity with the previous footnote
+                    if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
+                        filtered_footnotes.append(footnote_list[i])
+                    else:
+                        # Gap found: all remaining footnotes become ordinary blocks
+                        not_include_blocks.extend(footnote_list[i:])
+                        break
+                block[f"{fix_type}_footnote_list"] = filtered_footnotes
+
+    # Build the two-layer block structure
+    for block in need_fix_blocks:
+        body = block[f"{fix_type}_body"]
+        caption_list = block[f"{fix_type}_caption_list"]
+        footnote_list = block[f"{fix_type}_footnote_list"]
+
+        body["type"] = f"{fix_type}_body"
+        for caption in caption_list:
+            caption["type"] = f"{fix_type}_caption"
+            processed_indices.add(caption["index"])
+        for footnote in footnote_list:
+            footnote["type"] = f"{fix_type}_footnote"
+            processed_indices.add(footnote["index"])
+
+        processed_indices.add(body["index"])
+
+        two_layer_block = {
+            "type": fix_type,
+            "bbox": body["bbox"],
+            "blocks": [body],
+            "index": body["index"],
+        }
+        two_layer_block["blocks"].extend([*caption_list, *footnote_list])
+        # Sort the sub-blocks by index
+        two_layer_block["blocks"].sort(key=lambda x: x["index"])
+
+        fixed_blocks.append(two_layer_block)
+
+    # Add the unprocessed blocks
+    for block in blocks:
+        block.pop("type", None)
+        if block["index"] not in processed_indices and block not in not_include_blocks:
+            not_include_blocks.append(block)
+
+    return fixed_blocks, not_include_blocks
+
+
 def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
     for list_block in list_blocks:
         list_block["blocks"] = []
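The heart of the new fix_two_layer_blocks is the continuity rule in step 4: starting from the caption or footnote closest to the body, any entry whose index breaks the consecutive run is demoted to an ordinary block. A standalone sketch of just that rule; take_contiguous is an illustrative helper, not part of the module.

def take_contiguous(items: list[dict], *, descending: bool) -> tuple[list[dict], list[dict]]:
    """Keep the run of consecutive 'index' values closest to the body; return (kept, demoted)."""
    if not items:
        return [], []
    step = -1 if descending else 1
    items = sorted(items, key=lambda x: x["index"], reverse=descending)
    kept = [items[0]]
    for i in range(1, len(items)):
        if items[i]["index"] == items[i - 1]["index"] + step:
            kept.append(items[i])
        else:
            return kept, items[i:]  # gap: everything after it becomes an ordinary block
    return kept, []


# Captions precede a table body at index 10; index 5 is not contiguous with 8-9, so it is demoted.
captions = [{"index": 9}, {"index": 5}, {"index": 8}]
kept, demoted = take_contiguous(captions, descending=True)
print(kept)     # [{'index': 9}, {'index': 8}]
print(demoted)  # [{'index': 5}]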
mineru/cli/common.py
CHANGED
@@ -18,7 +18,7 @@ from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
 from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze

 pdf_suffixes = ["pdf"]
-image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
+image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]


 def read_fn(path):

@@ -44,34 +44,37 @@ def prepare_env(output_dir, pdf_file_name, parse_method):


 def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
+    try:
+        # Load the PDF from the byte data
+        pdf = pdfium.PdfDocument(pdf_bytes)

-… (5 old lines not shown in the source diff)
-    if end_page_id > len(pdf) - 1:
-        logger.warning("end_page_id is out of range, use pdf_docs length")
-        end_page_id = len(pdf) - 1
+        # Determine the end page
+        end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
+        if end_page_id > len(pdf) - 1:
+            logger.warning("end_page_id is out of range, use pdf_docs length")
+            end_page_id = len(pdf) - 1

-… (2 old lines not shown)
+        # Create a new PDF document
+        output_pdf = pdfium.PdfDocument.new()

-… (2 old lines not shown)
+        # Select the page indices to import
+        page_indices = list(range(start_page_id, end_page_id + 1))

-… (2 old lines not shown)
+        # Import the selected pages from the original PDF into the new one
+        output_pdf.import_pages(pdf, page_indices)

-… (3 old lines not shown)
+        # Save the new PDF to an in-memory buffer
+        output_buffer = io.BytesIO()
+        output_pdf.save(output_buffer)

-… (2 old lines not shown)
+        # Get the byte data
+        output_bytes = output_buffer.getvalue()

-… (2 old lines not shown)
+        pdf.close()  # Close the original PDF document to release resources
+        output_pdf.close()  # Close the new PDF document to release resources
+    except Exception as e:
+        logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
+        output_bytes = pdf_bytes

     return output_bytes

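The rewritten convert_pdf_bytes_to_bytes_by_pypdfium2 keeps the same pypdfium2 page-extraction flow but now falls back to the original bytes if anything fails. A simplified sketch of that flow using only the calls visible above (the fallback and the out-of-range warning are omitted; slice_pdf and sample.pdf are illustrative, not part of the package).

import io

import pypdfium2 as pdfium


def slice_pdf(pdf_bytes: bytes, start_page_id: int = 0, end_page_id=None) -> bytes:
    pdf = pdfium.PdfDocument(pdf_bytes)                      # load the PDF from bytes
    end = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
    end = min(end, len(pdf) - 1)                             # clamp instead of warning
    output_pdf = pdfium.PdfDocument.new()                    # create an empty target document
    output_pdf.import_pages(pdf, list(range(start_page_id, end + 1)))
    buffer = io.BytesIO()
    output_pdf.save(buffer)                                  # serialize to an in-memory buffer
    pdf.close()
    output_pdf.close()
    return buffer.getvalue()


# Usage (placeholder path): keep only the first three pages.
# with open("sample.pdf", "rb") as f:
#     first_three = slice_pdf(f.read(), 0, 2)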
mineru/cli/fast_api.py
CHANGED
@@ -177,10 +177,7 @@ async def parse_pdf(
                 zf.write(path, arcname=os.path.join(safe_pdf_name, f"{safe_pdf_name}_middle.json"))

             if return_model_output:
-… (1 old line not shown in the source diff)
-                path = os.path.join(parse_dir, f"{pdf_name}_model.json")
-            else:
-                path = os.path.join(parse_dir, f"{pdf_name}_model_output.txt")
+                path = os.path.join(parse_dir, f"{pdf_name}_model.json")
             if os.path.exists(path):
                 zf.write(path, arcname=os.path.join(safe_pdf_name, os.path.basename(path)))


@@ -220,10 +217,7 @@ async def parse_pdf(
             if return_middle_json:
                 data["middle_json"] = get_infer_result("_middle.json", pdf_name, parse_dir)
             if return_model_output:
-… (1 old line not shown in the source diff)
-                data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
-            else:
-                data["model_output"] = get_infer_result("_model_output.txt", pdf_name, parse_dir)
+                data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
             if return_content_list:
                 data["content_list"] = get_infer_result("_content_list.json", pdf_name, parse_dir)
             if return_images: