mineru 2.5.3__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. mineru/backend/pipeline/model_init.py +25 -3
  2. mineru/backend/pipeline/model_json_to_middle_json.py +2 -2
  3. mineru/backend/pipeline/model_list.py +0 -1
  4. mineru/backend/utils.py +24 -0
  5. mineru/backend/vlm/model_output_to_middle_json.py +2 -2
  6. mineru/backend/vlm/{custom_logits_processors.py → utils.py} +36 -2
  7. mineru/backend/vlm/vlm_analyze.py +43 -50
  8. mineru/backend/vlm/vlm_magic_model.py +155 -1
  9. mineru/cli/common.py +26 -23
  10. mineru/cli/fast_api.py +2 -8
  11. mineru/cli/gradio_app.py +104 -13
  12. mineru/cli/models_download.py +1 -0
  13. mineru/model/mfr/pp_formulanet_plus_m/predict_formula.py +152 -0
  14. mineru/model/mfr/pp_formulanet_plus_m/processors.py +657 -0
  15. mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py +1 -326
  16. mineru/model/mfr/utils.py +338 -0
  17. mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py +103 -16
  18. mineru/model/table/rec/unet_table/main.py +1 -1
  19. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/operators.py +5 -5
  20. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/__init__.py +2 -1
  21. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_lcnetv3.py +7 -7
  22. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_pphgnetv2.py +2 -2
  23. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/__init__.py +2 -0
  24. mineru/model/utils/pytorchocr/modeling/heads/rec_ppformulanet_head.py +1383 -0
  25. mineru/model/utils/pytorchocr/modeling/heads/rec_unimernet_head.py +2631 -0
  26. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/rec_postprocess.py +25 -28
  27. mineru/model/utils/pytorchocr/utils/__init__.py +0 -0
  28. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/arch_config.yaml +130 -0
  29. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_arabic_dict.txt +747 -0
  30. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_cyrillic_dict.txt +850 -0
  31. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_devanagari_dict.txt +568 -0
  32. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_ta_dict.txt +513 -0
  33. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_te_dict.txt +540 -0
  34. mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/models_config.yml +15 -15
  35. mineru/model/utils/pytorchocr/utils/resources/pp_formulanet_arch_config.yaml +24 -0
  36. mineru/model/utils/tools/infer/__init__.py +1 -0
  37. mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_det.py +6 -3
  38. mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_rec.py +16 -25
  39. mineru/model/vlm_vllm_model/server.py +4 -1
  40. mineru/resources/header.html +2 -2
  41. mineru/utils/enum_class.py +1 -0
  42. mineru/utils/guess_suffix_or_lang.py +9 -1
  43. mineru/utils/llm_aided.py +4 -2
  44. mineru/utils/ocr_utils.py +16 -0
  45. mineru/utils/table_merge.py +102 -13
  46. mineru/version.py +1 -1
  47. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/METADATA +33 -6
  48. mineru-2.6.0.dist-info/RECORD +195 -0
  49. mineru-2.5.3.dist-info/RECORD +0 -181
  50. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr → mfr/pp_formulanet_plus_m}/__init__.py +0 -0
  51. /mineru/model/{ocr/paddleocr2pytorch/tools/infer → utils}/__init__.py +0 -0
  52. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/modeling → utils/pytorchocr}/__init__.py +0 -0
  53. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/base_ocr_v20.py +0 -0
  54. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/__init__.py +0 -0
  55. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/data/imaug/__init__.py +0 -0
  56. /mineru/model/{ocr/paddleocr2pytorch/pytorchocr/utils → utils/pytorchocr/modeling}/__init__.py +0 -0
  57. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/__init__.py +0 -0
  58. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/architectures/base_model.py +0 -0
  59. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/det_mobilenet_v3.py +0 -0
  60. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_donut_swin.py +0 -0
  61. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_hgnet.py +0 -0
  62. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +0 -0
  63. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_mv1_enhance.py +0 -0
  64. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/backbones/rec_svtrnet.py +0 -0
  65. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/common.py +0 -0
  66. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/cls_head.py +0 -0
  67. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/det_db_head.py +0 -0
  68. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_ctc_head.py +0 -0
  69. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/heads/rec_multi_head.py +0 -0
  70. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/__init__.py +0 -0
  71. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/db_fpn.py +0 -0
  72. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/intracl.py +0 -0
  73. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/modeling/necks/rnn.py +0 -0
  74. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/__init__.py +0 -0
  75. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/cls_postprocess.py +0 -0
  76. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/postprocess/db_postprocess.py +0 -0
  77. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/arabic_dict.txt +0 -0
  78. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +0 -0
  79. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/cyrillic_dict.txt +0 -0
  80. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/devanagari_dict.txt +0 -0
  81. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/en_dict.txt +0 -0
  82. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/japan_dict.txt +0 -0
  83. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ka_dict.txt +0 -0
  84. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/korean_dict.txt +0 -0
  85. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/latin_dict.txt +0 -0
  86. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +0 -0
  87. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt +0 -0
  88. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_dict.txt +0 -0
  89. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_el_dict.txt +0 -0
  90. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_en_dict.txt +0 -0
  91. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt +0 -0
  92. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt +0 -0
  93. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt +0 -0
  94. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ppocrv5_th_dict.txt +0 -0
  95. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/ta_dict.txt +0 -0
  96. /mineru/model/{ocr/paddleocr2pytorch → utils}/pytorchocr/utils/resources/dict/te_dict.txt +0 -0
  97. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/__init__.py +0 -0
  98. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_cls.py +0 -0
  99. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/predict_system.py +0 -0
  100. /mineru/model/{ocr/paddleocr2pytorch → utils}/tools/infer/pytorchocr_utility.py +0 -0
  101. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/WHEEL +0 -0
  102. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/entry_points.txt +0 -0
  103. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/licenses/LICENSE.md +0 -0
  104. {mineru-2.5.3.dist-info → mineru-2.6.0.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ from .model_list import AtomicModel
7
7
  from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
8
8
  from ...model.mfd.yolo_v8 import YOLOv8MFDModel
9
9
  from ...model.mfr.unimernet.Unimernet import UnimernetModel
10
+ from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
10
11
  from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
11
12
  from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
12
13
  from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
@@ -16,6 +17,15 @@ from ...model.table.rec.unet_table.main import UnetTableModel
16
17
  from ...utils.enum_class import ModelPath
17
18
  from ...utils.models_download_utils import auto_download_and_get_model_root_path
18
19
 
20
+ MFR_MODEL = os.getenv('MINERU_FORMULA_CH_SUPPORT', 'False')
21
+ if MFR_MODEL.lower() in ['true', '1', 'yes']:
22
+ MFR_MODEL = "pp_formulanet_plus_m"
23
+ elif MFR_MODEL.lower() in ['false', '0', 'no']:
24
+ MFR_MODEL = "unimernet_small"
25
+ else:
26
+ logger.warning(f"Invalid MINERU_FORMULA_CH_SUPPORT value: {MFR_MODEL}, set to default 'False'")
27
+ MFR_MODEL = "unimernet_small"
28
+
19
29
 
20
30
  def img_orientation_cls_model_init():
21
31
  atom_model_manager = AtomModelSingleton()
@@ -68,7 +78,13 @@ def mfd_model_init(weight, device='cpu'):
68
78
 
69
79
 
70
80
  def mfr_model_init(weight_dir, device='cpu'):
71
- mfr_model = UnimernetModel(weight_dir, device)
81
+ if MFR_MODEL == "unimernet_small":
82
+ mfr_model = UnimernetModel(weight_dir, device)
83
+ elif MFR_MODEL == "pp_formulanet_plus_m":
84
+ mfr_model = FormulaRecognizer(weight_dir, device)
85
+ else:
86
+ logger.error('MFR model name not allow')
87
+ exit(1)
72
88
  return mfr_model
73
89
 
74
90
 
@@ -205,11 +221,17 @@ class MineruPipelineModel:
205
221
  )
206
222
 
207
223
  # 初始化公式解析模型
208
- mfr_weight_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.unimernet_small), ModelPath.unimernet_small)
224
+ if MFR_MODEL == "unimernet_small":
225
+ mfr_model_path = ModelPath.unimernet_small
226
+ elif MFR_MODEL == "pp_formulanet_plus_m":
227
+ mfr_model_path = ModelPath.pp_formulanet_plus_m
228
+ else:
229
+ logger.error('MFR model name not allow')
230
+ exit(1)
209
231
 
210
232
  self.mfr_model = atom_model_manager.get_atom_model(
211
233
  atom_model_name=AtomicModel.MFR,
212
- mfr_weight_dir=mfr_weight_dir,
234
+ mfr_weight_dir=str(os.path.join(auto_download_and_get_model_root_path(mfr_model_path), mfr_model_path)),
213
235
  device=self.device,
214
236
  )
215
237
 
@@ -5,6 +5,7 @@ import time
5
5
  from loguru import logger
6
6
  from tqdm import tqdm
7
7
 
8
+ from mineru.backend.utils import cross_page_table_merge
8
9
  from mineru.utils.config_reader import get_device, get_llm_aided_config, get_formula_enable
9
10
  from mineru.backend.pipeline.model_init import AtomModelSingleton
10
11
  from mineru.backend.pipeline.para_split import para_split
@@ -20,7 +21,6 @@ from mineru.utils.ocr_utils import OcrConfidence
20
21
  from mineru.utils.span_block_fix import fill_spans_in_blocks, fix_discarded_block, fix_block_spans
21
22
  from mineru.utils.span_pre_proc import remove_outside_spans, remove_overlaps_low_confidence_spans, \
22
23
  remove_overlaps_min_spans, txt_spans_extract
23
- from mineru.utils.table_merge import merge_table
24
24
  from mineru.version import __version__
25
25
  from mineru.utils.hash_utils import bytes_md5
26
26
 
@@ -231,7 +231,7 @@ def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=N
231
231
  para_split(middle_json["pdf_info"])
232
232
 
233
233
  """表格跨页合并"""
234
- merge_table(middle_json["pdf_info"])
234
+ cross_page_table_merge(middle_json["pdf_info"])
235
235
 
236
236
  """llm优化"""
237
237
  llm_aided_config = get_llm_aided_config()
@@ -7,4 +7,3 @@ class AtomicModel:
7
7
  WiredTable = "wired_table"
8
8
  TableCls = "table_cls"
9
9
  ImgOrientationCls = "img_ori_cls"
10
-
@@ -0,0 +1,24 @@
1
+ import os
2
+
3
+ from loguru import logger
4
+
5
+ from mineru.utils.table_merge import merge_table
6
+
7
+
8
+ def cross_page_table_merge(pdf_info: list[dict]):
9
+ """Merge tables that span across multiple pages in a PDF document.
10
+
11
+ Args:
12
+ pdf_info (list[dict]): A list of dictionaries containing information about each page in the PDF.
13
+
14
+ Returns:
15
+ None
16
+ """
17
+ is_merge_table = os.getenv('MINERU_TABLE_MERGE_ENABLE', 'true')
18
+ if is_merge_table.lower() in ['true', '1', 'yes']:
19
+ merge_table(pdf_info)
20
+ elif is_merge_table.lower() in ['false', '0', 'no']:
21
+ pass
22
+ else:
23
+ logger.warning(f'unknown MINERU_TABLE_MERGE_ENABLE config: {is_merge_table}, pass')
24
+ pass
@@ -5,13 +5,13 @@ import cv2
5
5
  import numpy as np
6
6
  from loguru import logger
7
7
 
8
+ from mineru.backend.utils import cross_page_table_merge
8
9
  from mineru.backend.vlm.vlm_magic_model import MagicModel
9
10
  from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
10
11
  from mineru.utils.cut_image import cut_image_and_table
11
12
  from mineru.utils.enum_class import ContentType
12
13
  from mineru.utils.hash_utils import bytes_md5
13
14
  from mineru.utils.pdf_image_tools import get_crop_img
14
- from mineru.utils.table_merge import merge_table
15
15
  from mineru.version import __version__
16
16
 
17
17
 
@@ -110,7 +110,7 @@ def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_
110
110
  """表格跨页合并"""
111
111
  table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
112
112
  if table_enable:
113
- merge_table(middle_json["pdf_info"])
113
+ cross_page_table_merge(middle_json["pdf_info"])
114
114
 
115
115
  """llm优化标题分级"""
116
116
  if heading_level_import_success:
@@ -3,8 +3,11 @@ import os
3
3
  from loguru import logger
4
4
  from packaging import version
5
5
 
6
+ from mineru.utils.config_reader import get_device
7
+ from mineru.utils.model_utils import get_vram
6
8
 
7
- def enable_custom_logits_processors():
9
+
10
+ def enable_custom_logits_processors() -> bool:
8
11
  import torch
9
12
  from vllm import __version__ as vllm_version
10
13
 
@@ -38,4 +41,35 @@ def enable_custom_logits_processors():
38
41
  return False
39
42
  else:
40
43
  logger.info(f"compute_capability: {compute_capability} >= 8.0 and vllm version: {vllm_version} >= 0.10.1, enable custom_logits_processors")
41
- return True
44
+ return True
45
+
46
+
47
+ def set_defult_gpu_memory_utilization() -> float:
48
+ from vllm import __version__ as vllm_version
49
+ if version.parse(vllm_version) >= version.parse("0.11.0"):
50
+ return 0.7
51
+ else:
52
+ return 0.5
53
+
54
+
55
+ def set_defult_batch_size() -> int:
56
+ try:
57
+ device = get_device()
58
+ vram = get_vram(device)
59
+ if vram is not None:
60
+ gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
61
+ if gpu_memory >= 16:
62
+ batch_size = 8
63
+ elif gpu_memory >= 8:
64
+ batch_size = 4
65
+ else:
66
+ batch_size = 1
67
+ logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
68
+ else:
69
+ # Default batch_ratio when VRAM can't be determined
70
+ batch_size = 1
71
+ logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
72
+ except Exception as e:
73
+ logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
74
+ batch_size = 1
75
+ return batch_size
@@ -4,14 +4,13 @@ import time
4
4
 
5
5
  from loguru import logger
6
6
 
7
- from .custom_logits_processors import enable_custom_logits_processors
7
+ from .utils import enable_custom_logits_processors, set_defult_gpu_memory_utilization, set_defult_batch_size
8
8
  from .model_output_to_middle_json import result_to_middle_json
9
9
  from ...data.data_reader_writer import DataWriter
10
10
  from mineru.utils.pdf_image_tools import load_images_from_pdf
11
11
  from ...utils.config_reader import get_device
12
12
 
13
13
  from ...utils.enum_class import ImageType
14
- from ...utils.model_utils import get_vram
15
14
  from ...utils.models_download_utils import auto_download_and_get_model_root_path
16
15
 
17
16
  from mineru_vl_utils import MinerUClient
@@ -41,7 +40,13 @@ class ModelSingleton:
41
40
  processor = None
42
41
  vllm_llm = None
43
42
  vllm_async_llm = None
44
- batch_size = 0
43
+ batch_size = kwargs.get("batch_size", 0) # for transformers backend only
44
+ max_concurrency = kwargs.get("max_concurrency", 100) # for http-client backend only
45
+ http_timeout = kwargs.get("http_timeout", 600) # for http-client backend only
46
+ # 从kwargs中移除这些参数,避免传递给不相关的初始化函数
47
+ for param in ["batch_size", "max_concurrency", "http_timeout"]:
48
+ if param in kwargs:
49
+ del kwargs[param]
45
50
  if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
46
51
  model_path = auto_download_and_get_model_root_path("/","vlm")
47
52
  if backend == "transformers":
@@ -68,53 +73,39 @@ class ModelSingleton:
68
73
  model_path,
69
74
  use_fast=True,
70
75
  )
71
- try:
72
- vram = get_vram(device)
73
- if vram is not None:
74
- gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
75
- if gpu_memory >= 16:
76
- batch_size = 8
77
- elif gpu_memory >= 8:
78
- batch_size = 4
79
- else:
80
- batch_size = 1
81
- logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
82
- else:
83
- # Default batch_ratio when VRAM can't be determined
84
- batch_size = 1
85
- logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
86
- except Exception as e:
87
- logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
88
- batch_size = 1
89
- elif backend == "vllm-engine":
90
- try:
91
- import vllm
92
- from mineru_vl_utils import MinerULogitsProcessor
93
- except ImportError:
94
- raise ImportError("Please install vllm to use the vllm-engine backend.")
95
- if "gpu_memory_utilization" not in kwargs:
96
- kwargs["gpu_memory_utilization"] = 0.5
97
- if "model" not in kwargs:
98
- kwargs["model"] = model_path
99
- if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
100
- kwargs["logits_processors"] = [MinerULogitsProcessor]
101
- # 使用kwargs vllm初始化参数
102
- vllm_llm = vllm.LLM(**kwargs)
103
- elif backend == "vllm-async-engine":
104
- try:
105
- from vllm.engine.arg_utils import AsyncEngineArgs
106
- from vllm.v1.engine.async_llm import AsyncLLM
107
- from mineru_vl_utils import MinerULogitsProcessor
108
- except ImportError:
109
- raise ImportError("Please install vllm to use the vllm-async-engine backend.")
110
- if "gpu_memory_utilization" not in kwargs:
111
- kwargs["gpu_memory_utilization"] = 0.5
112
- if "model" not in kwargs:
113
- kwargs["model"] = model_path
114
- if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
115
- kwargs["logits_processors"] = [MinerULogitsProcessor]
116
- # 使用kwargs为 vllm初始化参数
117
- vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
76
+ if batch_size == 0:
77
+ batch_size = set_defult_batch_size()
78
+ else:
79
+ os.environ["OMP_NUM_THREADS"] = "1"
80
+ if backend == "vllm-engine":
81
+ try:
82
+ import vllm
83
+ from mineru_vl_utils import MinerULogitsProcessor
84
+ except ImportError:
85
+ raise ImportError("Please install vllm to use the vllm-engine backend.")
86
+ if "gpu_memory_utilization" not in kwargs:
87
+ kwargs["gpu_memory_utilization"] = set_defult_gpu_memory_utilization()
88
+ if "model" not in kwargs:
89
+ kwargs["model"] = model_path
90
+ if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
91
+ kwargs["logits_processors"] = [MinerULogitsProcessor]
92
+ # 使用kwargs为 vllm初始化参数
93
+ vllm_llm = vllm.LLM(**kwargs)
94
+ elif backend == "vllm-async-engine":
95
+ try:
96
+ from vllm.engine.arg_utils import AsyncEngineArgs
97
+ from vllm.v1.engine.async_llm import AsyncLLM
98
+ from mineru_vl_utils import MinerULogitsProcessor
99
+ except ImportError:
100
+ raise ImportError("Please install vllm to use the vllm-async-engine backend.")
101
+ if "gpu_memory_utilization" not in kwargs:
102
+ kwargs["gpu_memory_utilization"] = set_defult_gpu_memory_utilization()
103
+ if "model" not in kwargs:
104
+ kwargs["model"] = model_path
105
+ if enable_custom_logits_processors() and ("logits_processors" not in kwargs):
106
+ kwargs["logits_processors"] = [MinerULogitsProcessor]
107
+ # 使用kwargs为 vllm初始化参数
108
+ vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
118
109
  self._models[key] = MinerUClient(
119
110
  backend=backend,
120
111
  model=model,
@@ -123,6 +114,8 @@ class ModelSingleton:
123
114
  vllm_async_llm=vllm_async_llm,
124
115
  server_url=server_url,
125
116
  batch_size=batch_size,
117
+ max_concurrency=max_concurrency,
118
+ http_timeout=http_timeout,
126
119
  )
127
120
  elapsed = round(time.time() - start_time, 2)
128
121
  logger.info(f"get {backend} predictor cost: {elapsed}s")
@@ -361,7 +361,7 @@ def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
361
361
  return ret
362
362
 
363
363
 
364
- def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
364
+ def fix_two_layer_blocks_back(blocks, fix_type: Literal["image", "table", "code"]):
365
365
  need_fix_blocks = get_type_blocks(blocks, fix_type)
366
366
  fixed_blocks = []
367
367
  not_include_blocks = []
@@ -404,6 +404,160 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
404
404
  return fixed_blocks, not_include_blocks
405
405
 
406
406
 
407
+ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
408
+ need_fix_blocks = get_type_blocks(blocks, fix_type)
409
+ fixed_blocks = []
410
+ not_include_blocks = []
411
+ processed_indices = set()
412
+
413
+ # 特殊处理表格类型,确保标题在表格前,注脚在表格后
414
+ if fix_type == "table":
415
+ # 收集所有不合适的caption和footnote
416
+ misplaced_captions = [] # 存储(caption, 原始block索引)
417
+ misplaced_footnotes = [] # 存储(footnote, 原始block索引)
418
+
419
+ # 第一步:移除不符合位置要求的caption和footnote
420
+ for block_idx, block in enumerate(need_fix_blocks):
421
+ body = block[f"{fix_type}_body"]
422
+ body_index = body["index"]
423
+
424
+ # 检查caption应在body前或同位置
425
+ valid_captions = []
426
+ for caption in block[f"{fix_type}_caption_list"]:
427
+ if caption["index"] <= body_index:
428
+ valid_captions.append(caption)
429
+ else:
430
+ misplaced_captions.append((caption, block_idx))
431
+ block[f"{fix_type}_caption_list"] = valid_captions
432
+
433
+ # 检查footnote应在body后或同位置
434
+ valid_footnotes = []
435
+ for footnote in block[f"{fix_type}_footnote_list"]:
436
+ if footnote["index"] >= body_index:
437
+ valid_footnotes.append(footnote)
438
+ else:
439
+ misplaced_footnotes.append((footnote, block_idx))
440
+ block[f"{fix_type}_footnote_list"] = valid_footnotes
441
+
442
+ # 第二步:重新分配不合规的caption到合适的body
443
+ for caption, original_block_idx in misplaced_captions:
444
+ caption_index = caption["index"]
445
+ best_block_idx = None
446
+ min_distance = float('inf')
447
+
448
+ # 寻找索引大于等于caption_index的最近body
449
+ for idx, block in enumerate(need_fix_blocks):
450
+ body_index = block[f"{fix_type}_body"]["index"]
451
+ if body_index >= caption_index and idx != original_block_idx:
452
+ distance = body_index - caption_index
453
+ if distance < min_distance:
454
+ min_distance = distance
455
+ best_block_idx = idx
456
+
457
+ if best_block_idx is not None:
458
+ # 找到合适的body,添加到对应block的caption_list
459
+ need_fix_blocks[best_block_idx][f"{fix_type}_caption_list"].append(caption)
460
+ else:
461
+ # 没找到合适的body,作为普通block处理
462
+ not_include_blocks.append(caption)
463
+
464
+ # 第三步:重新分配不合规的footnote到合适的body
465
+ for footnote, original_block_idx in misplaced_footnotes:
466
+ footnote_index = footnote["index"]
467
+ best_block_idx = None
468
+ min_distance = float('inf')
469
+
470
+ # 寻找索引小于等于footnote_index的最近body
471
+ for idx, block in enumerate(need_fix_blocks):
472
+ body_index = block[f"{fix_type}_body"]["index"]
473
+ if body_index <= footnote_index and idx != original_block_idx:
474
+ distance = footnote_index - body_index
475
+ if distance < min_distance:
476
+ min_distance = distance
477
+ best_block_idx = idx
478
+
479
+ if best_block_idx is not None:
480
+ # 找到合适的body,添加到对应block的footnote_list
481
+ need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
482
+ else:
483
+ # 没找到合适的body,作为普通block处理
484
+ not_include_blocks.append(footnote)
485
+
486
+ # 第四步:将每个block的caption_list和footnote_list中不连续index的元素提出来作为普通block处理
487
+ for block in need_fix_blocks:
488
+ caption_list = block[f"{fix_type}_caption_list"]
489
+ footnote_list = block[f"{fix_type}_footnote_list"]
490
+ body_index = block[f"{fix_type}_body"]["index"]
491
+
492
+ # 处理caption_list (从body往前看,caption在body之前)
493
+ if caption_list:
494
+ # 按index降序排列,从最接近body的开始检查
495
+ caption_list.sort(key=lambda x: x["index"], reverse=True)
496
+ filtered_captions = [caption_list[0]]
497
+ for i in range(1, len(caption_list)):
498
+ # 检查是否与前一个caption连续(降序所以是-1)
499
+ if caption_list[i]["index"] == caption_list[i - 1]["index"] - 1:
500
+ filtered_captions.append(caption_list[i])
501
+ else:
502
+ # 出现gap,后续所有caption都作为普通block
503
+ not_include_blocks.extend(caption_list[i:])
504
+ break
505
+ # 恢复升序
506
+ filtered_captions.reverse()
507
+ block[f"{fix_type}_caption_list"] = filtered_captions
508
+
509
+ # 处理footnote_list (从body往后看,footnote在body之后)
510
+ if footnote_list:
511
+ # 按index升序排列,从最接近body的开始检查
512
+ footnote_list.sort(key=lambda x: x["index"])
513
+ filtered_footnotes = [footnote_list[0]]
514
+ for i in range(1, len(footnote_list)):
515
+ # 检查是否与前一个footnote连续
516
+ if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
517
+ filtered_footnotes.append(footnote_list[i])
518
+ else:
519
+ # 出现gap,后续所有footnote都作为普通block
520
+ not_include_blocks.extend(footnote_list[i:])
521
+ break
522
+ block[f"{fix_type}_footnote_list"] = filtered_footnotes
523
+
524
+ # 构建两层结构blocks
525
+ for block in need_fix_blocks:
526
+ body = block[f"{fix_type}_body"]
527
+ caption_list = block[f"{fix_type}_caption_list"]
528
+ footnote_list = block[f"{fix_type}_footnote_list"]
529
+
530
+ body["type"] = f"{fix_type}_body"
531
+ for caption in caption_list:
532
+ caption["type"] = f"{fix_type}_caption"
533
+ processed_indices.add(caption["index"])
534
+ for footnote in footnote_list:
535
+ footnote["type"] = f"{fix_type}_footnote"
536
+ processed_indices.add(footnote["index"])
537
+
538
+ processed_indices.add(body["index"])
539
+
540
+ two_layer_block = {
541
+ "type": fix_type,
542
+ "bbox": body["bbox"],
543
+ "blocks": [body],
544
+ "index": body["index"],
545
+ }
546
+ two_layer_block["blocks"].extend([*caption_list, *footnote_list])
547
+ # 对blocks按index排序
548
+ two_layer_block["blocks"].sort(key=lambda x: x["index"])
549
+
550
+ fixed_blocks.append(two_layer_block)
551
+
552
+ # 添加未处理的blocks
553
+ for block in blocks:
554
+ block.pop("type", None)
555
+ if block["index"] not in processed_indices and block not in not_include_blocks:
556
+ not_include_blocks.append(block)
557
+
558
+ return fixed_blocks, not_include_blocks
559
+
560
+
407
561
  def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
408
562
  for list_block in list_blocks:
409
563
  list_block["blocks"] = []
mineru/cli/common.py CHANGED
@@ -18,7 +18,7 @@ from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
18
18
  from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
19
19
 
20
20
  pdf_suffixes = ["pdf"]
21
- image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg"]
21
+ image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
22
22
 
23
23
 
24
24
  def read_fn(path):
@@ -26,7 +26,7 @@ def read_fn(path):
26
26
  path = Path(path)
27
27
  with open(str(path), "rb") as input_file:
28
28
  file_bytes = input_file.read()
29
- file_suffix = guess_suffix_by_bytes(file_bytes)
29
+ file_suffix = guess_suffix_by_bytes(file_bytes, path)
30
30
  if file_suffix in image_suffixes:
31
31
  return images_bytes_to_pdf_bytes(file_bytes)
32
32
  elif file_suffix in pdf_suffixes:
@@ -44,34 +44,37 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
44
44
 
45
45
 
46
46
  def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
47
+ try:
48
+ # 从字节数据加载PDF
49
+ pdf = pdfium.PdfDocument(pdf_bytes)
47
50
 
48
- # 从字节数据加载PDF
49
- pdf = pdfium.PdfDocument(pdf_bytes)
50
-
51
- # 确定结束页
52
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
53
- if end_page_id > len(pdf) - 1:
54
- logger.warning("end_page_id is out of range, use pdf_docs length")
55
- end_page_id = len(pdf) - 1
51
+ # 确定结束页
52
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
53
+ if end_page_id > len(pdf) - 1:
54
+ logger.warning("end_page_id is out of range, use pdf_docs length")
55
+ end_page_id = len(pdf) - 1
56
56
 
57
- # 创建一个新的PDF文档
58
- output_pdf = pdfium.PdfDocument.new()
57
+ # 创建一个新的PDF文档
58
+ output_pdf = pdfium.PdfDocument.new()
59
59
 
60
- # 选择要导入的页面索引
61
- page_indices = list(range(start_page_id, end_page_id + 1))
60
+ # 选择要导入的页面索引
61
+ page_indices = list(range(start_page_id, end_page_id + 1))
62
62
 
63
- # 从原PDF导入页面到新PDF
64
- output_pdf.import_pages(pdf, page_indices)
63
+ # 从原PDF导入页面到新PDF
64
+ output_pdf.import_pages(pdf, page_indices)
65
65
 
66
- # 将新PDF保存到内存缓冲区
67
- output_buffer = io.BytesIO()
68
- output_pdf.save(output_buffer)
66
+ # 将新PDF保存到内存缓冲区
67
+ output_buffer = io.BytesIO()
68
+ output_pdf.save(output_buffer)
69
69
 
70
- # 获取字节数据
71
- output_bytes = output_buffer.getvalue()
70
+ # 获取字节数据
71
+ output_bytes = output_buffer.getvalue()
72
72
 
73
- pdf.close() # 关闭原PDF文档以释放资源
74
- output_pdf.close() # 关闭新PDF文档以释放资源
73
+ pdf.close() # 关闭原PDF文档以释放资源
74
+ output_pdf.close() # 关闭新PDF文档以释放资源
75
+ except Exception as e:
76
+ logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
77
+ output_bytes = pdf_bytes
75
78
 
76
79
  return output_bytes
77
80
 
mineru/cli/fast_api.py CHANGED
@@ -177,10 +177,7 @@ async def parse_pdf(
177
177
  zf.write(path, arcname=os.path.join(safe_pdf_name, f"{safe_pdf_name}_middle.json"))
178
178
 
179
179
  if return_model_output:
180
- if backend.startswith("pipeline"):
181
- path = os.path.join(parse_dir, f"{pdf_name}_model.json")
182
- else:
183
- path = os.path.join(parse_dir, f"{pdf_name}_model_output.txt")
180
+ path = os.path.join(parse_dir, f"{pdf_name}_model.json")
184
181
  if os.path.exists(path):
185
182
  zf.write(path, arcname=os.path.join(safe_pdf_name, os.path.basename(path)))
186
183
 
@@ -220,10 +217,7 @@ async def parse_pdf(
220
217
  if return_middle_json:
221
218
  data["middle_json"] = get_infer_result("_middle.json", pdf_name, parse_dir)
222
219
  if return_model_output:
223
- if backend.startswith("pipeline"):
224
- data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
225
- else:
226
- data["model_output"] = get_infer_result("_model_output.txt", pdf_name, parse_dir)
220
+ data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir)
227
221
  if return_content_list:
228
222
  data["content_list"] = get_infer_result("_content_list.json", pdf_name, parse_dir)
229
223
  if return_images: