mineru 2.2.2__py3-none-any.whl → 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
  2. mineru/backend/vlm/model_output_to_middle_json.py +123 -0
  3. mineru/backend/vlm/vlm_analyze.py +97 -16
  4. mineru/backend/vlm/vlm_magic_model.py +201 -135
  5. mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
  6. mineru/cli/client.py +6 -5
  7. mineru/cli/common.py +17 -16
  8. mineru/cli/fast_api.py +9 -7
  9. mineru/cli/gradio_app.py +15 -16
  10. mineru/cli/vlm_vllm_server.py +4 -0
  11. mineru/model/table/rec/unet_table/main.py +8 -0
  12. mineru/model/vlm_vllm_model/__init__.py +0 -0
  13. mineru/model/vlm_vllm_model/server.py +51 -0
  14. mineru/resources/header.html +10 -2
  15. mineru/utils/draw_bbox.py +32 -10
  16. mineru/utils/enum_class.py +16 -2
  17. mineru/utils/guess_suffix_or_lang.py +20 -0
  18. mineru/utils/span_block_fix.py +4 -2
  19. mineru/version.py +1 -1
  20. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/METADATA +70 -25
  21. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/RECORD +25 -38
  22. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/entry_points.txt +1 -1
  23. mineru/backend/vlm/base_predictor.py +0 -186
  24. mineru/backend/vlm/hf_predictor.py +0 -217
  25. mineru/backend/vlm/predictor.py +0 -111
  26. mineru/backend/vlm/sglang_client_predictor.py +0 -443
  27. mineru/backend/vlm/sglang_engine_predictor.py +0 -246
  28. mineru/backend/vlm/token_to_middle_json.py +0 -122
  29. mineru/backend/vlm/utils.py +0 -40
  30. mineru/cli/vlm_sglang_server.py +0 -4
  31. mineru/model/vlm_hf_model/__init__.py +0 -9
  32. mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
  33. mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
  34. mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
  35. mineru/model/vlm_sglang_model/__init__.py +0 -14
  36. mineru/model/vlm_sglang_model/engine.py +0 -264
  37. mineru/model/vlm_sglang_model/image_processor.py +0 -213
  38. mineru/model/vlm_sglang_model/logit_processor.py +0 -90
  39. mineru/model/vlm_sglang_model/model.py +0 -453
  40. mineru/model/vlm_sglang_model/server.py +0 -75
  41. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/WHEEL +0 -0
  42. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/licenses/LICENSE.md +0 -0
  43. {mineru-2.2.2.dist-info → mineru-2.5.0.dist-info}/top_level.txt +0 -0
@@ -245,14 +245,14 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
245
245
  if block['type'] == BlockType.TABLE_FOOTNOTE:
246
246
  para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
247
247
 
248
- page_weight, page_height = page_size
248
+ page_width, page_height = page_size
249
249
  para_bbox = para_block.get('bbox')
250
250
  if para_bbox:
251
251
  x0, y0, x1, y1 = para_bbox
252
252
  para_content['bbox'] = [
253
- int(x0 * 1000 / page_weight),
253
+ int(x0 * 1000 / page_width),
254
254
  int(y0 * 1000 / page_height),
255
- int(x1 * 1000 / page_weight),
255
+ int(x1 * 1000 / page_width),
256
256
  int(y1 * 1000 / page_height),
257
257
  ]
258
258
 
@@ -0,0 +1,123 @@
1
+ import os
2
+ import time
3
+
4
+ import cv2
5
+ import numpy as np
6
+ from loguru import logger
7
+
8
+ from mineru.backend.vlm.vlm_magic_model import MagicModel
9
+ from mineru.utils.config_reader import get_table_enable, get_llm_aided_config
10
+ from mineru.utils.cut_image import cut_image_and_table
11
+ from mineru.utils.enum_class import ContentType
12
+ from mineru.utils.hash_utils import bytes_md5
13
+ from mineru.utils.pdf_image_tools import get_crop_img
14
+ from mineru.utils.table_merge import merge_table
15
+ from mineru.version import __version__
16
+
17
+
18
# Probe for the optional LLM-aided heading-level feature at import time.
# The flag stays False unless the feature is enabled in the LLM-aided config
# and both optional imports below succeed.
heading_level_import_success = False
llm_aided_config = get_llm_aided_config()
if llm_aided_config:
    title_aided_config = llm_aided_config.get('title_aided', {})
    if title_aided_config.get('enable', False):
        try:
            from mineru.utils.llm_aided import llm_aided_title
            from mineru.backend.pipeline.model_init import AtomModelSingleton
        except Exception:
            # Best effort: any failure while importing the optional pieces
            # only disables the feature, it never aborts module import.
            logger.warning(
                "The heading level feature cannot be used. If you need to use the heading level feature, "
                "please execute `pip install mineru[core]` to install the required packages."
            )
        else:
            heading_level_import_success = True
30
+
31
+
32
def blocks_to_page_info(page_blocks, image_dict, page, image_writer, page_index) -> dict:
    """Convert the model output blocks of one page into a page-info dict.

    Args:
        page_blocks: Model output blocks for the page; passed to ``MagicModel``.
        image_dict: Rendered-page record; its ``"scale"`` and ``"img_pil"``
            entries are read.
        page: PDF page object; ``page.get_size()`` supplies the page size.
        image_writer: Writer forwarded to ``cut_image_and_table`` for the
            image, table and interline-equation spans.
        page_index: Page index, stored as ``page_idx`` in the result and
            forwarded to ``cut_image_and_table``.

    Returns:
        A dict with the keys ``para_blocks`` (sorted by each block's
        ``"index"``), ``discarded_blocks``, ``page_size`` (``[width, height]``)
        and ``page_idx``.
    """
    scale = image_dict["scale"]
    page_pil_img = image_dict["img_pil"]
    page_img_md5 = bytes_md5(page_pil_img.tobytes())
    width, height = map(int, page.get_size())

    magic_model = MagicModel(page_blocks, width, height)
    image_blocks = magic_model.get_image_blocks()
    table_blocks = magic_model.get_table_blocks()
    title_blocks = magic_model.get_title_blocks()
    discarded_blocks = magic_model.get_discarded_blocks()
    code_blocks = magic_model.get_code_blocks()
    ref_text_blocks = magic_model.get_ref_text_blocks()
    phonetic_blocks = magic_model.get_phonetic_blocks()
    list_blocks = magic_model.get_list_blocks()

    # When heading-level refinement is available, run OCR text detection on a
    # crop of every title block and record its average detected line height.
    if heading_level_import_success:
        atom_model_manager = AtomModelSingleton()
        ocr_model = atom_model_manager.get_atom_model(
            atom_model_name='ocr',
            ocr_show_log=False,
            det_db_box_thresh=0.3,
            lang='ch_lite'
        )
        for title_block in title_blocks:
            title_pil_img = get_crop_img(title_block['bbox'], page_pil_img, scale)
            title_np_img = np.array(title_pil_img)
            # Pad the crop with a 50-pixel white border on every side.
            title_np_img = cv2.copyMakeBorder(
                title_np_img, 50, 50, 50, 50, cv2.BORDER_CONSTANT, value=[255, 255, 255]
            )
            title_img = cv2.cvtColor(title_np_img, cv2.COLOR_RGB2BGR)
            ocr_det_res = ocr_model.ocr(title_img, rec=False)[0]
            # NOTE(review): the detector may yield None instead of an empty
            # list when nothing is found -- guard both so len() cannot raise.
            if ocr_det_res is None or len(ocr_det_res) == 0:
                continue
            # Average box height (third point's y minus first point's y),
            # divided by the render scale.
            avg_height = np.mean([box[2][1] - box[0][1] for box in ocr_det_res])
            title_block['line_avg_height'] = round(avg_height/scale)

    text_blocks = magic_model.get_text_blocks()
    interline_equation_blocks = magic_model.get_interline_equation_blocks()

    # Crop image / table / interline-equation spans out of the page image.
    # The return value was only ever bound to the loop variable, so it is
    # intentionally not kept.
    cropped_types = (ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION)
    for span in magic_model.get_all_spans():
        if span["type"] in cropped_types:
            cut_image_and_table(span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale)

    # Gather every paragraph-level block and order them by their "index".
    para_blocks = [
        *image_blocks,
        *table_blocks,
        *code_blocks,
        *ref_text_blocks,
        *phonetic_blocks,
        *title_blocks,
        *text_blocks,
        *interline_equation_blocks,
        *list_blocks,
    ]
    para_blocks.sort(key=lambda x: x["index"])

    page_info = {"para_blocks": para_blocks, "discarded_blocks": discarded_blocks, "page_size": [width, height], "page_idx": page_index}
    return page_info
100
+
101
+
102
def result_to_middle_json(model_output_blocks_list, images_list, pdf_doc, image_writer):
    """Build the middle-JSON document from per-page model output.

    Args:
        model_output_blocks_list: One entry of model output blocks per page.
        images_list: Per-page image records, indexed in step with
            ``model_output_blocks_list``.
        pdf_doc: Open PDF document, indexed by page number. It is closed
            before this function returns or raises.
        image_writer: Writer forwarded to ``blocks_to_page_info``.

    Returns:
        A dict with ``pdf_info`` (one page-info dict per page), ``_backend``
        (``"vlm"``) and ``_version_name``.
    """
    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
    try:
        for index, page_blocks in enumerate(model_output_blocks_list):
            page = pdf_doc[index]
            image_dict = images_list[index]
            page_info = blocks_to_page_info(page_blocks, image_dict, page, image_writer, index)
            middle_json["pdf_info"].append(page_info)

        # Merge tables that continue across pages; the
        # MINERU_VLM_TABLE_ENABLE environment variable supplies the value
        # handed to get_table_enable (treated as true when unset).
        table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
        if table_enable:
            merge_table(middle_json["pdf_info"])

        # Refine heading levels with the LLM when the feature was imported.
        if heading_level_import_success:
            llm_aided_title_start_time = time.time()
            llm_aided_title(middle_json["pdf_info"], title_aided_config)
            logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
    finally:
        # Close the PDF document even when a step above raises, so the
        # handle is not leaked on failure.
        pdf_doc.close()
    return middle_json
@@ -1,16 +1,20 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
+ import os
2
3
  import time
3
4
 
4
5
  from loguru import logger
5
6
 
7
+ from .model_output_to_middle_json import result_to_middle_json
6
8
  from ...data.data_reader_writer import DataWriter
7
9
  from mineru.utils.pdf_image_tools import load_images_from_pdf
8
- from .base_predictor import BasePredictor
9
- from .predictor import get_predictor
10
- from .token_to_middle_json import result_to_middle_json
10
+ from ...utils.config_reader import get_device
11
+
11
12
  from ...utils.enum_class import ImageType
13
+ from ...utils.model_utils import get_vram
12
14
  from ...utils.models_download_utils import auto_download_and_get_model_root_path
13
15
 
16
+ from mineru_vl_utils import MinerUClient
17
+
14
18
 
15
19
  class ModelSingleton:
16
20
  _instance = None
@@ -27,24 +31,101 @@ class ModelSingleton:
27
31
  model_path: str | None,
28
32
  server_url: str | None,
29
33
  **kwargs,
30
- ) -> BasePredictor:
34
+ ) -> MinerUClient:
31
35
  key = (backend, model_path, server_url)
32
36
  if key not in self._models:
33
- if backend in ['transformers', 'sglang-engine'] and not model_path:
37
+ start_time = time.time()
38
+ model = None
39
+ processor = None
40
+ vllm_llm = None
41
+ vllm_async_llm = None
42
+ batch_size = 0
43
+ if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
34
44
  model_path = auto_download_and_get_model_root_path("/","vlm")
35
- self._models[key] = get_predictor(
45
+ if backend == "transformers":
46
+ try:
47
+ from transformers import (
48
+ AutoProcessor,
49
+ Qwen2VLForConditionalGeneration,
50
+ )
51
+ from transformers import __version__ as transformers_version
52
+ except ImportError:
53
+ raise ImportError("Please install transformers to use the transformers backend.")
54
+
55
+ from packaging import version
56
+ if version.parse(transformers_version) >= version.parse("4.56.0"):
57
+ dtype_key = "dtype"
58
+ else:
59
+ dtype_key = "torch_dtype"
60
+ device = get_device()
61
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
62
+ model_path,
63
+ device_map={"": device},
64
+ **{dtype_key: "auto"}, # type: ignore
65
+ )
66
+ processor = AutoProcessor.from_pretrained(
67
+ model_path,
68
+ use_fast=True,
69
+ )
70
+ try:
71
+ vram = get_vram(device)
72
+ if vram is not None:
73
+ gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram)))
74
+ if gpu_memory >= 16:
75
+ batch_size = 8
76
+ elif gpu_memory >= 8:
77
+ batch_size = 4
78
+ else:
79
+ batch_size = 1
80
+ logger.info(f'gpu_memory: {gpu_memory} GB, batch_size: {batch_size}')
81
+ else:
82
+ # Default batch_ratio when VRAM can't be determined
83
+ batch_size = 1
84
+ logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_size}')
85
+ except Exception as e:
86
+ logger.warning(f'Error determining VRAM: {e}, using default batch_ratio: 1')
87
+ batch_size = 1
88
+ elif backend == "vllm-engine":
89
+ try:
90
+ import vllm
91
+ except ImportError:
92
+ raise ImportError("Please install vllm to use the vllm-engine backend.")
93
+ if "gpu_memory_utilization" not in kwargs:
94
+ kwargs["gpu_memory_utilization"] = 0.5
95
+ if "model" not in kwargs:
96
+ kwargs["model"] = model_path
97
+ # 使用kwargs为 vllm初始化参数
98
+ vllm_llm = vllm.LLM(**kwargs)
99
+ elif backend == "vllm-async-engine":
100
+ try:
101
+ from vllm.engine.arg_utils import AsyncEngineArgs
102
+ from vllm.v1.engine.async_llm import AsyncLLM
103
+ except ImportError:
104
+ raise ImportError("Please install vllm to use the vllm-async-engine backend.")
105
+ if "gpu_memory_utilization" not in kwargs:
106
+ kwargs["gpu_memory_utilization"] = 0.5
107
+ if "model" not in kwargs:
108
+ kwargs["model"] = model_path
109
+ # 使用kwargs为 vllm初始化参数
110
+ vllm_async_llm = AsyncLLM.from_engine_args(AsyncEngineArgs(**kwargs))
111
+ self._models[key] = MinerUClient(
36
112
  backend=backend,
37
- model_path=model_path,
113
+ model=model,
114
+ processor=processor,
115
+ vllm_llm=vllm_llm,
116
+ vllm_async_llm=vllm_async_llm,
38
117
  server_url=server_url,
39
- **kwargs,
118
+ batch_size=batch_size,
40
119
  )
120
+ elapsed = round(time.time() - start_time, 2)
121
+ logger.info(f"get {backend} predictor cost: {elapsed}s")
41
122
  return self._models[key]
42
123
 
43
124
 
44
125
  def doc_analyze(
45
126
  pdf_bytes,
46
127
  image_writer: DataWriter | None,
47
- predictor: BasePredictor | None = None,
128
+ predictor: MinerUClient | None = None,
48
129
  backend="transformers",
49
130
  model_path: str | None = None,
50
131
  server_url: str | None = None,
@@ -54,13 +135,13 @@ def doc_analyze(
54
135
  predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
55
136
 
56
137
  # load_images_start = time.time()
57
- images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.BASE64)
58
- images_base64_list = [image_dict["img_base64"] for image_dict in images_list]
138
+ images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
139
+ images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
59
140
  # load_images_time = round(time.time() - load_images_start, 2)
60
141
  # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
61
142
 
62
143
  # infer_start = time.time()
63
- results = predictor.batch_predict(images=images_base64_list)
144
+ results = predictor.batch_two_step_extract(images=images_pil_list)
64
145
  # infer_time = round(time.time() - infer_start, 2)
65
146
  # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
66
147
 
@@ -71,7 +152,7 @@ def doc_analyze(
71
152
  async def aio_doc_analyze(
72
153
  pdf_bytes,
73
154
  image_writer: DataWriter | None,
74
- predictor: BasePredictor | None = None,
155
+ predictor: MinerUClient | None = None,
75
156
  backend="transformers",
76
157
  model_path: str | None = None,
77
158
  server_url: str | None = None,
@@ -81,13 +162,13 @@ async def aio_doc_analyze(
81
162
  predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)
82
163
 
83
164
  # load_images_start = time.time()
84
- images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.BASE64)
85
- images_base64_list = [image_dict["img_base64"] for image_dict in images_list]
165
+ images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
166
+ images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
86
167
  # load_images_time = round(time.time() - load_images_start, 2)
87
168
  # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
88
169
 
89
170
  # infer_start = time.time()
90
- results = await predictor.aio_batch_predict(images=images_base64_list)
171
+ results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
91
172
  # infer_time = round(time.time() - infer_start, 2)
92
173
  # logger.info(f"infer finished, cost: {infer_time}, speed: {round(len(results)/infer_time, 3)} page/s")
93
174
  middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)