mineru-2.6.1-py3-none-any.whl → mineru-2.6.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. mineru/backend/pipeline/batch_analyze.py +20 -43
  2. mineru/backend/pipeline/model_init.py +1 -1
  3. mineru/backend/pipeline/model_json_to_middle_json.py +1 -1
  4. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +17 -4
  5. mineru/backend/vlm/vlm_analyze.py +11 -1
  6. mineru/backend/vlm/vlm_middle_json_mkcontent.py +6 -3
  7. mineru/cli/client.py +24 -14
  8. mineru/cli/gradio_app.py +4 -1
  9. mineru/cli/models_download.py +1 -1
  10. mineru/model/ocr/{paddleocr2pytorch/pytorch_paddle.py → pytorch_paddle.py} +1 -1
  11. mineru/model/table/rec/RapidTable.py +1 -1
  12. mineru/model/utils/pytorchocr/utils/resources/arch_config.yaml +1 -317
  13. mineru/model/utils/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt +545 -211
  14. mineru/model/utils/pytorchocr/utils/resources/models_config.yml +0 -12
  15. mineru/utils/block_sort.py +3 -2
  16. mineru/utils/check_mac_env.py +30 -0
  17. mineru/utils/llm_aided.py +13 -8
  18. mineru/version.py +1 -1
  19. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/METADATA +75 -39
  20. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/RECORD +24 -35
  21. mineru/model/ocr/paddleocr2pytorch/__init__.py +0 -1
  22. mineru/model/utils/pytorchocr/utils/resources/dict/arabic_dict.txt +0 -162
  23. mineru/model/utils/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +0 -8421
  24. mineru/model/utils/pytorchocr/utils/resources/dict/cyrillic_dict.txt +0 -163
  25. mineru/model/utils/pytorchocr/utils/resources/dict/devanagari_dict.txt +0 -167
  26. mineru/model/utils/pytorchocr/utils/resources/dict/en_dict.txt +0 -95
  27. mineru/model/utils/pytorchocr/utils/resources/dict/japan_dict.txt +0 -4399
  28. mineru/model/utils/pytorchocr/utils/resources/dict/korean_dict.txt +0 -3688
  29. mineru/model/utils/pytorchocr/utils/resources/dict/latin_dict.txt +0 -185
  30. mineru/model/utils/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +0 -6623
  31. mineru/model/utils/pytorchocr/utils/resources/dict/ta_dict.txt +0 -128
  32. mineru/model/utils/pytorchocr/utils/resources/dict/te_dict.txt +0 -151
  33. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/WHEEL +0 -0
  34. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/entry_points.txt +0 -0
  35. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/licenses/LICENSE.md +0 -0
  36. {mineru-2.6.1.dist-info → mineru-2.6.3.dist-info}/top_level.txt +0 -0
@@ -281,28 +281,20 @@ class BatchAnalyze:
281
281
 
282
282
  # 按分辨率分组并同时完成padding
283
283
  # RESOLUTION_GROUP_STRIDE = 32
284
- RESOLUTION_GROUP_STRIDE = 64 # 定义分辨率分组的步进值
284
+ RESOLUTION_GROUP_STRIDE = 64
285
285
 
286
286
  resolution_groups = defaultdict(list)
287
287
  for crop_info in lang_crop_list:
288
288
  cropped_img = crop_info[0]
289
289
  h, w = cropped_img.shape[:2]
290
- # 使用更大的分组容差,减少分组数量
291
- # 将尺寸标准化到32的倍数
292
- normalized_h = ((h + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE # 向上取整到32的倍数
293
- normalized_w = ((w + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
294
- group_key = (normalized_h, normalized_w)
290
+ # 直接计算目标尺寸并用作分组键
291
+ target_h = ((h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
292
+ target_w = ((w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
293
+ group_key = (target_h, target_w)
295
294
  resolution_groups[group_key].append(crop_info)
296
295
 
297
296
  # 对每个分辨率组进行批处理
298
- for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
299
-
300
- # 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
301
- max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
302
- max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
303
- target_h = ((max_h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
304
- target_w = ((max_w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
305
-
297
+ for (target_h, target_w), group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
306
298
  # 对所有图像进行padding到统一尺寸
307
299
  batch_images = []
308
300
  for crop_info in group_crops:
@@ -310,49 +302,34 @@ class BatchAnalyze:
310
302
  h, w = img.shape[:2]
311
303
  # 创建目标尺寸的白色背景
312
304
  padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
313
- # 将原图像粘贴到左上角
314
305
  padded_img[:h, :w] = img
315
306
  batch_images.append(padded_img)
316
307
 
317
308
  # 批处理检测
318
- det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE) # 增加批处理大小
319
- # logger.debug(f"OCR-det batch: {det_batch_size} images, target size: {target_h}x{target_w}")
309
+ det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
320
310
  batch_results = ocr_model.text_detector.batch_predict(batch_images, det_batch_size)
321
311
 
322
312
  # 处理批处理结果
323
- for i, (crop_info, (dt_boxes, elapse)) in enumerate(zip(group_crops, batch_results)):
313
+ for crop_info, (dt_boxes, _) in zip(group_crops, batch_results):
324
314
  bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
325
315
 
326
316
  if dt_boxes is not None and len(dt_boxes) > 0:
327
- # 直接应用原始OCR流程中的关键处理步骤
328
-
329
- # 1. 排序检测框
330
- if len(dt_boxes) > 0:
331
- dt_boxes_sorted = sorted_boxes(dt_boxes)
332
- else:
333
- dt_boxes_sorted = []
334
-
335
- # 2. 合并相邻检测框
336
- if dt_boxes_sorted:
337
- dt_boxes_merged = merge_det_boxes(dt_boxes_sorted)
338
- else:
339
- dt_boxes_merged = []
340
-
341
- # 3. 根据公式位置更新检测框(关键步骤!)
342
- if dt_boxes_merged and adjusted_mfdetrec_res:
343
- dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
344
- else:
345
- dt_boxes_final = dt_boxes_merged
346
-
347
- # 构造OCR结果格式
348
- ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
349
-
350
- if ocr_res:
317
+ # 处理检测框
318
+ dt_boxes_sorted = sorted_boxes(dt_boxes)
319
+ dt_boxes_merged = merge_det_boxes(dt_boxes_sorted) if dt_boxes_sorted else []
320
+
321
+ # 根据公式位置更新检测框
322
+ dt_boxes_final = (update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
323
+ if dt_boxes_merged and adjusted_mfdetrec_res
324
+ else dt_boxes_merged)
325
+
326
+ if dt_boxes_final:
327
+ ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
351
328
  ocr_result_list = get_ocr_result_list(
352
329
  ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
353
330
  )
354
-
355
331
  ocr_res_list_dict['layout_res'].extend(ocr_result_list)
332
+
356
333
  else:
357
334
  # 原始单张处理模式
358
335
  for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
@@ -8,7 +8,7 @@ from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
8
8
  from ...model.mfd.yolo_v8 import YOLOv8MFDModel
9
9
  from ...model.mfr.unimernet.Unimernet import UnimernetModel
10
10
  from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
11
- from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
11
+ from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
12
12
  from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
13
13
  from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
14
14
  # from ...model.table.rec.RapidTable import RapidTableModel
@@ -148,7 +148,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
148
148
  fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
149
149
 
150
150
  """如果当前页面没有有效的bbox则跳过"""
151
- if len(all_bboxes) == 0:
151
+ if len(all_bboxes) == 0 and len(fix_discarded_blocks) == 0:
152
152
  return None
153
153
 
154
154
  """对image/table/interline_equation截图"""
@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
191
191
  def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
192
192
  para_type = para_block['type']
193
193
  para_content = {}
194
- if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
194
+ if para_type in [
195
+ BlockType.TEXT,
196
+ BlockType.LIST,
197
+ BlockType.INDEX,
198
+ ]:
195
199
  para_content = {
196
200
  'type': ContentType.TEXT,
197
201
  'text': merge_para_with_text(para_block),
198
202
  }
203
+ elif para_type == BlockType.DISCARDED:
204
+ para_content = {
205
+ 'type': para_type,
206
+ 'text': merge_para_with_text(para_block),
207
+ }
199
208
  elif para_type == BlockType.TITLE:
200
209
  para_content = {
201
210
  'type': ContentType.TEXT,
@@ -268,15 +277,19 @@ def union_make(pdf_info_dict: list,
268
277
  output_content = []
269
278
  for page_info in pdf_info_dict:
270
279
  paras_of_layout = page_info.get('para_blocks')
280
+ paras_of_discarded = page_info.get('discarded_blocks')
271
281
  page_idx = page_info.get('page_idx')
272
282
  page_size = page_info.get('page_size')
273
- if not paras_of_layout:
274
- continue
275
283
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
284
+ if not paras_of_layout:
285
+ continue
276
286
  page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
277
287
  output_content.extend(page_markdown)
278
288
  elif make_mode == MakeMode.CONTENT_LIST:
279
- for para_block in paras_of_layout:
289
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
290
+ if not para_blocks:
291
+ continue
292
+ for para_block in para_blocks:
280
293
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
281
294
  if para_content:
282
295
  output_content.append(para_content)
@@ -8,6 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
8
8
  from .model_output_to_middle_json import result_to_middle_json
9
9
  from ...data.data_reader_writer import DataWriter
10
10
  from mineru.utils.pdf_image_tools import load_images_from_pdf
11
+ from ...utils.check_mac_env import is_mac_os_version_supported
11
12
  from ...utils.config_reader import get_device
12
13
 
13
14
  from ...utils.enum_class import ImageType
@@ -47,7 +48,7 @@ class ModelSingleton:
47
48
  for param in ["batch_size", "max_concurrency", "http_timeout"]:
48
49
  if param in kwargs:
49
50
  del kwargs[param]
50
- if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
51
+ if backend in ['transformers', 'vllm-engine', "vllm-async-engine", "mlx-engine"] and not model_path:
51
52
  model_path = auto_download_and_get_model_root_path("/","vlm")
52
53
  if backend == "transformers":
53
54
  try:
@@ -75,6 +76,15 @@ class ModelSingleton:
75
76
  )
76
77
  if batch_size == 0:
77
78
  batch_size = set_default_batch_size()
79
+ elif backend == "mlx-engine":
80
+ mlx_supported = is_mac_os_version_supported()
81
+ if not mlx_supported:
82
+ raise EnvironmentError("mlx-engine backend is only supported on macOS 13.5+ with Apple Silicon.")
83
+ try:
84
+ from mlx_vlm import load as mlx_load
85
+ except ImportError:
86
+ raise ImportError("Please install mlx-vlm to use the mlx-engine backend.")
87
+ model, processor = mlx_load(model_path)
78
88
  else:
79
89
  if os.getenv('OMP_NUM_THREADS') is None:
80
90
  os.environ["OMP_NUM_THREADS"] = "1"
@@ -248,13 +248,16 @@ def union_make(pdf_info_dict: list,
248
248
  paras_of_discarded = page_info.get('discarded_blocks')
249
249
  page_idx = page_info.get('page_idx')
250
250
  page_size = page_info.get('page_size')
251
- if not paras_of_layout:
252
- continue
253
251
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
252
+ if not paras_of_layout:
253
+ continue
254
254
  page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
255
255
  output_content.extend(page_markdown)
256
256
  elif make_mode == MakeMode.CONTENT_LIST:
257
- for para_block in paras_of_layout+paras_of_discarded:
257
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
258
+ if not para_blocks:
259
+ continue
260
+ for para_block in para_blocks:
258
261
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
259
262
  output_content.append(para_content)
260
263
 
mineru/cli/client.py CHANGED
@@ -4,6 +4,7 @@ import click
4
4
  from pathlib import Path
5
5
  from loguru import logger
6
6
 
7
+ from mineru.utils.check_mac_env import is_mac_os_version_supported
7
8
  from mineru.utils.cli_parser import arg_parse
8
9
  from mineru.utils.config_reader import get_device
9
10
  from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
@@ -11,6 +12,11 @@ from mineru.utils.model_utils import get_vram
11
12
  from ..version import __version__
12
13
  from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
13
14
 
15
+
16
+ backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']
17
+ if is_mac_os_version_supported():
18
+ backends.append("vlm-mlx-engine")
19
+
14
20
  @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
15
21
  @click.pass_context
16
22
  @click.version_option(__version__,
@@ -38,25 +44,28 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
38
44
  '--method',
39
45
  'method',
40
46
  type=click.Choice(['auto', 'txt', 'ocr']),
41
- help="""the method for parsing pdf:
42
- auto: Automatically determine the method based on the file type.
43
- txt: Use text extraction method.
44
- ocr: Use OCR method for image-based PDFs.
47
+ help="""\b
48
+ the method for parsing pdf:
49
+ auto: Automatically determine the method based on the file type.
50
+ txt: Use text extraction method.
51
+ ocr: Use OCR method for image-based PDFs.
45
52
  Without method specified, 'auto' will be used by default.
46
- Adapted only for the case where the backend is set to "pipeline".""",
53
+ Adapted only for the case where the backend is set to 'pipeline'.""",
47
54
  default='auto',
48
55
  )
49
56
  @click.option(
50
57
  '-b',
51
58
  '--backend',
52
59
  'backend',
53
- type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
54
- help="""the backend for parsing pdf:
55
- pipeline: More general.
56
- vlm-transformers: More general.
57
- vlm-vllm-engine: Faster(engine).
58
- vlm-http-client: Faster(client).
59
- without method specified, pipeline will be used by default.""",
60
+ type=click.Choice(backends),
61
+ help="""\b
62
+ the backend for parsing pdf:
63
+ pipeline: More general.
64
+ vlm-transformers: More general, but slower.
65
+ vlm-mlx-engine: Faster than transformers.
66
+ vlm-vllm-engine: Faster(engine).
67
+ vlm-http-client: Faster(client).
68
+ Without method specified, pipeline will be used by default.""",
60
69
  default='pipeline',
61
70
  )
62
71
  @click.option(
@@ -66,7 +75,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
66
75
  type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'th', 'el',
67
76
  'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
68
77
  help="""
69
- Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
78
+ Input the languages in the pdf (if known) to improve OCR accuracy.
70
79
  Without languages specified, 'ch' will be used by default.
71
80
  Adapted only for the case where the backend is set to "pipeline".
72
81
  """,
@@ -119,7 +128,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
119
128
  '--device',
120
129
  'device_mode',
121
130
  type=str,
122
- help='Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps". Adapted only for the case where the backend is set to "pipeline". ',
131
+ help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
132
+ Adapted only for the case where the backend is set to "pipeline" and "vlm-transformers". """,
123
133
  default=None,
124
134
  )
125
135
  @click.option(
mineru/cli/gradio_app.py CHANGED
@@ -13,6 +13,7 @@ from gradio_pdf import PDF
13
13
  from loguru import logger
14
14
 
15
15
  from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
16
+ from mineru.utils.check_mac_env import is_mac_os_version_supported
16
17
  from mineru.utils.cli_parser import arg_parse
17
18
  from mineru.utils.hash_utils import str_sha256
18
19
 
@@ -273,7 +274,7 @@ def to_pdf(file_path):
273
274
 
274
275
  # 更新界面函数
275
276
  def update_interface(backend_choice):
276
- if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
277
+ if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine", "vlm-mlx-engine"]:
277
278
  return gr.update(visible=False), gr.update(visible=False)
278
279
  elif backend_choice in ["vlm-http-client"]:
279
280
  return gr.update(visible=True), gr.update(visible=False)
@@ -381,6 +382,8 @@ def main(ctx,
381
382
  preferred_option = "vlm-vllm-async-engine"
382
383
  else:
383
384
  drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
385
+ if is_mac_os_version_supported():
386
+ drop_list.append("vlm-mlx-engine")
384
387
  preferred_option = "pipeline"
385
388
  backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
386
389
  with gr.Row(visible=False) as client_options:
@@ -21,7 +21,7 @@ def download_and_modify_json(url, local_filename, modifications):
21
21
  if os.path.exists(local_filename):
22
22
  data = json.load(open(local_filename))
23
23
  config_version = data.get('config_version', '0.0.0')
24
- if config_version < '1.3.0':
24
+ if config_version < '1.3.1':
25
25
  data = download_json(url)
26
26
  else:
27
27
  data = download_json(url)
@@ -134,7 +134,7 @@ def get_model_params(lang, config):
134
134
  raise Exception (f'Language {lang} not supported')
135
135
 
136
136
 
137
- root_dir = os.path.join(Path(__file__).resolve().parent.parent.parent, 'utils')
137
+ root_dir = os.path.join(Path(__file__).resolve().parent.parent, 'utils')
138
138
 
139
139
 
140
140
  class PytorchPaddleOCR(TextSystem):
@@ -11,7 +11,7 @@ from rapid_table import ModelType, RapidTable, RapidTableInput
11
11
  from rapid_table.utils import RapidTableOutput
12
12
  from tqdm import tqdm
13
13
 
14
- from mineru.model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
14
+ from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
15
15
  from mineru.utils.enum_class import ModelPath
16
16
  from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
17
17