mineru 2.6.2__py3-none-any.whl → 2.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -281,28 +281,20 @@ class BatchAnalyze:
281
281
 
282
282
  # 按分辨率分组并同时完成padding
283
283
  # RESOLUTION_GROUP_STRIDE = 32
284
- RESOLUTION_GROUP_STRIDE = 64 # 定义分辨率分组的步进值
284
+ RESOLUTION_GROUP_STRIDE = 64
285
285
 
286
286
  resolution_groups = defaultdict(list)
287
287
  for crop_info in lang_crop_list:
288
288
  cropped_img = crop_info[0]
289
289
  h, w = cropped_img.shape[:2]
290
- # 使用更大的分组容差,减少分组数量
291
- # 将尺寸标准化到32的倍数
292
- normalized_h = ((h + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE # 向上取整到32的倍数
293
- normalized_w = ((w + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
294
- group_key = (normalized_h, normalized_w)
290
+ # 直接计算目标尺寸并用作分组键
291
+ target_h = ((h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
292
+ target_w = ((w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
293
+ group_key = (target_h, target_w)
295
294
  resolution_groups[group_key].append(crop_info)
296
295
 
297
296
  # 对每个分辨率组进行批处理
298
- for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
299
-
300
- # 计算目标尺寸(组内最大尺寸,向上取整到32的倍数)
301
- max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
302
- max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
303
- target_h = ((max_h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
304
- target_w = ((max_w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
305
-
297
+ for (target_h, target_w), group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
306
298
  # 对所有图像进行padding到统一尺寸
307
299
  batch_images = []
308
300
  for crop_info in group_crops:
@@ -310,49 +302,34 @@ class BatchAnalyze:
310
302
  h, w = img.shape[:2]
311
303
  # 创建目标尺寸的白色背景
312
304
  padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
313
- # 将原图像粘贴到左上角
314
305
  padded_img[:h, :w] = img
315
306
  batch_images.append(padded_img)
316
307
 
317
308
  # 批处理检测
318
- det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE) # 增加批处理大小
319
- # logger.debug(f"OCR-det batch: {det_batch_size} images, target size: {target_h}x{target_w}")
309
+ det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
320
310
  batch_results = ocr_model.text_detector.batch_predict(batch_images, det_batch_size)
321
311
 
322
312
  # 处理批处理结果
323
- for i, (crop_info, (dt_boxes, elapse)) in enumerate(zip(group_crops, batch_results)):
313
+ for crop_info, (dt_boxes, _) in zip(group_crops, batch_results):
324
314
  bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
325
315
 
326
316
  if dt_boxes is not None and len(dt_boxes) > 0:
327
- # 直接应用原始OCR流程中的关键处理步骤
328
-
329
- # 1. 排序检测框
330
- if len(dt_boxes) > 0:
331
- dt_boxes_sorted = sorted_boxes(dt_boxes)
332
- else:
333
- dt_boxes_sorted = []
334
-
335
- # 2. 合并相邻检测框
336
- if dt_boxes_sorted:
337
- dt_boxes_merged = merge_det_boxes(dt_boxes_sorted)
338
- else:
339
- dt_boxes_merged = []
340
-
341
- # 3. 根据公式位置更新检测框(关键步骤!)
342
- if dt_boxes_merged and adjusted_mfdetrec_res:
343
- dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
344
- else:
345
- dt_boxes_final = dt_boxes_merged
346
-
347
- # 构造OCR结果格式
348
- ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
349
-
350
- if ocr_res:
317
+ # 处理检测框
318
+ dt_boxes_sorted = sorted_boxes(dt_boxes)
319
+ dt_boxes_merged = merge_det_boxes(dt_boxes_sorted) if dt_boxes_sorted else []
320
+
321
+ # 根据公式位置更新检测框
322
+ dt_boxes_final = (update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
323
+ if dt_boxes_merged and adjusted_mfdetrec_res
324
+ else dt_boxes_merged)
325
+
326
+ if dt_boxes_final:
327
+ ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
351
328
  ocr_result_list = get_ocr_result_list(
352
329
  ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
353
330
  )
354
-
355
331
  ocr_res_list_dict['layout_res'].extend(ocr_result_list)
332
+
356
333
  else:
357
334
  # 原始单张处理模式
358
335
  for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
@@ -8,7 +8,7 @@ from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
8
8
  from ...model.mfd.yolo_v8 import YOLOv8MFDModel
9
9
  from ...model.mfr.unimernet.Unimernet import UnimernetModel
10
10
  from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
11
- from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
11
+ from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
12
12
  from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
13
13
  from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
14
14
  # from ...model.table.rec.RapidTable import RapidTableModel
@@ -148,7 +148,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
148
148
  fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
149
149
 
150
150
  """如果当前页面没有有效的bbox则跳过"""
151
- if len(all_bboxes) == 0:
151
+ if len(all_bboxes) == 0 and len(fix_discarded_blocks) == 0:
152
152
  return None
153
153
 
154
154
  """对image/table/interline_equation截图"""
@@ -99,7 +99,10 @@ def doc_analyze(
99
99
  _lang = lang_list[pdf_idx]
100
100
 
101
101
  # 收集每个数据集中的页面
102
+ # load_images_start = time.time()
102
103
  images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
104
+ # load_images_time = round(time.time() - load_images_start, 2)
105
+ # logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_list) / load_images_time, 3)} images/s")
103
106
  all_image_lists.append(images_list)
104
107
  all_pdf_docs.append(pdf_doc)
105
108
  for page_idx in range(len(images_list)):
@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
191
191
  def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
192
192
  para_type = para_block['type']
193
193
  para_content = {}
194
- if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
194
+ if para_type in [
195
+ BlockType.TEXT,
196
+ BlockType.LIST,
197
+ BlockType.INDEX,
198
+ ]:
195
199
  para_content = {
196
200
  'type': ContentType.TEXT,
197
201
  'text': merge_para_with_text(para_block),
198
202
  }
203
+ elif para_type == BlockType.DISCARDED:
204
+ para_content = {
205
+ 'type': para_type,
206
+ 'text': merge_para_with_text(para_block),
207
+ }
199
208
  elif para_type == BlockType.TITLE:
200
209
  para_content = {
201
210
  'type': ContentType.TEXT,
@@ -268,15 +277,19 @@ def union_make(pdf_info_dict: list,
268
277
  output_content = []
269
278
  for page_info in pdf_info_dict:
270
279
  paras_of_layout = page_info.get('para_blocks')
280
+ paras_of_discarded = page_info.get('discarded_blocks')
271
281
  page_idx = page_info.get('page_idx')
272
282
  page_size = page_info.get('page_size')
273
- if not paras_of_layout:
274
- continue
275
283
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
284
+ if not paras_of_layout:
285
+ continue
276
286
  page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
277
287
  output_content.extend(page_markdown)
278
288
  elif make_mode == MakeMode.CONTENT_LIST:
279
- for para_block in paras_of_layout:
289
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
290
+ if not para_blocks:
291
+ continue
292
+ for para_block in para_blocks:
280
293
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
281
294
  if para_content:
282
295
  output_content.append(para_content)
@@ -8,6 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
8
8
  from .model_output_to_middle_json import result_to_middle_json
9
9
  from ...data.data_reader_writer import DataWriter
10
10
  from mineru.utils.pdf_image_tools import load_images_from_pdf
11
+ from ...utils.check_sys_env import is_mac_os_version_supported
11
12
  from ...utils.config_reader import get_device
12
13
 
13
14
  from ...utils.enum_class import ImageType
@@ -47,7 +48,7 @@ class ModelSingleton:
47
48
  for param in ["batch_size", "max_concurrency", "http_timeout"]:
48
49
  if param in kwargs:
49
50
  del kwargs[param]
50
- if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
51
+ if backend in ['transformers', 'vllm-engine', "vllm-async-engine", "mlx-engine"] and not model_path:
51
52
  model_path = auto_download_and_get_model_root_path("/","vlm")
52
53
  if backend == "transformers":
53
54
  try:
@@ -75,6 +76,15 @@ class ModelSingleton:
75
76
  )
76
77
  if batch_size == 0:
77
78
  batch_size = set_default_batch_size()
79
+ elif backend == "mlx-engine":
80
+ mlx_supported = is_mac_os_version_supported()
81
+ if not mlx_supported:
82
+ raise EnvironmentError("mlx-engine backend is only supported on macOS 13.5+ with Apple Silicon.")
83
+ try:
84
+ from mlx_vlm import load as mlx_load
85
+ except ImportError:
86
+ raise ImportError("Please install mlx-vlm to use the mlx-engine backend.")
87
+ model, processor = mlx_load(model_path)
78
88
  else:
79
89
  if os.getenv('OMP_NUM_THREADS') is None:
80
90
  os.environ["OMP_NUM_THREADS"] = "1"
@@ -167,7 +177,7 @@ async def aio_doc_analyze(
167
177
  images_list, pdf_doc = load_images_from_pdf(pdf_bytes, image_type=ImageType.PIL)
168
178
  images_pil_list = [image_dict["img_pil"] for image_dict in images_list]
169
179
  # load_images_time = round(time.time() - load_images_start, 2)
170
- # logger.info(f"load images cost: {load_images_time}, speed: {round(len(images_base64_list)/load_images_time, 3)} images/s")
180
+ # logger.debug(f"load images cost: {load_images_time}, speed: {round(len(images_pil_list)/load_images_time, 3)} images/s")
171
181
 
172
182
  # infer_start = time.time()
173
183
  results = await predictor.aio_batch_two_step_extract(images=images_pil_list)
@@ -248,13 +248,16 @@ def union_make(pdf_info_dict: list,
248
248
  paras_of_discarded = page_info.get('discarded_blocks')
249
249
  page_idx = page_info.get('page_idx')
250
250
  page_size = page_info.get('page_size')
251
- if not paras_of_layout:
252
- continue
253
251
  if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
252
+ if not paras_of_layout:
253
+ continue
254
254
  page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
255
255
  output_content.extend(page_markdown)
256
256
  elif make_mode == MakeMode.CONTENT_LIST:
257
- for para_block in paras_of_layout+paras_of_discarded:
257
+ para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
258
+ if not para_blocks:
259
+ continue
260
+ for para_block in para_blocks:
258
261
  para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
259
262
  output_content.append(para_content)
260
263
 
mineru/cli/client.py CHANGED
@@ -4,6 +4,7 @@ import click
4
4
  from pathlib import Path
5
5
  from loguru import logger
6
6
 
7
+ from mineru.utils.check_sys_env import is_mac_os_version_supported
7
8
  from mineru.utils.cli_parser import arg_parse
8
9
  from mineru.utils.config_reader import get_device
9
10
  from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
@@ -11,6 +12,11 @@ from mineru.utils.model_utils import get_vram
11
12
  from ..version import __version__
12
13
  from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
13
14
 
15
+
16
+ backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']
17
+ if is_mac_os_version_supported():
18
+ backends.append("vlm-mlx-engine")
19
+
14
20
  @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
15
21
  @click.pass_context
16
22
  @click.version_option(__version__,
@@ -38,25 +44,28 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
38
44
  '--method',
39
45
  'method',
40
46
  type=click.Choice(['auto', 'txt', 'ocr']),
41
- help="""the method for parsing pdf:
42
- auto: Automatically determine the method based on the file type.
43
- txt: Use text extraction method.
44
- ocr: Use OCR method for image-based PDFs.
47
+ help="""\b
48
+ the method for parsing pdf:
49
+ auto: Automatically determine the method based on the file type.
50
+ txt: Use text extraction method.
51
+ ocr: Use OCR method for image-based PDFs.
45
52
  Without method specified, 'auto' will be used by default.
46
- Adapted only for the case where the backend is set to "pipeline".""",
53
+ Adapted only for the case where the backend is set to 'pipeline'.""",
47
54
  default='auto',
48
55
  )
49
56
  @click.option(
50
57
  '-b',
51
58
  '--backend',
52
59
  'backend',
53
- type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
54
- help="""the backend for parsing pdf:
55
- pipeline: More general.
56
- vlm-transformers: More general.
57
- vlm-vllm-engine: Faster(engine).
58
- vlm-http-client: Faster(client).
59
- without method specified, pipeline will be used by default.""",
60
+ type=click.Choice(backends),
61
+ help="""\b
62
+ the backend for parsing pdf:
63
+ pipeline: More general.
64
+ vlm-transformers: More general, but slower.
65
+ vlm-mlx-engine: Faster than transformers.
66
+ vlm-vllm-engine: Faster(engine).
67
+ vlm-http-client: Faster(client).
68
+ Without method specified, pipeline will be used by default.""",
60
69
  default='pipeline',
61
70
  )
62
71
  @click.option(
@@ -66,7 +75,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
66
75
  type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'th', 'el',
67
76
  'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
68
77
  help="""
69
- Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
78
+ Input the languages in the pdf (if known) to improve OCR accuracy.
70
79
  Without languages specified, 'ch' will be used by default.
71
80
  Adapted only for the case where the backend is set to "pipeline".
72
81
  """,
@@ -119,7 +128,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
119
128
  '--device',
120
129
  'device_mode',
121
130
  type=str,
122
- help='Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps". Adapted only for the case where the backend is set to "pipeline". ',
131
+ help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
132
+ Adapted only for the case where the backend is set to "pipeline" and "vlm-transformers". """,
123
133
  default=None,
124
134
  )
125
135
  @click.option(
mineru/cli/common.py CHANGED
@@ -5,8 +5,8 @@ import os
5
5
  import copy
6
6
  from pathlib import Path
7
7
 
8
- import pypdfium2 as pdfium
9
8
  from loguru import logger
9
+ import pypdfium2 as pdfium
10
10
 
11
11
  from mineru.data.data_reader_writer import FileBasedDataWriter
12
12
  from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
@@ -16,10 +16,12 @@ from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
16
16
  from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
17
17
  from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
18
18
  from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
19
+ from mineru.utils.pdf_page_id import get_end_page_id
19
20
 
20
21
  pdf_suffixes = ["pdf"]
21
22
  image_suffixes = ["png", "jpeg", "jp2", "webp", "gif", "bmp", "jpg", "tiff"]
22
23
 
24
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
25
 
24
26
  def read_fn(path):
25
27
  if not isinstance(path, Path):
@@ -44,18 +46,10 @@ def prepare_env(output_dir, pdf_file_name, parse_method):
44
46
 
45
47
 
46
48
  def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
49
+ pdf = pdfium.PdfDocument(pdf_bytes)
50
+ output_pdf = pdfium.PdfDocument.new()
47
51
  try:
48
- # 从字节数据加载PDF
49
- pdf = pdfium.PdfDocument(pdf_bytes)
50
-
51
- # 确定结束页
52
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf) - 1
53
- if end_page_id > len(pdf) - 1:
54
- logger.warning("end_page_id is out of range, use pdf_docs length")
55
- end_page_id = len(pdf) - 1
56
-
57
- # 创建一个新的PDF文档
58
- output_pdf = pdfium.PdfDocument.new()
52
+ end_page_id = get_end_page_id(end_page_id, len(pdf))
59
53
 
60
54
  # 选择要导入的页面索引
61
55
  page_indices = list(range(start_page_id, end_page_id + 1))
@@ -69,13 +63,12 @@ def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page
69
63
 
70
64
  # 获取字节数据
71
65
  output_bytes = output_buffer.getvalue()
72
-
73
- pdf.close() # 关闭原PDF文档以释放资源
74
- output_pdf.close() # 关闭新PDF文档以释放资源
75
66
  except Exception as e:
76
67
  logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
77
68
  output_bytes = pdf_bytes
78
69
 
70
+ pdf.close()
71
+ output_pdf.close()
79
72
  return output_bytes
80
73
 
81
74
 
mineru/cli/gradio_app.py CHANGED
@@ -13,6 +13,7 @@ from gradio_pdf import PDF
13
13
  from loguru import logger
14
14
 
15
15
  from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
16
+ from mineru.utils.check_sys_env import is_mac_os_version_supported
16
17
  from mineru.utils.cli_parser import arg_parse
17
18
  from mineru.utils.hash_utils import str_sha256
18
19
 
@@ -273,7 +274,7 @@ def to_pdf(file_path):
273
274
 
274
275
  # 更新界面函数
275
276
  def update_interface(backend_choice):
276
- if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
277
+ if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine", "vlm-mlx-engine"]:
277
278
  return gr.update(visible=False), gr.update(visible=False)
278
279
  elif backend_choice in ["vlm-http-client"]:
279
280
  return gr.update(visible=True), gr.update(visible=False)
@@ -381,6 +382,8 @@ def main(ctx,
381
382
  preferred_option = "vlm-vllm-async-engine"
382
383
  else:
383
384
  drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
385
+ if is_mac_os_version_supported():
386
+ drop_list.append("vlm-mlx-engine")
384
387
  preferred_option = "pipeline"
385
388
  backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
386
389
  with gr.Row(visible=False) as client_options:
@@ -21,7 +21,7 @@ def download_and_modify_json(url, local_filename, modifications):
21
21
  if os.path.exists(local_filename):
22
22
  data = json.load(open(local_filename))
23
23
  config_version = data.get('config_version', '0.0.0')
24
- if config_version < '1.3.0':
24
+ if config_version < '1.3.1':
25
25
  data = download_json(url)
26
26
  else:
27
27
  data = download_json(url)
@@ -134,7 +134,7 @@ def get_model_params(lang, config):
134
134
  raise Exception (f'Language {lang} not supported')
135
135
 
136
136
 
137
- root_dir = os.path.join(Path(__file__).resolve().parent.parent.parent, 'utils')
137
+ root_dir = os.path.join(Path(__file__).resolve().parent.parent, 'utils')
138
138
 
139
139
 
140
140
  class PytorchPaddleOCR(TextSystem):
@@ -11,7 +11,7 @@ from rapid_table import ModelType, RapidTable, RapidTableInput
11
11
  from rapid_table.utils import RapidTableOutput
12
12
  from tqdm import tqdm
13
13
 
14
- from mineru.model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
14
+ from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
15
15
  from mineru.utils.enum_class import ModelPath
16
16
  from mineru.utils.models_download_utils import auto_download_and_get_model_root_path
17
17
 
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Tuple
16
16
 
17
17
  import numpy as np
18
18
 
19
+ from mineru.utils.os_env_config import get_op_num_threads
19
20
  from .table_structure_utils import (
20
21
  OrtInferSession,
21
22
  TableLabelDecode,
@@ -29,6 +30,9 @@ class TableStructurer:
29
30
  self.preprocess_op = TablePreprocess()
30
31
  self.batch_preprocess_op = BatchTablePreprocess()
31
32
 
33
+ config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
34
+ config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
35
+
32
36
  self.session = OrtInferSession(config)
33
37
 
34
38
  self.character = self.session.get_metadata()
@@ -5,6 +5,8 @@ from typing import Optional, Dict, Any, Tuple
5
5
  import cv2
6
6
  import numpy as np
7
7
  from skimage import measure
8
+
9
+ from mineru.utils.os_env_config import get_op_num_threads
8
10
  from .utils import OrtInferSession, resize_img
9
11
  from .utils_table_line_rec import (
10
12
  get_table_line,
@@ -28,6 +30,9 @@ class TSRUnet:
28
30
  self.inp_height = 1024
29
31
  self.inp_width = 1024
30
32
 
33
+ config["intra_op_num_threads"] = get_op_num_threads("MINERU_INTRA_OP_NUM_THREADS")
34
+ config["inter_op_num_threads"] = get_op_num_threads("MINERU_INTER_OP_NUM_THREADS")
35
+
31
36
  self.session = OrtInferSession(config)
32
37
 
33
38
  def __call__(
@@ -179,13 +179,14 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
179
179
  def model_init(model_name: str):
180
180
  from transformers import LayoutLMv3ForTokenClassification
181
181
  device_name = get_device()
182
+ device = torch.device(device_name)
182
183
  bf_16_support = False
183
184
  if device_name.startswith("cuda"):
184
- bf_16_support = torch.cuda.is_bf16_supported()
185
+ if torch.cuda.get_device_properties(device).major >= 8:
186
+ bf_16_support = True
185
187
  elif device_name.startswith("mps"):
186
188
  bf_16_support = True
187
189
 
188
- device = torch.device(device_name)
189
190
  if model_name == 'layoutreader':
190
191
  # 检测modelscope的缓存目录是否存在
191
192
  layoutreader_model_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.layout_reader), ModelPath.layout_reader)
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ import platform
3
+
4
+ from packaging import version
5
+
6
+
7
+ def is_windows_environment() -> bool:
8
+ return platform.system() == "Windows"
9
+
10
+
11
+ # Detect if the current environment is a Mac computer
12
+ def is_mac_environment() -> bool:
13
+ return platform.system() == "Darwin"
14
+
15
+
16
+ # Detect if CPU is Apple Silicon architecture
17
+ def is_apple_silicon_cpu() -> bool:
18
+ return platform.machine() in ["arm64", "aarch64"]
19
+
20
+
21
+ # If Mac computer with Apple Silicon architecture, check if macOS version is 13.5 or above
22
+ def is_mac_os_version_supported(min_version: str = "13.5") -> bool:
23
+ if not is_mac_environment() or not is_apple_silicon_cpu():
24
+ return False
25
+ mac_version = platform.mac_ver()[0]
26
+ if not mac_version:
27
+ return False
28
+ # print("Mac OS Version:", mac_version)
29
+ return version.parse(mac_version) >= version.parse(min_version)
30
+
31
+ if __name__ == "__main__":
32
+ print("Is Mac Environment:", is_mac_environment())
33
+ print("Is Apple Silicon CPU:", is_apple_silicon_cpu())
34
+ print("Is Mac OS Version Supported (>=13.5):", is_mac_os_version_supported())
mineru/utils/llm_aided.py CHANGED
@@ -84,16 +84,21 @@ Corrected title list:
84
84
  max_retries = 3
85
85
  dict_completion = None
86
86
 
87
+ # Build API call parameters
88
+ api_params = {
89
+ "model": title_aided_config["model"],
90
+ "messages": [{'role': 'user', 'content': title_optimize_prompt}],
91
+ "temperature": 0.7,
92
+ "stream": True,
93
+ }
94
+
95
+ # Only add extra_body when explicitly specified in config
96
+ if "enable_thinking" in title_aided_config:
97
+ api_params["extra_body"] = {"enable_thinking": title_aided_config["enable_thinking"]}
98
+
87
99
  while retry_count < max_retries:
88
100
  try:
89
- completion = client.chat.completions.create(
90
- model=title_aided_config["model"],
91
- messages=[
92
- {'role': 'user', 'content': title_optimize_prompt}],
93
- extra_body={"enable_thinking": False},
94
- temperature=0.7,
95
- stream=True,
96
- )
101
+ completion = client.chat.completions.create(**api_params)
97
102
  content_pieces = []
98
103
  for chunk in completion:
99
104
  if chunk.choices and chunk.choices[0].delta.content is not None:
@@ -0,0 +1,30 @@
1
+ import os
2
+
3
+
4
+ def get_op_num_threads(env_name: str) -> int:
5
+ env_value = os.getenv(env_name, None)
6
+ return get_value_from_string(env_value, -1)
7
+
8
+
9
+ def get_load_images_timeout() -> int:
10
+ env_value = os.getenv('MINERU_PDF_RENDER_TIMEOUT', None)
11
+ return get_value_from_string(env_value, 300)
12
+
13
+
14
+ def get_value_from_string(env_value: str, default_value: int) -> int:
15
+ if env_value is not None:
16
+ try:
17
+ num_threads = int(env_value)
18
+ if num_threads > 0:
19
+ return num_threads
20
+ except ValueError:
21
+ return default_value
22
+ return default_value
23
+
24
+
25
+ if __name__ == '__main__':
26
+ print(get_value_from_string('1', -1))
27
+ print(get_value_from_string('0', -1))
28
+ print(get_value_from_string('-1', -1))
29
+ print(get_value_from_string('abc', -1))
30
+ print(get_load_images_timeout())
@@ -1,4 +1,5 @@
1
1
  # Copyright (c) Opendatalab. All rights reserved.
2
+ import os
2
3
  from io import BytesIO
3
4
 
4
5
  import numpy as np
@@ -7,9 +8,14 @@ from loguru import logger
7
8
  from PIL import Image
8
9
 
9
10
  from mineru.data.data_reader_writer import FileBasedDataWriter
11
+ from mineru.utils.check_sys_env import is_windows_environment
12
+ from mineru.utils.os_env_config import get_load_images_timeout
10
13
  from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
11
- from .enum_class import ImageType
12
- from .hash_utils import str_sha256
14
+ from mineru.utils.enum_class import ImageType
15
+ from mineru.utils.hash_utils import str_sha256
16
+ from mineru.utils.pdf_page_id import get_end_page_id
17
+
18
+ from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
13
19
 
14
20
 
15
21
  def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
@@ -35,7 +41,106 @@ def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -
35
41
  return image_dict
36
42
 
37
43
 
44
+ def _load_images_from_pdf_worker(pdf_bytes, dpi, start_page_id, end_page_id, image_type):
45
+ """用于进程池的包装函数"""
46
+ return load_images_from_pdf_core(pdf_bytes, dpi, start_page_id, end_page_id, image_type)
47
+
48
+
38
49
  def load_images_from_pdf(
50
+ pdf_bytes: bytes,
51
+ dpi=200,
52
+ start_page_id=0,
53
+ end_page_id=None,
54
+ image_type=ImageType.PIL,
55
+ timeout=None,
56
+ threads=4,
57
+ ):
58
+ """带超时控制的 PDF 转图片函数,支持多进程加速
59
+
60
+ Args:
61
+ pdf_bytes (bytes): PDF 文件的 bytes
62
+ dpi (int, optional): resolution, in DPI, at which pages are rendered. Defaults to 200.
63
+ start_page_id (int, optional): 起始页码. Defaults to 0.
64
+ end_page_id (int | None, optional): 结束页码. Defaults to None.
65
+ image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
66
+ timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_RENDER_TIMEOUT 读取,若未设置则默认为 300 秒。
67
+ threads (int): 进程数,默认 4
68
+
69
+ Raises:
70
+ TimeoutError: 当转换超时时抛出
71
+ """
72
+ pdf_doc = pdfium.PdfDocument(pdf_bytes)
73
+ if is_windows_environment():
74
+ # Windows 环境下不使用多进程
75
+ return load_images_from_pdf_core(
76
+ pdf_bytes,
77
+ dpi,
78
+ start_page_id,
79
+ get_end_page_id(end_page_id, len(pdf_doc)),
80
+ image_type
81
+ ), pdf_doc
82
+ else:
83
+ if timeout is None:
84
+ timeout = get_load_images_timeout()
85
+ end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
86
+
87
+ # 计算总页数
88
+ total_pages = end_page_id - start_page_id + 1
89
+
90
+ # 实际使用的进程数不超过总页数
91
+ actual_threads = min(os.cpu_count() or 1, threads, total_pages)
92
+
93
+ # 根据实际进程数分组页面范围
94
+ pages_per_thread = max(1, total_pages // actual_threads)
95
+ page_ranges = []
96
+
97
+ for i in range(actual_threads):
98
+ range_start = start_page_id + i * pages_per_thread
99
+ if i == actual_threads - 1:
100
+ # 最后一个进程处理剩余所有页面
101
+ range_end = end_page_id
102
+ else:
103
+ range_end = start_page_id + (i + 1) * pages_per_thread - 1
104
+
105
+ page_ranges.append((range_start, range_end))
106
+
107
+ # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
108
+
109
+ with ProcessPoolExecutor(max_workers=actual_threads) as executor:
110
+ # 提交所有任务
111
+ futures = []
112
+ for range_start, range_end in page_ranges:
113
+ future = executor.submit(
114
+ _load_images_from_pdf_worker,
115
+ pdf_bytes,
116
+ dpi,
117
+ range_start,
118
+ range_end,
119
+ image_type
120
+ )
121
+ futures.append((range_start, future))
122
+
123
+ try:
124
+ # 收集结果并按页码排序
125
+ all_results = []
126
+ for range_start, future in futures:
127
+ images_list = future.result(timeout=timeout)
128
+ all_results.append((range_start, images_list))
129
+
130
+ # 按起始页码排序并合并结果
131
+ all_results.sort(key=lambda x: x[0])
132
+ images_list = []
133
+ for _, imgs in all_results:
134
+ images_list.extend(imgs)
135
+
136
+ return images_list, pdf_doc
137
+ except FuturesTimeoutError:
138
+ pdf_doc.close()
139
+ executor.shutdown(wait=False, cancel_futures=True)
140
+ raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
141
+
142
+
143
+ def load_images_from_pdf_core(
39
144
  pdf_bytes: bytes,
40
145
  dpi=200,
41
146
  start_page_id=0,
@@ -45,18 +150,17 @@ def load_images_from_pdf(
45
150
  images_list = []
46
151
  pdf_doc = pdfium.PdfDocument(pdf_bytes)
47
152
  pdf_page_num = len(pdf_doc)
48
- end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
49
- if end_page_id > pdf_page_num - 1:
50
- logger.warning("end_page_id is out of range, use images length")
51
- end_page_id = pdf_page_num - 1
52
-
53
- for index in range(0, pdf_page_num):
54
- if start_page_id <= index <= end_page_id:
55
- page = pdf_doc[index]
56
- image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
57
- images_list.append(image_dict)
58
-
59
- return images_list, pdf_doc
153
+ end_page_id = get_end_page_id(end_page_id, pdf_page_num)
154
+
155
+ for index in range(start_page_id, end_page_id + 1):
156
+ # logger.debug(f"Converting page {index}/{pdf_page_num} to image")
157
+ page = pdf_doc[index]
158
+ image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
159
+ images_list.append(image_dict)
160
+
161
+ pdf_doc.close()
162
+
163
+ return images_list
60
164
 
61
165
 
62
166
  def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
@@ -0,0 +1,10 @@
1
+ # Copyright (c) Opendatalab. All rights reserved.
2
+ from loguru import logger
3
+
4
+
5
+ def get_end_page_id(end_page_id, pdf_page_num):
6
+ end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
7
+ if end_page_id > pdf_page_num - 1:
8
+ logger.warning("end_page_id is out of range, use images length")
9
+ end_page_id = pdf_page_num - 1
10
+ return end_page_id
mineru/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "2.6.2"
1
+ __version__ = "2.6.4"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mineru
3
- Version: 2.6.2
3
+ Version: 2.6.4
4
4
  Summary: A practical tool for converting PDF to Markdown
5
5
  License: AGPL-3.0
6
6
  Project-URL: homepage, https://mineru.net/
@@ -37,7 +37,7 @@ Requires-Dist: scikit-image<1.0.0,>=0.25.0
37
37
  Requires-Dist: openai<3,>=1.70.0
38
38
  Requires-Dist: beautifulsoup4<5,>=4.13.5
39
39
  Requires-Dist: magika<0.7.0,>=0.6.2
40
- Requires-Dist: mineru-vl-utils<1,>=0.1.14
40
+ Requires-Dist: mineru-vl-utils<1,>=0.1.15
41
41
  Provides-Extra: test
42
42
  Requires-Dist: mineru[core]; extra == "test"
43
43
  Requires-Dist: pytest; extra == "test"
@@ -50,6 +50,8 @@ Requires-Dist: transformers<5.0.0,>=4.51.1; extra == "vlm"
50
50
  Requires-Dist: accelerate>=1.5.1; extra == "vlm"
51
51
  Provides-Extra: vllm
52
52
  Requires-Dist: vllm<0.12,>=0.10.1.1; extra == "vllm"
53
+ Provides-Extra: mlx
54
+ Requires-Dist: mlx-vlm<0.4,>=0.3.3; extra == "mlx"
53
55
  Provides-Extra: pipeline
54
56
  Requires-Dist: matplotlib<4,>=3.10; extra == "pipeline"
55
57
  Requires-Dist: ultralytics<9,>=8.3.48; extra == "pipeline"
@@ -76,6 +78,7 @@ Requires-Dist: mineru[vlm]; extra == "core"
76
78
  Requires-Dist: mineru[pipeline]; extra == "core"
77
79
  Requires-Dist: mineru[api]; extra == "core"
78
80
  Requires-Dist: mineru[gradio]; extra == "core"
81
+ Requires-Dist: mineru[mlx]; sys_platform == "darwin" and extra == "core"
79
82
  Provides-Extra: all
80
83
  Requires-Dist: mineru[core]; extra == "all"
81
84
  Requires-Dist: mineru[vllm]; extra == "all"
@@ -127,6 +130,14 @@ Dynamic: license-file
127
130
  </div>
128
131
 
129
132
  # Changelog
133
+ - 2025/11/04 2.6.4 Release
134
+ - Added a timeout for PDF image rendering. The default is 300 seconds and can be configured via the environment variable `MINERU_PDF_RENDER_TIMEOUT`; this prevents abnormal PDF files from blocking the rendering process for a long time.
135
+ - Added CPU thread count configuration options for ONNX models. The default is the system CPU core count and can be configured via the environment variables `MINERU_INTRA_OP_NUM_THREADS` and `MINERU_INTER_OP_NUM_THREADS`; this reduces CPU resource contention in high-concurrency scenarios.
136
+
137
+ - 2025/10/31 2.6.3 Release
138
+ - Added support for a new backend `vlm-mlx-engine`, enabling MLX-accelerated inference for the MinerU2.5 model on Apple Silicon devices. Compared to the `vlm-transformers` backend, `vlm-mlx-engine` delivers a 100%–200% speed improvement.
139
+ - Bug fixes: #3849, #3859
140
+
130
141
  - 2025/10/24 2.6.2 Release
131
142
  - `pipeline` backend optimizations
132
143
  - Added experimental support for Chinese formulas, which can be enabled by setting the environment variable `export MINERU_FORMULA_CH_SUPPORT=1`. This feature may cause a slight decrease in MFR speed and failures in recognizing some long formulas. It is recommended to enable it only when parsing Chinese formulas is needed. To disable this feature, set the environment variable to `0`.
@@ -666,7 +677,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
666
677
  - Automatically recognize and convert formulas in the document to LaTeX format.
667
678
  - Automatically recognize and convert tables in the document to HTML format.
668
679
  - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
669
- - OCR supports detection and recognition of 84 languages.
680
+ - OCR supports detection and recognition of 109 languages.
670
681
  - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
671
682
  - Supports various visualization results, including layout visualization and span visualization, for efficient confirmation of output quality.
672
683
  - Supports running in a pure CPU environment, and also supports GPU(CUDA)/NPU(CANN)/MPS acceleration
@@ -703,41 +714,70 @@ A WebUI developed based on Gradio, with a simple interface and only core parsing
703
714
  > In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
704
715
 
705
716
  <table>
706
- <tr>
707
- <td>Parsing Backend</td>
708
- <td>pipeline</td>
709
- <td>vlm-transformers</td>
710
- <td>vlm-vllm</td>
711
- </tr>
712
- <tr>
713
- <td>Operating System</td>
714
- <td>Linux / Windows / macOS</td>
715
- <td>Linux / Windows</td>
716
- <td>Linux / Windows (via WSL2)</td>
717
- </tr>
718
- <tr>
719
- <td>CPU Inference Support</td>
720
- <td>✅</td>
721
- <td colspan="2">❌</td>
722
- </tr>
723
- <tr>
724
- <td>GPU Requirements</td>
725
- <td>Turing architecture and later, 6GB+ VRAM or Apple Silicon</td>
726
- <td colspan="2">Turing architecture and later, 8GB+ VRAM</td>
727
- </tr>
728
- <tr>
729
- <td>Memory Requirements</td>
730
- <td colspan="3">Minimum 16GB+, recommended 32GB+</td>
731
- </tr>
732
- <tr>
733
- <td>Disk Space Requirements</td>
734
- <td colspan="3">20GB+, SSD recommended</td>
735
- </tr>
736
- <tr>
737
- <td>Python Version</td>
738
- <td colspan="3">3.10-3.13</td>
739
- </tr>
717
+ <thead>
718
+ <tr>
719
+ <th rowspan="2">Parsing Backend</th>
720
+ <th rowspan="2">pipeline <br> (Accuracy<sup>1</sup> 82+)</th>
721
+ <th colspan="4">vlm (Accuracy<sup>1</sup> 90+)</th>
722
+ </tr>
723
+ <tr>
724
+ <th>transformers</th>
725
+ <th>mlx-engine</th>
726
+ <th>vllm-engine / <br>vllm-async-engine</th>
727
+ <th>http-client</th>
728
+ </tr>
729
+ </thead>
730
+ <tbody>
731
+ <tr>
732
+ <th>Backend Features</th>
733
+ <td>Fast, no hallucinations</td>
734
+ <td>Good compatibility, <br>but slower</td>
735
+ <td>Faster than transformers</td>
736
+ <td>Fast, compatible with the vLLM ecosystem</td>
737
+ <td>Suitable for OpenAI-compatible servers<sup>5</sup></td>
738
+ </tr>
739
+ <tr>
740
+ <th>Operating System</th>
741
+ <td colspan="2" style="text-align:center;">Linux<sup>2</sup> / Windows / macOS</td>
742
+ <td style="text-align:center;">macOS<sup>3</sup></td>
743
+ <td style="text-align:center;">Linux<sup>2</sup> / Windows<sup>4</sup> </td>
744
+ <td>Any</td>
745
+ </tr>
746
+ <tr>
747
+ <th>CPU inference support</th>
748
+ <td colspan="2" style="text-align:center;">✅</td>
749
+ <td colspan="2" style="text-align:center;">❌</td>
750
+ <td>Not required</td>
751
+ </tr>
752
+ <tr>
753
+ <th>GPU Requirements</th><td colspan="2" style="text-align:center;">Volta or later architectures, 6 GB VRAM or more, or Apple Silicon</td>
754
+ <td>Apple Silicon</td>
755
+ <td>Volta or later architectures, 8 GB VRAM or more</td>
756
+ <td>Not required</td>
757
+ </tr>
758
+ <tr>
759
+ <th>Memory Requirements</th>
760
+ <td colspan="4" style="text-align:center;">Minimum 16 GB, 32 GB recommended</td>
761
+ <td>8 GB</td>
762
+ </tr>
763
+ <tr>
764
+ <th>Disk Space Requirements</th>
765
+ <td colspan="4" style="text-align:center;">20 GB or more, SSD recommended</td>
766
+ <td>2 GB</td>
767
+ </tr>
768
+ <tr>
769
+ <th>Python Version</th>
770
+ <td colspan="5" style="text-align:center;">3.10-3.13</td>
771
+ </tr>
772
+ </tbody>
740
773
  </table>
774
+
775
+ <sup>1</sup> Accuracy metric is the End-to-End Evaluation Overall score of OmniDocBench (v1.5), tested on the latest `MinerU` version.
776
+ <sup>2</sup> Linux supports only distributions released in 2019 or later.
777
+ <sup>3</sup> MLX requires macOS 13.5 or later; macOS 14.0 or higher is recommended.
778
+ <sup>4</sup> On Windows, vLLM is supported via WSL2 (Windows Subsystem for Linux).
779
+ <sup>5</sup> Servers compatible with the OpenAI API, such as local or remote model services deployed via inference frameworks like `vLLM`, `SGLang`, or `LMDeploy`.
780
+
741
781
 
742
782
  ### Install MinerU
743
783
 
@@ -1,28 +1,28 @@
1
1
  mineru/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
2
- mineru/version.py,sha256=53Sii4w6BIWn-1RhaTyqUO46gDe4nDCRQDAcpsWFH24,22
2
+ mineru/version.py,sha256=ODIwI6SfzWmx_FdtwCfr6k5TmpNuA5JdvGyV-9G9YrM,22
3
3
  mineru/backend/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
4
4
  mineru/backend/utils.py,sha256=GLJU3IznDmhE1_qNmkU1UOtsuskIHBezgsEVO6Uar-Y,698
5
5
  mineru/backend/pipeline/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
6
- mineru/backend/pipeline/batch_analyze.py,sha256=dOnktvOMjfg84w1H34YlJg6N9_x6Yfvf14NIpOQcZqQ,22221
7
- mineru/backend/pipeline/model_init.py,sha256=OfB2MMjNmZcHl4fkqS1fT5R8I3LVoSKAHGtl8PcBfBs,9372
8
- mineru/backend/pipeline/model_json_to_middle_json.py,sha256=DtB7kE_7CtxwOMcb6QYeKzY6vMwUJNpavc5fn9z9oiI,10916
6
+ mineru/backend/pipeline/batch_analyze.py,sha256=gnilKhFlMe8-55X2PJnb-ZSVeZIS-5DxIbMpHnwLne8,20889
7
+ mineru/backend/pipeline/model_init.py,sha256=OAylOcQD9gu5TBcX7nMt7X5NpJMtQICI5IvEQ648lpI,9358
8
+ mineru/backend/pipeline/model_json_to_middle_json.py,sha256=reXkUR_wKmJD64d7vRNXMxFviwkzDlGjRshpdwsVquI,10951
9
9
  mineru/backend/pipeline/model_list.py,sha256=7cXMBfZrP0K6qWueg1D_-WoUANeSINzkn_ic9E7YQLs,222
10
10
  mineru/backend/pipeline/para_split.py,sha256=Kq95MmvkPm7rKxlCSGiTvVKyF7CErHI2eGGAs5sLl0Q,17119
11
- mineru/backend/pipeline/pipeline_analyze.py,sha256=rbO5AetOdnxR5ctkoDzFCFoElkz7Jgb7gi2Ct596NK8,6655
11
+ mineru/backend/pipeline/pipeline_analyze.py,sha256=O_HGifodg03VZbmTve-U6Cmo0T03AmuK86t1v1J9X-Q,6897
12
12
  mineru/backend/pipeline/pipeline_magic_model.py,sha256=w8jGx8f6yZN0Wf2yPP3L9rYKc9rogxreZCrUJzJvPO8,14974
13
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=xWWOFmYL6hB8PLrxQFyRJ72dAmTIDHtqiWV-WFUfR44,14081
13
+ mineru/backend/pipeline/pipeline_middle_json_mkcontent.py,sha256=YlnEbbUnkniZXS13aLo5mjfFQvQM5SrIVvTAGBZsLmw,14478
14
14
  mineru/backend/vlm/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
15
15
  mineru/backend/vlm/model_output_to_middle_json.py,sha256=AqYX44gS9crUO_t7SuUatD71EVjow6pI6yA2Ik3gQ0s,5139
16
16
  mineru/backend/vlm/utils.py,sha256=woGqyRI4S7p69daLCU07XNXWTV27aLf7YBjjVH1x-5o,2794
17
- mineru/backend/vlm/vlm_analyze.py,sha256=nzwTGndwZFfTEvHppakyDKZxph7SYOuUZW3johY5F8c,8154
17
+ mineru/backend/vlm/vlm_analyze.py,sha256=EQKNtc12pQ6so5NuUE-ppUtWI1QH_CQnsx1QfHdzAwA,8790
18
18
  mineru/backend/vlm/vlm_magic_model.py,sha256=Pd0sOr7G1crAJIVeq6h_03gNSuxmV5U8dvGTGT_rrjs,23452
19
- mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=Ie95XpwTgi7EmidcwE_scvXMRQjE2xASU_Rm_F8EP-I,13377
19
+ mineru/backend/vlm/vlm_middle_json_mkcontent.py,sha256=5V-AU9KkxxMn0DDSQBrb15I4GVpEyiQy8uNI_tQhS6M,13498
20
20
  mineru/cli/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
21
- mineru/cli/client.py,sha256=uo7db9Wqj1Mc11MYuaM-bi54BfKKU3SFB9Urc8md5X4,6641
22
- mineru/cli/common.py,sha256=jxFJMdc-02UMO3SXAtcZ6aIdPrakAE6DCccZ9kDlPKc,14276
21
+ mineru/cli/client.py,sha256=ul2Twu-MWT2pCPrtvWbhIwWnoR6aurHJ3KhFOmElP90,6915
22
+ mineru/cli/common.py,sha256=3kd6sF6BlnBNL_UeMjXKJ11fGQA4Y9lOckznWNiIWY8,13988
23
23
  mineru/cli/fast_api.py,sha256=t5bda769VbM5iokAboiJfPIOnm-r5GTFReE-KQy8L3g,10941
24
- mineru/cli/gradio_app.py,sha256=8rMdW7grwBUn0MdXyG4eOTQUzKWq6nErtMWl-vGdWbU,14525
25
- mineru/cli/models_download.py,sha256=7KA-Boe-eIt3WW6eyaxM1HfubTXLsQ8sMmT1H1X7vAc,4815
24
+ mineru/cli/gradio_app.py,sha256=hyhI38y-JahMJgYZiikC3CYUVrtYVjbZb67Q4RUKbw4,14731
25
+ mineru/cli/models_download.py,sha256=LNfoIpUlJM7m7qb2SiCxtjMDw4jILBQtZwNP2JoY81U,4815
26
26
  mineru/cli/vlm_vllm_server.py,sha256=fQJyD-gIPQ41hR_6aIaDJczl66N310t0CiZEBAfX5mc,90
27
27
  mineru/data/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
28
28
  mineru/data/data_reader_writer/__init__.py,sha256=9qnGNrsuGBMwwfsQy6oChdkz--a_LPdYWE0VZZr0yr4,490
@@ -62,8 +62,7 @@ mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
62
62
  mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py,sha256=a9kCvwzJJSRrKQNtW2oOpTwrapzep8BjGFWLhLF1T0k,6036
63
63
  mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py,sha256=Q_fdmFHUBtEoAfWp9aowdwTCE2MIFMOPbYjoSyXK2iU,48929
64
64
  mineru/model/ocr/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
65
- mineru/model/ocr/paddleocr2pytorch/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
66
- mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=wZOw82q1NARNHBW2Lk5zumjdAqzPZqnhV6rvMULvLs8,9207
65
+ mineru/model/ocr/pytorch_paddle.py,sha256=cHMTl5sKyn4BY2207-7GQ4eZl9BQUcs5ucxw_NFezII,9200
67
66
  mineru/model/ori_cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
68
67
  mineru/model/ori_cls/paddle_ori_cls.py,sha256=VIS22IerHST7g60AC9r2PEQIG6NQWeQaH1OrXIxNTsg,11943
69
68
  mineru/model/reading_order/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
@@ -72,18 +71,18 @@ mineru/model/reading_order/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2u
72
71
  mineru/model/table/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
73
72
  mineru/model/table/cls/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
74
73
  mineru/model/table/cls/paddle_table_cls.py,sha256=5PtieKQnAzgMNRTZFgnqQsGWKTEQ3yyFWQnBRIjfQ4A,5781
75
- mineru/model/table/rec/RapidTable.py,sha256=FxO3dLNKfQrgcQU7gRI0kLAxllnoHWZptCtyyHNuMpM,5973
74
+ mineru/model/table/rec/RapidTable.py,sha256=2dNdGJsVdsGfRm6r3deERUMst5RIxH0YuiGALkQbNTw,5955
76
75
  mineru/model/table/rec/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
77
76
  mineru/model/table/rec/slanet_plus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
78
77
  mineru/model/table/rec/slanet_plus/main.py,sha256=vfrcvQ9JBf32YZU9eNoetoqdpcrFNsA1WNqQBsG8i2o,7646
79
78
  mineru/model/table/rec/slanet_plus/matcher.py,sha256=uwF-wCLaYlaQ3JQ_-YywGVl1XQYnx7G_RTuWLW8JlBk,7321
80
79
  mineru/model/table/rec/slanet_plus/matcher_utils.py,sha256=9wt_ydeeViLd57bU6g3lnXXni49qLSra2C6wSFQZkiw,9597
81
- mineru/model/table/rec/slanet_plus/table_structure.py,sha256=Ve9eUdA0ivHf5bf9gwvHHfb7-E7drJLP3S3MPlh3uZ0,3844
80
+ mineru/model/table/rec/slanet_plus/table_structure.py,sha256=qt-HPYIQyp0aWG_MmnM_sMQCV8ZLb4rALSueyCohPgM,4085
82
81
  mineru/model/table/rec/slanet_plus/table_structure_utils.py,sha256=YYSkwN2WdLx7qkWMSGkPY7yXOH5ENVhg5CsRGhtZ5Wk,19281
83
82
  mineru/model/table/rec/unet_table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
83
  mineru/model/table/rec/unet_table/main.py,sha256=J13Q7_6stYyedmVedf9CZD7R0tuguGfTg3Z3ob4GDuM,15565
85
84
  mineru/model/table/rec/unet_table/table_recover.py,sha256=rSyeWyuP10M8dLKA5e0n4P2DXMYbVbmgLxEcdZA8_0E,9059
86
- mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=beBMmBHAOR2lAuf2rcOKRSbFaJqwuIgMJWxWQsFmIRI,7908
85
+ mineru/model/table/rec/unet_table/table_structure_unet.py,sha256=hnmYLzZFRlK0Y4gr874G9GaLahcKnNZYNun869FdmH8,8150
87
86
  mineru/model/table/rec/unet_table/utils.py,sha256=CYAqJW0wePJk4NAemb8W203N7E32v0ujiWbxanDhd8I,16083
88
87
  mineru/model/table/rec/unet_table/utils_table_line_rec.py,sha256=zrCdPwI4M8nu0FEfd7lRJAe0z8kYq3KFbzwElM82USE,11174
89
88
  mineru/model/table/rec/unet_table/utils_table_recover.py,sha256=XksJsY82ZS0kqUnNT-jvaYzxJ3V3svMSzj0puwIau1k,10651
@@ -152,8 +151,9 @@ mineru/resources/header.html,sha256=PUselBXLBn8gfeP3zwEtj6zIxfhcCN4vN_B796nQFNQ,
152
151
  mineru/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
153
152
  mineru/utils/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
154
153
  mineru/utils/block_pre_proc.py,sha256=uGBmxf2MR9bplTnQI8xHjCI-kj3plRhJr0hcWKidbOQ,9632
155
- mineru/utils/block_sort.py,sha256=mViceDw3O2ksBDFxt-wmX67bCZOwKyp68yZnEjS3Ijc,12934
154
+ mineru/utils/block_sort.py,sha256=5e1mOLB3W7xu5Y1hmhvGSHPL_aQ41R_4VXcP4vjYAOU,12976
156
155
  mineru/utils/boxbase.py,sha256=moP660AmZq_udHEsfvFkTQdJ4gjrrBwN7t0Enx7CIL8,6903
156
+ mineru/utils/check_sys_env.py,sha256=1o7Do3k84Hnwvlnmzx8JqkcGJA3UqiGfucMv9sPgPyI,1113
157
157
  mineru/utils/cli_parser.py,sha256=4seFAu1kulsYnw6WM2q_cxgEOt2tErZVkI-LNEF_kGw,1445
158
158
  mineru/utils/config_reader.py,sha256=IRVWTpBnbnRpck6eXZUKw-fcLt7hon5S4uqWW-RBb1w,4075
159
159
  mineru/utils/cut_image.py,sha256=g3m4nfcJNWlxi-P0kpXTtlmspXkMcLCfGwmYuQ-Z2hE,751
@@ -163,22 +163,24 @@ mineru/utils/format_utils.py,sha256=2s89vHcSISjuolk8Hvg3K-5-rRbiT3Us7eFLzUKrNKs,
163
163
  mineru/utils/guess_suffix_or_lang.py,sha256=nznyQpUn1BSA8JNw9HuG3pVV-xtVAtrtcGuHZ-VXt9M,856
164
164
  mineru/utils/hash_utils.py,sha256=UPS_8NRBmVumdyOv16Lmv6Ly2xK8OVDJEe5gG6gKIFk,857
165
165
  mineru/utils/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
166
- mineru/utils/llm_aided.py,sha256=eBGKCD7cJBjkyn38yqCdh0S-fgRG9fLuQCByLDQuyWs,4983
166
+ mineru/utils/llm_aided.py,sha256=9WUytvxenSAuaWR4sTQhVPQ5h8pY0wVOH1O2sj_6dLs,5149
167
167
  mineru/utils/magic_model_utils.py,sha256=2xOvi4oqg3MSw1FUrJTnYDtWeFrrm6qbmlEorLZSaYs,5650
168
168
  mineru/utils/model_utils.py,sha256=6OsgFLsABX5JuShSzCMSNHWV-yi-1cjwHweafyxIgRo,18448
169
169
  mineru/utils/models_download_utils.py,sha256=UfjvwhxO6BkJHa5JSpEVNZ71GoLMPMmJpym3THET2T4,2957
170
170
  mineru/utils/ocr_utils.py,sha256=lPIrwNUib5mrzUkponRYHuUCdjV2qvETNLSzOLyflrU,15990
171
+ mineru/utils/os_env_config.py,sha256=ZNtkR4KrJW72CeIoTNzGDL6tMKv_hL8nzvWIssGWbqY,842
171
172
  mineru/utils/pdf_classify.py,sha256=6DF5pH_9Uq83fsFtp7n4i-OdYQGzoNOV9L0VBUhgBMQ,8078
172
- mineru/utils/pdf_image_tools.py,sha256=mioLEHOdDtM1YbspNaa0wWhnLw_4-H7rdHlIM40vrT4,4077
173
+ mineru/utils/pdf_image_tools.py,sha256=86_xvsGOEde5QGlKz5uJemjoO1upr6n_K7o3lCdyIjQ,7981
174
+ mineru/utils/pdf_page_id.py,sha256=em966k12CRW4Rj49RGiLB_8ILwkXPBnWRetApax3eTs,400
173
175
  mineru/utils/pdf_reader.py,sha256=WeINm5SyWBUXT0wP9lzIbeHs8P6WUIkN6nVL5X4LzG4,3267
174
176
  mineru/utils/pdf_text_tool.py,sha256=KEztjfdqsIHHuiTEAMAL7Lr1OS3R7Ur-uTqGiCRjReQ,1364
175
177
  mineru/utils/run_async.py,sha256=rPeP4BCZerR8VByRDhiYzfZiahLVqoZEBVAS54dAjNg,1286
176
178
  mineru/utils/span_block_fix.py,sha256=0eVQjJCrT03woRt9hoh6Uu42Tp1dacfGTv2x3B9qq94,8797
177
179
  mineru/utils/span_pre_proc.py,sha256=h41q2uQajI0xQbc_30hqaju1dv3oVYxBAlKgURl8HIc,13692
178
180
  mineru/utils/table_merge.py,sha256=d98zNbM1ZQ8V1kUt6RugParNUNPv7DGL-XKIzR3iJVQ,15360
179
- mineru-2.6.2.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
180
- mineru-2.6.2.dist-info/METADATA,sha256=QGCp0YLuKymDMYmMZuOn8IYM-kpbKas5nKF7yl3la_0,68440
181
- mineru-2.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
182
- mineru-2.6.2.dist-info/entry_points.txt,sha256=luXmbhPiZK_tKlRgWuYOaW_V6EFpG-yJcAevVv9MEqE,252
183
- mineru-2.6.2.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
184
- mineru-2.6.2.dist-info/RECORD,,
181
+ mineru-2.6.4.dist-info/licenses/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
182
+ mineru-2.6.4.dist-info/METADATA,sha256=igOwr_rwmoJGD4KXKyEBgpESlUr6CZHThNXXE2PQ59U,71241
183
+ mineru-2.6.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
184
+ mineru-2.6.4.dist-info/entry_points.txt,sha256=luXmbhPiZK_tKlRgWuYOaW_V6EFpG-yJcAevVv9MEqE,252
185
+ mineru-2.6.4.dist-info/top_level.txt,sha256=zuGQfZcbsHv4I4oKI9gaKPqEWBFm6xJroKuug2LnKP8,7
186
+ mineru-2.6.4.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- # Copyright (c) Opendatalab. All rights reserved.
File without changes