mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. mineru/backend/hybrid/__init__.py +1 -0
  2. mineru/backend/hybrid/hybrid_analyze.py +526 -0
  3. mineru/backend/hybrid/hybrid_magic_model.py +617 -0
  4. mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
  5. mineru/backend/pipeline/batch_analyze.py +9 -1
  6. mineru/backend/pipeline/model_init.py +96 -1
  7. mineru/backend/pipeline/pipeline_analyze.py +6 -4
  8. mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
  9. mineru/backend/vlm/utils.py +3 -1
  10. mineru/backend/vlm/vlm_analyze.py +12 -12
  11. mineru/backend/vlm/vlm_magic_model.py +24 -89
  12. mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
  13. mineru/cli/client.py +17 -17
  14. mineru/cli/common.py +170 -20
  15. mineru/cli/fast_api.py +39 -13
  16. mineru/cli/gradio_app.py +232 -206
  17. mineru/model/mfd/yolo_v8.py +12 -6
  18. mineru/model/mfr/unimernet/Unimernet.py +71 -3
  19. mineru/resources/header.html +5 -1
  20. mineru/utils/boxbase.py +23 -0
  21. mineru/utils/char_utils.py +55 -0
  22. mineru/utils/engine_utils.py +74 -0
  23. mineru/utils/enum_class.py +18 -1
  24. mineru/utils/magic_model_utils.py +85 -2
  25. mineru/utils/span_pre_proc.py +5 -3
  26. mineru/utils/table_merge.py +5 -21
  27. mineru/version.py +1 -1
  28. mineru-2.7.0.dist-info/METADATA +433 -0
  29. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
  30. mineru-2.6.8.dist-info/METADATA +0 -954
  31. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
  32. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
  33. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
  34. {mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
mineru/cli/common.py CHANGED
@@ -10,12 +10,15 @@ import pypdfium2 as pdfium
10
10
 
11
11
  from mineru.data.data_reader_writer import FileBasedDataWriter
12
12
  from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
13
+ from mineru.utils.engine_utils import get_vlm_engine
13
14
  from mineru.utils.enum_class import MakeMode
14
15
  from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
15
16
  from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
16
17
  from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
17
18
  from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
18
19
  from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
20
+ from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
21
+ from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
19
22
  from mineru.utils.pdf_page_id import get_end_page_id
20
23
 
21
24
  if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
@@ -304,6 +307,111 @@ def _process_vlm(
304
307
  )
305
308
 
306
309
 
310
+ def _process_hybrid(
311
+ output_dir,
312
+ pdf_file_names,
313
+ pdf_bytes_list,
314
+ h_lang_list,
315
+ parse_method,
316
+ inline_formula_enable,
317
+ backend,
318
+ f_draw_layout_bbox,
319
+ f_draw_span_bbox,
320
+ f_dump_md,
321
+ f_dump_middle_json,
322
+ f_dump_model_output,
323
+ f_dump_orig_pdf,
324
+ f_dump_content_list,
325
+ f_make_md_mode,
326
+ server_url=None,
327
+ **kwargs,
328
+ ):
329
+ """同步处理hybrid后端逻辑"""
330
+ if not backend.endswith("client"):
331
+ server_url = None
332
+
333
+ for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
334
+ pdf_file_name = pdf_file_names[idx]
335
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
336
+ image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
337
+
338
+ middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
339
+ pdf_bytes,
340
+ image_writer=image_writer,
341
+ backend=backend,
342
+ parse_method=parse_method,
343
+ language=lang,
344
+ inline_formula_enable=inline_formula_enable,
345
+ server_url=server_url,
346
+ **kwargs,
347
+ )
348
+
349
+ pdf_info = middle_json["pdf_info"]
350
+
351
+ # f_draw_span_bbox = not _vlm_ocr_enable
352
+ f_draw_span_bbox = False
353
+
354
+ _process_output(
355
+ pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
356
+ md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
357
+ f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
358
+ f_make_md_mode, middle_json, infer_result, is_pipeline=False
359
+ )
360
+
361
+
362
+ async def _async_process_hybrid(
363
+ output_dir,
364
+ pdf_file_names,
365
+ pdf_bytes_list,
366
+ h_lang_list,
367
+ parse_method,
368
+ inline_formula_enable,
369
+ backend,
370
+ f_draw_layout_bbox,
371
+ f_draw_span_bbox,
372
+ f_dump_md,
373
+ f_dump_middle_json,
374
+ f_dump_model_output,
375
+ f_dump_orig_pdf,
376
+ f_dump_content_list,
377
+ f_make_md_mode,
378
+ server_url=None,
379
+ **kwargs,
380
+ ):
381
+ """异步处理hybrid后端逻辑"""
382
+
383
+ if not backend.endswith("client"):
384
+ server_url = None
385
+
386
+ for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
387
+ pdf_file_name = pdf_file_names[idx]
388
+ local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
389
+ image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
390
+
391
+ middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
392
+ pdf_bytes,
393
+ image_writer=image_writer,
394
+ backend=backend,
395
+ parse_method=parse_method,
396
+ language=lang,
397
+ inline_formula_enable=inline_formula_enable,
398
+ server_url=server_url,
399
+ **kwargs,
400
+ )
401
+
402
+ pdf_info = middle_json["pdf_info"]
403
+
404
+ # f_draw_span_bbox = not _vlm_ocr_enable
405
+ f_draw_span_bbox = False
406
+
407
+ _process_output(
408
+ pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
409
+ md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
410
+ f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
411
+ f_make_md_mode, middle_json, infer_result, is_pipeline=False
412
+ )
413
+
414
+
307
415
  def do_parse(
308
416
  output_dir,
309
417
  pdf_file_names: list[str],
@@ -340,18 +448,40 @@ def do_parse(
340
448
  if backend.startswith("vlm-"):
341
449
  backend = backend[4:]
342
450
 
343
- if backend == "vllm-async-engine":
344
- raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
451
+ if backend == "vllm-async-engine":
452
+ raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
345
453
 
346
- os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
347
- os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
454
+ if backend == "auto-engine":
455
+ backend = get_vlm_engine(inference_engine='auto', is_async=False)
348
456
 
349
- _process_vlm(
350
- output_dir, pdf_file_names, pdf_bytes_list, backend,
351
- f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
352
- f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
353
- server_url, **kwargs,
354
- )
457
+ os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
458
+ os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
459
+
460
+ _process_vlm(
461
+ output_dir, pdf_file_names, pdf_bytes_list, backend,
462
+ f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
463
+ f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
464
+ server_url, **kwargs,
465
+ )
466
+ elif backend.startswith("hybrid-"):
467
+ backend = backend[7:]
468
+
469
+ if backend == "vllm-async-engine":
470
+ raise Exception(
471
+ "hybrid-vllm-async-engine backend is not supported in sync mode, please use hybrid-vllm-engine backend")
472
+
473
+ if backend == "auto-engine":
474
+ backend = get_vlm_engine(inference_engine='auto', is_async=False)
475
+
476
+ os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
477
+ os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
478
+
479
+ _process_hybrid(
480
+ output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
481
+ f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
482
+ f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
483
+ server_url, **kwargs,
484
+ )
355
485
 
356
486
 
357
487
  async def aio_do_parse(
@@ -391,19 +521,39 @@ async def aio_do_parse(
391
521
  if backend.startswith("vlm-"):
392
522
  backend = backend[4:]
393
523
 
394
- if backend == "vllm-engine":
395
- raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
524
+ if backend == "vllm-engine":
525
+ raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
396
526
 
397
- os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
398
- os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
527
+ if backend == "auto-engine":
528
+ backend = get_vlm_engine(inference_engine='auto', is_async=True)
399
529
 
400
- await _async_process_vlm(
401
- output_dir, pdf_file_names, pdf_bytes_list, backend,
402
- f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
403
- f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
404
- server_url, **kwargs,
405
- )
530
+ os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
531
+ os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
406
532
 
533
+ await _async_process_vlm(
534
+ output_dir, pdf_file_names, pdf_bytes_list, backend,
535
+ f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
536
+ f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
537
+ server_url, **kwargs,
538
+ )
539
+ elif backend.startswith("hybrid-"):
540
+ backend = backend[7:]
541
+
542
+ if backend == "vllm-engine":
543
+ raise Exception("hybrid-vllm-engine backend is not supported in async mode, please use hybrid-vllm-async-engine backend")
544
+
545
+ if backend == "auto-engine":
546
+ backend = get_vlm_engine(inference_engine='auto', is_async=True)
547
+
548
+ os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
549
+ os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
550
+
551
+ await _async_process_hybrid(
552
+ output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
553
+ f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
554
+ f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
555
+ server_url, **kwargs,
556
+ )
407
557
 
408
558
 
409
559
  if __name__ == "__main__":
mineru/cli/fast_api.py CHANGED
@@ -1,3 +1,4 @@
1
+ import sys
1
2
  import uuid
2
3
  import os
3
4
  import re
@@ -14,6 +15,11 @@ from fastapi.responses import JSONResponse, FileResponse
14
15
  from starlette.background import BackgroundTask
15
16
  from typing import List, Optional
16
17
  from loguru import logger
18
+
19
+ log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
20
+ logger.remove() # 移除默认handler
21
+ logger.add(sys.stderr, level=log_level) # 添加新handler
22
+
17
23
  from base64 import b64encode
18
24
 
19
25
  from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
@@ -105,23 +111,38 @@ async def parse_pdf(
105
111
  output_dir: str = Form("./output", description="Output local directory"),
106
112
  lang_list: List[str] = Form(
107
113
  ["ch"],
108
- description="""(Adapted only for pipeline backend)Input the languages in the pdf to improve OCR accuracy.
109
- Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th, el, latin, arabic, east_slavic, cyrillic, devanagari.
114
+ description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
115
+ - ch: Chinese, English, Chinese Traditional.
116
+ - ch_lite: Chinese, English, Chinese Traditional, Japanese.
117
+ - ch_server: Chinese, English, Chinese Traditional, Japanese.
118
+ - en: English.
119
+ - korean: Korean, English.
120
+ - japan: Chinese, English, Chinese Traditional, Japanese.
121
+ - chinese_cht: Chinese, English, Chinese Traditional, Japanese.
122
+ - ta: Tamil, English.
123
+ - te: Telugu, English.
124
+ - ka: Kannada.
125
+ - th: Thai, English.
126
+ - el: Greek, English.
127
+ - latin: French, German, Afrikaans, Italian, Spanish, Bosnian, Portuguese, Czech, Welsh, Danish, Estonian, Irish, Croatian, Uzbek, Hungarian, Serbian (Latin), Indonesian, Occitan, Icelandic, Lithuanian, Maori, Malay, Dutch, Norwegian, Polish, Slovak, Slovenian, Albanian, Swedish, Swahili, Tagalog, Turkish, Latin, Azerbaijani, Kurdish, Latvian, Maltese, Pali, Romanian, Vietnamese, Finnish, Basque, Galician, Luxembourgish, Romansh, Catalan, Quechua.
128
+ - arabic: Arabic, Persian, Uyghur, Urdu, Pashto, Kurdish, Sindhi, Balochi, English.
129
+ - east_slavic: Russian, Belarusian, Ukrainian, English.
130
+ - cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
131
+ - devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
110
132
  """
111
133
  ),
112
134
  backend: str = Form(
113
- "pipeline",
135
+ "hybrid-auto-engine",
114
136
  description="""The backend for parsing:
115
- - pipeline: More general
116
- - vlm-transformers: More general, but slower
117
- - vlm-mlx-engine: Faster than transformers (need apple silicon and macOS 13.5+)
118
- - vlm-vllm-async-engine: Faster (vllm-engine, need vllm installed)
119
- - vlm-lmdeploy-engine: Faster (lmdeploy-engine, need lmdeploy installed)
120
- - vlm-http-client: Faster (client suitable for openai-compatible servers)"""
137
+ - pipeline: More general, supports multiple languages, hallucination-free.
138
+ - vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
139
+ - vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
140
+ - hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
141
+ - hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages."""
121
142
  ),
122
143
  parse_method: str = Form(
123
144
  "auto",
124
- description="""(Adapted only for pipeline backend)The method for parsing PDF:
145
+ description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
125
146
  - auto: Automatically determine the method based on the file type
126
147
  - txt: Use text extraction method
127
148
  - ocr: Use OCR method for image-based PDFs
@@ -131,7 +152,7 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
131
152
  table_enable: bool = Form(True, description="Enable table parsing."),
132
153
  server_url: Optional[str] = Form(
133
154
  None,
134
- description="(Adapted only for vlm-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
155
+ description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
135
156
  ),
136
157
  return_md: bool = Form(True, description="Return markdown content in response"),
137
158
  return_middle_json: bool = Form(False, description="Return middle JSON in response"),
@@ -220,10 +241,13 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
220
241
  with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
221
242
  for pdf_name in pdf_file_names:
222
243
  safe_pdf_name = sanitize_filename(pdf_name)
244
+
223
245
  if backend.startswith("pipeline"):
224
246
  parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
225
- else:
247
+ elif backend.startswith("vlm"):
226
248
  parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
249
+ elif backend.startswith("hybrid"):
250
+ parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
227
251
 
228
252
  if not os.path.exists(parse_dir):
229
253
  continue
@@ -271,8 +295,10 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
271
295
 
272
296
  if backend.startswith("pipeline"):
273
297
  parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
274
- else:
298
+ elif backend.startswith("vlm"):
275
299
  parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
300
+ elif backend.startswith("hybrid"):
301
+ parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
276
302
 
277
303
  if os.path.exists(parse_dir):
278
304
  if return_md: