mineru 2.6.7__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mineru/backend/hybrid/__init__.py +1 -0
- mineru/backend/hybrid/hybrid_analyze.py +526 -0
- mineru/backend/hybrid/hybrid_magic_model.py +617 -0
- mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
- mineru/backend/pipeline/batch_analyze.py +9 -1
- mineru/backend/pipeline/model_init.py +96 -1
- mineru/backend/pipeline/pipeline_analyze.py +6 -4
- mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
- mineru/backend/vlm/utils.py +3 -1
- mineru/backend/vlm/vlm_analyze.py +12 -12
- mineru/backend/vlm/vlm_magic_model.py +24 -89
- mineru/backend/vlm/vlm_middle_json_mkcontent.py +118 -19
- mineru/cli/client.py +17 -17
- mineru/cli/common.py +170 -20
- mineru/cli/fast_api.py +39 -13
- mineru/cli/gradio_app.py +232 -206
- mineru/model/mfd/yolo_v8.py +12 -6
- mineru/model/mfr/unimernet/Unimernet.py +71 -3
- mineru/resources/header.html +5 -1
- mineru/utils/boxbase.py +23 -0
- mineru/utils/char_utils.py +55 -0
- mineru/utils/engine_utils.py +74 -0
- mineru/utils/enum_class.py +18 -1
- mineru/utils/magic_model_utils.py +85 -2
- mineru/utils/span_pre_proc.py +5 -3
- mineru/utils/table_merge.py +5 -21
- mineru/version.py +1 -1
- mineru-2.7.0.dist-info/METADATA +433 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
- mineru-2.6.7.dist-info/METADATA +0 -954
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
- {mineru-2.6.7.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0
mineru/cli/common.py
CHANGED
|
@@ -10,12 +10,15 @@ import pypdfium2 as pdfium
|
|
|
10
10
|
|
|
11
11
|
from mineru.data.data_reader_writer import FileBasedDataWriter
|
|
12
12
|
from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
|
|
13
|
+
from mineru.utils.engine_utils import get_vlm_engine
|
|
13
14
|
from mineru.utils.enum_class import MakeMode
|
|
14
15
|
from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
|
|
15
16
|
from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
|
|
16
17
|
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
|
|
17
18
|
from mineru.backend.vlm.vlm_analyze import doc_analyze as vlm_doc_analyze
|
|
18
19
|
from mineru.backend.vlm.vlm_analyze import aio_doc_analyze as aio_vlm_doc_analyze
|
|
20
|
+
from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
|
|
21
|
+
from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
|
|
19
22
|
from mineru.utils.pdf_page_id import get_end_page_id
|
|
20
23
|
|
|
21
24
|
if os.getenv("MINERU_LMDEPLOY_DEVICE", "") == "maca":
|
|
@@ -304,6 +307,111 @@ def _process_vlm(
|
|
|
304
307
|
)
|
|
305
308
|
|
|
306
309
|
|
|
310
|
+
def _process_hybrid(
        output_dir,
        pdf_file_names,
        pdf_bytes_list,
        h_lang_list,
        parse_method,
        inline_formula_enable,
        backend,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_md,
        f_dump_middle_json,
        f_dump_model_output,
        f_dump_orig_pdf,
        f_dump_content_list,
        f_make_md_mode,
        server_url=None,
        **kwargs,
):
    """Run the hybrid backend synchronously over a batch of documents.

    ``pdf_bytes_list`` and ``h_lang_list`` are walked in lockstep; the
    matching entry of ``pdf_file_names`` is looked up by position. Each
    document is analysed with ``hybrid_doc_analyze`` and the result is
    handed to ``_process_output`` together with the ``f_*`` dump/draw flags.

    ``server_url`` is only forwarded when ``backend`` ends with "client";
    for every other backend ``None`` is passed instead. Extra keyword
    arguments are passed through to ``hybrid_doc_analyze`` unchanged.

    The ``f_draw_span_bbox`` argument is accepted but not honoured: span
    boxes are always disabled for this backend.
    """
    effective_url = server_url if backend.endswith("client") else None

    for doc_idx, (doc_bytes, doc_lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
        doc_name = pdf_file_names[doc_idx]
        image_dir, md_dir = prepare_env(output_dir, doc_name, f"hybrid_{parse_method}")
        image_writer = FileBasedDataWriter(image_dir)
        md_writer = FileBasedDataWriter(md_dir)

        # The analyzer returns a three-item result; the third item (its OCR
        # flag) is currently not used to decide whether spans are drawn.
        middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
            doc_bytes,
            image_writer=image_writer,
            backend=backend,
            parse_method=parse_method,
            language=doc_lang,
            inline_formula_enable=inline_formula_enable,
            server_url=effective_url,
            **kwargs,
        )

        _process_output(
            middle_json["pdf_info"],
            doc_bytes,
            doc_name,
            md_dir,
            image_dir,
            md_writer,
            f_draw_layout_bbox,
            False,  # span bbox drawing is forced off regardless of the caller's flag
            f_dump_orig_pdf,
            f_dump_md,
            f_dump_content_list,
            f_dump_middle_json,
            f_dump_model_output,
            f_make_md_mode,
            middle_json,
            infer_result,
            is_pipeline=False,
        )
|
|
360
|
+
|
|
361
|
+
|
|
362
|
+
async def _async_process_hybrid(
        output_dir,
        pdf_file_names,
        pdf_bytes_list,
        h_lang_list,
        parse_method,
        inline_formula_enable,
        backend,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_md,
        f_dump_middle_json,
        f_dump_model_output,
        f_dump_orig_pdf,
        f_dump_content_list,
        f_make_md_mode,
        server_url=None,
        **kwargs,
):
    """Run the hybrid backend asynchronously over a batch of documents.

    ``pdf_bytes_list`` and ``h_lang_list`` are walked in lockstep; the
    matching entry of ``pdf_file_names`` is looked up by position. Each
    document is analysed by awaiting ``aio_hybrid_doc_analyze`` (documents
    are awaited one after another, not gathered) and the result is handed
    to ``_process_output`` together with the ``f_*`` dump/draw flags.

    ``server_url`` is only forwarded when ``backend`` ends with "client";
    for every other backend ``None`` is passed instead. Extra keyword
    arguments are passed through to ``aio_hybrid_doc_analyze`` unchanged.

    The ``f_draw_span_bbox`` argument is accepted but not honoured: span
    boxes are always disabled for this backend.
    """
    effective_url = server_url if backend.endswith("client") else None

    for doc_idx, (doc_bytes, doc_lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
        doc_name = pdf_file_names[doc_idx]
        image_dir, md_dir = prepare_env(output_dir, doc_name, f"hybrid_{parse_method}")
        image_writer = FileBasedDataWriter(image_dir)
        md_writer = FileBasedDataWriter(md_dir)

        # The analyzer returns a three-item result; the third item (its OCR
        # flag) is currently not used to decide whether spans are drawn.
        middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
            doc_bytes,
            image_writer=image_writer,
            backend=backend,
            parse_method=parse_method,
            language=doc_lang,
            inline_formula_enable=inline_formula_enable,
            server_url=effective_url,
            **kwargs,
        )

        _process_output(
            middle_json["pdf_info"],
            doc_bytes,
            doc_name,
            md_dir,
            image_dir,
            md_writer,
            f_draw_layout_bbox,
            False,  # span bbox drawing is forced off regardless of the caller's flag
            f_dump_orig_pdf,
            f_dump_md,
            f_dump_content_list,
            f_dump_middle_json,
            f_dump_model_output,
            f_make_md_mode,
            middle_json,
            infer_result,
            is_pipeline=False,
        )
|
|
413
|
+
|
|
414
|
+
|
|
307
415
|
def do_parse(
|
|
308
416
|
output_dir,
|
|
309
417
|
pdf_file_names: list[str],
|
|
@@ -340,18 +448,40 @@ def do_parse(
|
|
|
340
448
|
if backend.startswith("vlm-"):
|
|
341
449
|
backend = backend[4:]
|
|
342
450
|
|
|
343
|
-
|
|
344
|
-
|
|
451
|
+
if backend == "vllm-async-engine":
|
|
452
|
+
raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
|
|
345
453
|
|
|
346
|
-
|
|
347
|
-
|
|
454
|
+
if backend == "auto-engine":
|
|
455
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=False)
|
|
348
456
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
457
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
458
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
459
|
+
|
|
460
|
+
_process_vlm(
|
|
461
|
+
output_dir, pdf_file_names, pdf_bytes_list, backend,
|
|
462
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
463
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
464
|
+
server_url, **kwargs,
|
|
465
|
+
)
|
|
466
|
+
elif backend.startswith("hybrid-"):
|
|
467
|
+
backend = backend[7:]
|
|
468
|
+
|
|
469
|
+
if backend == "vllm-async-engine":
|
|
470
|
+
raise Exception(
|
|
471
|
+
"hybrid-vllm-async-engine backend is not supported in sync mode, please use hybrid-vllm-engine backend")
|
|
472
|
+
|
|
473
|
+
if backend == "auto-engine":
|
|
474
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=False)
|
|
475
|
+
|
|
476
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
477
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
|
|
478
|
+
|
|
479
|
+
_process_hybrid(
|
|
480
|
+
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
|
|
481
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
482
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
483
|
+
server_url, **kwargs,
|
|
484
|
+
)
|
|
355
485
|
|
|
356
486
|
|
|
357
487
|
async def aio_do_parse(
|
|
@@ -391,19 +521,39 @@ async def aio_do_parse(
|
|
|
391
521
|
if backend.startswith("vlm-"):
|
|
392
522
|
backend = backend[4:]
|
|
393
523
|
|
|
394
|
-
|
|
395
|
-
|
|
524
|
+
if backend == "vllm-engine":
|
|
525
|
+
raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
|
|
396
526
|
|
|
397
|
-
|
|
398
|
-
|
|
527
|
+
if backend == "auto-engine":
|
|
528
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=True)
|
|
399
529
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
403
|
-
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
404
|
-
server_url, **kwargs,
|
|
405
|
-
)
|
|
530
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
|
|
531
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
406
532
|
|
|
533
|
+
await _async_process_vlm(
|
|
534
|
+
output_dir, pdf_file_names, pdf_bytes_list, backend,
|
|
535
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
536
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
537
|
+
server_url, **kwargs,
|
|
538
|
+
)
|
|
539
|
+
elif backend.startswith("hybrid-"):
|
|
540
|
+
backend = backend[7:]
|
|
541
|
+
|
|
542
|
+
if backend == "vllm-engine":
|
|
543
|
+
raise Exception("hybrid-vllm-engine backend is not supported in async mode, please use hybrid-vllm-async-engine backend")
|
|
544
|
+
|
|
545
|
+
if backend == "auto-engine":
|
|
546
|
+
backend = get_vlm_engine(inference_engine='auto', is_async=True)
|
|
547
|
+
|
|
548
|
+
os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
|
|
549
|
+
os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
|
|
550
|
+
|
|
551
|
+
await _async_process_hybrid(
|
|
552
|
+
output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
|
|
553
|
+
f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
|
|
554
|
+
f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
|
|
555
|
+
server_url, **kwargs,
|
|
556
|
+
)
|
|
407
557
|
|
|
408
558
|
|
|
409
559
|
if __name__ == "__main__":
|
mineru/cli/fast_api.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import sys
|
|
1
2
|
import uuid
|
|
2
3
|
import os
|
|
3
4
|
import re
|
|
@@ -14,6 +15,11 @@ from fastapi.responses import JSONResponse, FileResponse
|
|
|
14
15
|
from starlette.background import BackgroundTask
|
|
15
16
|
from typing import List, Optional
|
|
16
17
|
from loguru import logger
|
|
18
|
+
|
|
19
|
+
log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
|
|
20
|
+
logger.remove() # 移除默认handler
|
|
21
|
+
logger.add(sys.stderr, level=log_level) # 添加新handler
|
|
22
|
+
|
|
17
23
|
from base64 import b64encode
|
|
18
24
|
|
|
19
25
|
from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes
|
|
@@ -105,23 +111,38 @@ async def parse_pdf(
|
|
|
105
111
|
output_dir: str = Form("./output", description="Output local directory"),
|
|
106
112
|
lang_list: List[str] = Form(
|
|
107
113
|
["ch"],
|
|
108
|
-
description="""(Adapted only for pipeline backend)Input the languages in the pdf to improve OCR accuracy.
|
|
109
|
-
|
|
114
|
+
description="""(Adapted only for pipeline and hybrid backend)Input the languages in the pdf to improve OCR accuracy.Options:
|
|
115
|
+
- ch: Chinese, English, Chinese Traditional.
|
|
116
|
+
- ch_lite: Chinese, English, Chinese Traditional, Japanese.
|
|
117
|
+
- ch_server: Chinese, English, Chinese Traditional, Japanese.
|
|
118
|
+
- en: English.
|
|
119
|
+
- korean: Korean, English.
|
|
120
|
+
- japan: Chinese, English, Chinese Traditional, Japanese.
|
|
121
|
+
- chinese_cht: Chinese, English, Chinese Traditional, Japanese.
|
|
122
|
+
- ta: Tamil, English.
|
|
123
|
+
- te: Telugu, English.
|
|
124
|
+
- ka: Kannada.
|
|
125
|
+
- th: Thai, English.
|
|
126
|
+
- el: Greek, English.
|
|
127
|
+
- latin: French, German, Afrikaans, Italian, Spanish, Bosnian, Portuguese, Czech, Welsh, Danish, Estonian, Irish, Croatian, Uzbek, Hungarian, Serbian (Latin), Indonesian, Occitan, Icelandic, Lithuanian, Maori, Malay, Dutch, Norwegian, Polish, Slovak, Slovenian, Albanian, Swedish, Swahili, Tagalog, Turkish, Latin, Azerbaijani, Kurdish, Latvian, Maltese, Pali, Romanian, Vietnamese, Finnish, Basque, Galician, Luxembourgish, Romansh, Catalan, Quechua.
|
|
128
|
+
- arabic: Arabic, Persian, Uyghur, Urdu, Pashto, Kurdish, Sindhi, Balochi, English.
|
|
129
|
+
- east_slavic: Russian, Belarusian, Ukrainian, English.
|
|
130
|
+
- cyrillic: Russian, Belarusian, Ukrainian, Serbian (Cyrillic), Bulgarian, Mongolian, Abkhazian, Adyghe, Kabardian, Avar, Dargin, Ingush, Chechen, Lak, Lezgin, Tabasaran, Kazakh, Kyrgyz, Tajik, Macedonian, Tatar, Chuvash, Bashkir, Malian, Moldovan, Udmurt, Komi, Ossetian, Buryat, Kalmyk, Tuvan, Sakha, Karakalpak, English.
|
|
131
|
+
- devanagari: Hindi, Marathi, Nepali, Bihari, Maithili, Angika, Bhojpuri, Magahi, Santali, Newari, Konkani, Sanskrit, Haryanvi, English.
|
|
110
132
|
"""
|
|
111
133
|
),
|
|
112
134
|
backend: str = Form(
|
|
113
|
-
"
|
|
135
|
+
"hybrid-auto-engine",
|
|
114
136
|
description="""The backend for parsing:
|
|
115
|
-
- pipeline: More general
|
|
116
|
-
- vlm-
|
|
117
|
-
- vlm-
|
|
118
|
-
-
|
|
119
|
-
-
|
|
120
|
-
- vlm-http-client: Faster (client suitable for openai-compatible servers)"""
|
|
137
|
+
- pipeline: More general, supports multiple languages, hallucination-free.
|
|
138
|
+
- vlm-auto-engine: High accuracy via local computing power, supports Chinese and English documents only.
|
|
139
|
+
- vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers), supports Chinese and English documents only.
|
|
140
|
+
- hybrid-auto-engine: Next-generation high accuracy solution via local computing power, supports multiple languages.
|
|
141
|
+
- hybrid-http-client: High accuracy via remote computing power but requires a little local computing power(client suitable for openai-compatible servers), supports multiple languages."""
|
|
121
142
|
),
|
|
122
143
|
parse_method: str = Form(
|
|
123
144
|
"auto",
|
|
124
|
-
description="""(Adapted only for pipeline backend)The method for parsing PDF:
|
|
145
|
+
description="""(Adapted only for pipeline and hybrid backend)The method for parsing PDF:
|
|
125
146
|
- auto: Automatically determine the method based on the file type
|
|
126
147
|
- txt: Use text extraction method
|
|
127
148
|
- ocr: Use OCR method for image-based PDFs
|
|
@@ -131,7 +152,7 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
131
152
|
table_enable: bool = Form(True, description="Enable table parsing."),
|
|
132
153
|
server_url: Optional[str] = Form(
|
|
133
154
|
None,
|
|
134
|
-
description="(Adapted only for vlm
|
|
155
|
+
description="(Adapted only for <vlm/hybrid>-http-client backend)openai compatible server url, e.g., http://127.0.0.1:30000"
|
|
135
156
|
),
|
|
136
157
|
return_md: bool = Form(True, description="Return markdown content in response"),
|
|
137
158
|
return_middle_json: bool = Form(False, description="Return middle JSON in response"),
|
|
@@ -220,10 +241,13 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
220
241
|
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
|
|
221
242
|
for pdf_name in pdf_file_names:
|
|
222
243
|
safe_pdf_name = sanitize_filename(pdf_name)
|
|
244
|
+
|
|
223
245
|
if backend.startswith("pipeline"):
|
|
224
246
|
parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
|
|
225
|
-
|
|
247
|
+
elif backend.startswith("vlm"):
|
|
226
248
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
249
|
+
elif backend.startswith("hybrid"):
|
|
250
|
+
parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
|
|
227
251
|
|
|
228
252
|
if not os.path.exists(parse_dir):
|
|
229
253
|
continue
|
|
@@ -271,8 +295,10 @@ Options: ch, ch_server, ch_lite, en, korean, japan, chinese_cht, ta, te, ka, th,
|
|
|
271
295
|
|
|
272
296
|
if backend.startswith("pipeline"):
|
|
273
297
|
parse_dir = os.path.join(unique_dir, pdf_name, parse_method)
|
|
274
|
-
|
|
298
|
+
elif backend.startswith("vlm"):
|
|
275
299
|
parse_dir = os.path.join(unique_dir, pdf_name, "vlm")
|
|
300
|
+
elif backend.startswith("hybrid"):
|
|
301
|
+
parse_dir = os.path.join(unique_dir, pdf_name, f"hybrid_{parse_method}")
|
|
276
302
|
|
|
277
303
|
if os.path.exists(parse_dir):
|
|
278
304
|
if return_md:
|