PyPI - mineru - Versions diffs - 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl - Mend

mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

mineru/backend/hybrid/__init__.py +1 -0
mineru/backend/hybrid/hybrid_analyze.py +526 -0
mineru/backend/hybrid/hybrid_magic_model.py +617 -0
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
mineru/backend/pipeline/batch_analyze.py +9 -1
mineru/backend/pipeline/model_init.py +96 -1
mineru/backend/pipeline/pipeline_analyze.py +6 -4
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
mineru/backend/vlm/utils.py +3 -1
mineru/backend/vlm/vlm_analyze.py +12 -12
mineru/backend/vlm/vlm_magic_model.py +24 -89
mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
mineru/cli/client.py +17 -17
mineru/cli/common.py +169 -20
mineru/cli/fast_api.py +39 -13
mineru/cli/gradio_app.py +232 -206
mineru/model/mfd/yolo_v8.py +12 -6
mineru/model/mfr/unimernet/Unimernet.py +71 -3
mineru/resources/header.html +5 -1
mineru/utils/boxbase.py +23 -0
mineru/utils/char_utils.py +55 -0
mineru/utils/engine_utils.py +74 -0
mineru/utils/enum_class.py +18 -1
mineru/utils/magic_model_utils.py +85 -2
mineru/utils/pdf_image_tools.py +37 -17
mineru/utils/span_pre_proc.py +5 -3
mineru/utils/table_merge.py +13 -22
mineru/version.py +1 -1
mineru-2.7.1.dist-info/METADATA +438 -0
{mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/RECORD +34 -28
mineru-2.6.8.dist-info/METADATA +0 -954
{mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/WHEEL +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/entry_points.txt +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/licenses/LICENSE.md +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.1.dist-info}/top_level.txt +0 -0

mineru/backend/vlm/vlm_magic_model.py CHANGED Viewed

@@ -6,7 +6,7 @@ from loguru import logger
 from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from mineru.utils.enum_class import ContentType, BlockType
 from mineru.utils.guess_suffix_or_lang import guess_language_by_text
-from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
+from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
 class MagicModel:
@@ -317,13 +317,14 @@ def clean_content(content):
     return content
-def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_type):
+def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
+    """基于index的主客体关联包装函数"""
     # 定义获取主体和客体对象的函数
     def get_subjects():
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
                     filter(
                         lambda x: x["type"] == subject_block_type,
                         blocks,
@@ -336,7 +337,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
                     filter(
                         lambda x: x["type"] == object_block_type,
                         blocks,
@@ -346,15 +347,15 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         )
     # 调用通用方法
-    return tie_up_category_by_distance_v3(
+    return tie_up_category_by_index(
         get_subjects,
         get_objects
     )
 def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
-    with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
-    with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
+    with_captions = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_caption")
+    with_footnotes = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_footnote")
     ret = []
     for v in with_captions:
         record = {
@@ -368,49 +369,6 @@ def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
     return ret
-def fix_two_layer_blocks_back(blocks, fix_type: Literal["image", "table", "code"]):
-    need_fix_blocks = get_type_blocks(blocks, fix_type)
-    fixed_blocks = []
-    not_include_blocks = []
-    processed_indices = set()
-    # 处理需要组织成two_layer结构的blocks
-    for block in need_fix_blocks:
-        body = block[f"{fix_type}_body"]
-        caption_list = block[f"{fix_type}_caption_list"]
-        footnote_list = block[f"{fix_type}_footnote_list"]
-        body["type"] = f"{fix_type}_body"
-        for caption in caption_list:
-            caption["type"] = f"{fix_type}_caption"
-            processed_indices.add(caption["index"])
-        for footnote in footnote_list:
-            footnote["type"] = f"{fix_type}_footnote"
-            processed_indices.add(footnote["index"])
-        processed_indices.add(body["index"])
-        two_layer_block = {
-            "type": fix_type,
-            "bbox": body["bbox"],
-            "blocks": [
-                body,
-            ],
-            "index": body["index"],
-        }
-        two_layer_block["blocks"].extend([*caption_list, *footnote_list])
-        fixed_blocks.append(two_layer_block)
-    # 添加未处理的blocks
-    for block in blocks:
-        if block["index"] not in processed_indices:
-            # 直接添加未处理的block
-            not_include_blocks.append(block)
-    return fixed_blocks, not_include_blocks
 def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
     need_fix_blocks = get_type_blocks(blocks, fix_type)
     fixed_blocks = []
@@ -418,25 +376,16 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
     processed_indices = set()
     # 特殊处理表格类型，确保标题在表格前，注脚在表格后
-    if fix_type == "table":
+    if fix_type in ["table", "image"]:
         # 收集所有不合适的caption和footnote
         misplaced_captions = []  # 存储(caption, 原始block索引)
         misplaced_footnotes = []  # 存储(footnote, 原始block索引)
-        # 第一步：移除不符合位置要求的caption和footnote
+        # 第一步：移除不符合位置要求的footnote
         for block_idx, block in enumerate(need_fix_blocks):
             body = block[f"{fix_type}_body"]
             body_index = body["index"]
-            # 检查caption应在body前或同位置
-            valid_captions = []
-            for caption in block[f"{fix_type}_caption_list"]:
-                if caption["index"] <= body_index:
-                    valid_captions.append(caption)
-                else:
-                    misplaced_captions.append((caption, block_idx))
-            block[f"{fix_type}_caption_list"] = valid_captions
             # 检查footnote应在body后或同位置
             valid_footnotes = []
             for footnote in block[f"{fix_type}_footnote_list"]:
@@ -446,28 +395,6 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
                     misplaced_footnotes.append((footnote, block_idx))
             block[f"{fix_type}_footnote_list"] = valid_footnotes
-        # 第二步：重新分配不合规的caption到合适的body
-        for caption, original_block_idx in misplaced_captions:
-            caption_index = caption["index"]
-            best_block_idx = None
-            min_distance = float('inf')
-            # 寻找索引大于等于caption_index的最近body
-            for idx, block in enumerate(need_fix_blocks):
-                body_index = block[f"{fix_type}_body"]["index"]
-                if body_index >= caption_index and idx != original_block_idx:
-                    distance = body_index - caption_index
-                    if distance < min_distance:
-                        min_distance = distance
-                        best_block_idx = idx
-            if best_block_idx is not None:
-                # 找到合适的body，添加到对应block的caption_list
-                need_fix_blocks[best_block_idx][f"{fix_type}_caption_list"].append(caption)
-            else:
-                # 没找到合适的body，作为普通block处理
-                not_include_blocks.append(caption)
         # 第三步：重新分配不合规的footnote到合适的body
         for footnote, original_block_idx in misplaced_footnotes:
             footnote_index = footnote["index"]
@@ -502,13 +429,22 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
                 caption_list.sort(key=lambda x: x["index"], reverse=True)
                 filtered_captions = [caption_list[0]]
                 for i in range(1, len(caption_list)):
-                    # 检查是否与前一个caption连续(降序所以是-1)
-                    if caption_list[i]["index"] == caption_list[i - 1]["index"] - 1:
+                    prev_index = caption_list[i - 1]["index"]
+                    curr_index = caption_list[i]["index"]
+                    # 检查是否连续
+                    if curr_index == prev_index - 1:
                         filtered_captions.append(caption_list[i])
                     else:
-                        # 出现gap,后续所有caption都作为普通block
-                        not_include_blocks.extend(caption_list[i:])
-                        break
+                        # 检查gap中是否只有body_index
+                        gap_indices = set(range(curr_index + 1, prev_index))
+                        if gap_indices == {body_index}:
+                            # gap中只有body_index,不算真正的gap
+                            filtered_captions.append(caption_list[i])
+                        else:
+                            # 出现真正的gap,后续所有caption都作为普通block
+                            not_include_blocks.extend(caption_list[i:])
+                            break
                 # 恢复升序
                 filtered_captions.reverse()
                 block[f"{fix_type}_caption_list"] = filtered_captions
@@ -592,7 +528,6 @@ def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
     for list_block in list_blocks:
         # 统计list_block["blocks"]中所有block的type，用众数作为list_block的sub_type
         type_count = {}
-        line_content = []
         for sub_block in list_block["blocks"]:
             sub_block_type = sub_block["type"]
             if sub_block_type not in type_count:

mineru/backend/vlm/vlm_middle_json_mkcontent.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import os
 from loguru import logger
+from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
 from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
+from mineru.utils.language import detect_lang
 latex_delimiters_config = get_latex_delimiter_config()
@@ -18,9 +21,18 @@ display_right_delimiter = delimiters['display']['right']
 inline_left_delimiter = delimiters['inline']['left']
 inline_right_delimiter = delimiters['inline']['right']
 def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
-    para_text = ''
+    block_text = ''
     for line in para_block['lines']:
+        for span in line['spans']:
+            if span['type'] in [ContentType.TEXT]:
+                span['content'] = full_to_half_exclude_marks(span['content'])
+                block_text += span['content']
+    block_lang = detect_lang(block_text)
+    para_text = ''
+    for i, line in enumerate(para_block['lines']):
         for j, span in enumerate(line['spans']):
             span_type = span['type']
             content = ''
@@ -34,17 +46,51 @@ def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
                 else:
                     if span.get('image_path', ''):
                         content = f"![]({img_buket_path}/{span['image_path']})"
-            # content = content.strip()
+            content = content.strip()
             if content:
-                if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
-                    if j == len(line['spans']) - 1:
+                if span_type == ContentType.INTERLINE_EQUATION:
+                    para_text += content
+                    continue
+                # 定义CJK语言集合(中日韩)
+                cjk_langs = {'zh', 'ja', 'ko'}
+                # logger.info(f'block_lang: {block_lang}, content: {content}')
+                # 判断是否为行末span
+                is_last_span = j == len(line['spans']) - 1
+                if block_lang in cjk_langs:  # 中文/日语/韩文语境下，换行不需要空格分隔,但是如果是行内公式结尾，还是要加空格
+                    if is_last_span and span_type != ContentType.INLINE_EQUATION:
                         para_text += content
                     else:
                         para_text += f'{content} '
-                elif span_type == ContentType.INTERLINE_EQUATION:
-                    para_text += content
+                else:
+                    # 西方文本语境下 每行的最后一个span判断是否要去除连字符
+                    if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
+                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
+                        if (
+                                is_last_span
+                                and span_type == ContentType.TEXT
+                                and is_hyphen_at_line_end(content)
+                        ):
+                            # 如果下一行的第一个span是小写字母开头，删除连字符
+                            if (
+                                    i+1 < len(para_block['lines'])
+                                    and para_block['lines'][i + 1].get('spans')
+                                    and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
+                                    and para_block['lines'][i + 1]['spans'][0].get('content', '')
+                                    and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
+                            ):
+                                para_text += content[:-1]
+                            else:  # 如果没有下一行，或者下一行的第一个span不是小写字母开头，则保留连字符但不加空格
+                                para_text += content
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f'{content} '
     return para_text
 def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
     page_markdown = []
     for para_block in para_blocks:
@@ -479,22 +525,76 @@ def get_body_data(para_block):
 def merge_para_with_text_v2(para_block):
-    para_content = []
-    para_type = para_block['type']
+    block_text = ''
     for line in para_block['lines']:
         for span in line['spans']:
+            if span['type'] in [ContentType.TEXT]:
+                span['content'] = full_to_half_exclude_marks(span['content'])
+                block_text += span['content']
+    block_lang = detect_lang(block_text)
+    para_content = []
+    para_type = para_block['type']
+    for i, line in enumerate(para_block['lines']):
+        for j, span in enumerate(line['spans']):
             span_type = span['type']
             if span.get("content", '').strip():
-                if para_type == BlockType.PHONETIC and span_type == ContentTypeV2.SPAN_TEXT:
-                    span_type = ContentTypeV2.SPAN_PHONETIC
+                if span_type == ContentType.TEXT:
+                    if para_type == BlockType.PHONETIC:
+                        span_type = ContentTypeV2.SPAN_PHONETIC
+                    else:
+                        span_type = ContentTypeV2.SPAN_TEXT
                 if span_type == ContentType.INLINE_EQUATION:
                     span_type = ContentTypeV2.SPAN_EQUATION_INLINE
                 if span_type in [
                     ContentTypeV2.SPAN_TEXT,
+                ]:
+                    # 定义CJK语言集合(中日韩)
+                    cjk_langs = {'zh', 'ja', 'ko'}
+                    # logger.info(f'block_lang: {block_lang}, content: {content}')
+                    # 判断是否为行末span
+                    is_last_span = j == len(line['spans']) - 1
+                    if block_lang in cjk_langs:  # 中文/日语/韩文语境下，换行不需要空格分隔,但是如果是行内公式结尾，还是要加空格
+                        if is_last_span:
+                            span_content = span['content']
+                        else:
+                            span_content = f"{span['content']} "
+                    else:
+                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
+                        if (
+                                is_last_span
+                                and is_hyphen_at_line_end(span['content'])
+                        ):
+                            # 如果下一行的第一个span是小写字母开头，删除连字符
+                            if (
+                                    i + 1 < len(para_block['lines'])
+                                    and para_block['lines'][i + 1].get('spans')
+                                    and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
+                                    and para_block['lines'][i + 1]['spans'][0].get('content', '')
+                                    and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
+                            ):
+                                span_content = span['content'][:-1]
+                            else:  # 如果没有下一行，或者下一行的第一个span不是小写字母开头，则保留连字符但不加空格
+                                span_content = span['content']
+                        else:
+                            # 西方文本语境下content间需要空格分隔
+                            span_content = f"{span['content']} "
+                    if para_content and para_content[-1]['type'] == span_type:
+                        # 合并相同类型的span
+                        para_content[-1]['content'] += span_content
+                    else:
+                        span_content = {
+                            'type': span_type,
+                            'content': span_content,
+                        }
+                        para_content.append(span_content)
+                elif span_type in [
                     ContentTypeV2.SPAN_PHONETIC,
                     ContentTypeV2.SPAN_EQUATION_INLINE,
-                    ContentTypeV2.SPAN_MD,
-                    ContentTypeV2.SPAN_CODE_INLINE,
                 ]:
                     span_content = {
                         'type': span_type,

mineru/cli/client.py CHANGED Viewed

@@ -1,10 +1,15 @@
 # Copyright (c) Opendatalab. All rights reserved.
 import os
+import sys
 import click
 from pathlib import Path
 from loguru import logger
-from mineru.utils.check_sys_env import is_mac_os_version_supported
+log_level = os.getenv("MINERU_LOG_LEVEL", "INFO").upper()
+logger.remove()  # 移除默认handler
+logger.add(sys.stderr, level=log_level)  # 添加新handler
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.config_reader import get_device
 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
@@ -13,10 +18,6 @@ from ..version import __version__
 from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
-backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-lmdeploy-engine', 'vlm-http-client']
-if is_mac_os_version_supported():
-    backends.append("vlm-mlx-engine")
 @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
 @click.pass_context
 @click.version_option(__version__,
@@ -50,24 +51,23 @@ if is_mac_os_version_supported():
       txt: Use text extraction method.
       ocr: Use OCR method for image-based PDFs.
     Without method specified, 'auto' will be used by default.
-    Adapted only for the case where the backend is set to 'pipeline'.""",
+    Adapted only for the case where the backend is set to 'pipeline' and 'hybrid-*'.""",
     default='auto',
 )
 @click.option(
     '-b',
     '--backend',
     'backend',
-    type=click.Choice(backends),
+    type=click.Choice(['pipeline', 'vlm-http-client', 'hybrid-http-client', 'vlm-auto-engine', 'hybrid-auto-engine',]),
     help="""\b
     the backend for parsing pdf:
       pipeline: More general.
-      vlm-transformers: More general, but slower.
-      vlm-mlx-engine: Faster than transformers(macOS 13.5+).
-      vlm-vllm-engine: Faster(vllm-engine).
-      vlm-lmdeploy-engine: Faster(lmdeploy-engine).
-      vlm-http-client: Faster(client suitable for openai-compatible servers).
-    Without method specified, pipeline will be used by default.""",
-    default='pipeline',
+      vlm-auto-engine: High accuracy via local computing power.
+      vlm-http-client: High accuracy via remote computing power(client suitable for openai-compatible servers).
+      hybrid-auto-engine: Next-generation high accuracy solution via local computing power.
+      hybrid-http-client: High accuracy but requires a little local computing power(client suitable for openai-compatible servers).
+    Without method specified, hybrid-auto-engine will be used by default.""",
+    default='hybrid-auto-engine',
 )
 @click.option(
     '-l',
@@ -78,7 +78,7 @@ if is_mac_os_version_supported():
     help="""
     Input the languages in the pdf (if known) to improve OCR accuracy.
     Without languages specified, 'ch' will be used by default.
-    Adapted only for the case where the backend is set to "pipeline".
+    Adapted only for the case where the backend is set to 'pipeline' and 'hybrid-*'.
     """,
     default='ch',
 )
@@ -88,7 +88,7 @@ if is_mac_os_version_supported():
     'server_url',
     type=str,
     help="""
-    When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
+    When the backend is `<vlm/hybrid>-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
     """,
     default=None,
 )
@@ -130,7 +130,7 @@ if is_mac_os_version_supported():
     'device_mode',
     type=str,
     help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
-         Adapted only for the case where the backend is set to "pipeline" and "vlm-transformers". """,
+         Adapted only for the case where the backend is set to "pipeline". """,
     default=None,
 )
 @click.option(

mineru/cli/common.py CHANGED Viewed

@@ -10,6 +10,7 @@ import pypdfium2 as pdfium
 from mineru.data.data_reader_writer import FileBasedDataWriter
 from mineru.utils.draw_bbox import draw_layout_bbox, draw_span_bbox, draw_line_sort_bbox
+from mineru.utils.engine_utils import get_vlm_engine
 from mineru.utils.enum_class import MakeMode
 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_bytes
 from mineru.utils.pdf_image_tools import images_bytes_to_pdf_bytes
@@ -304,6 +305,112 @@ def _process_vlm(
         )
+def _process_hybrid(
+        output_dir,
+        pdf_file_names,
+        pdf_bytes_list,
+        h_lang_list,
+        parse_method,
+        inline_formula_enable,
+        backend,
+        f_draw_layout_bbox,
+        f_draw_span_bbox,
+        f_dump_md,
+        f_dump_middle_json,
+        f_dump_model_output,
+        f_dump_orig_pdf,
+        f_dump_content_list,
+        f_make_md_mode,
+        server_url=None,
+        **kwargs,
+):
+    from mineru.backend.hybrid.hybrid_analyze import doc_analyze as hybrid_doc_analyze
+    """同步处理hybrid后端逻辑"""
+    if not backend.endswith("client"):
+        server_url = None
+    for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
+        pdf_file_name = pdf_file_names[idx]
+        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
+        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
+        middle_json, infer_result, _vlm_ocr_enable = hybrid_doc_analyze(
+            pdf_bytes,
+            image_writer=image_writer,
+            backend=backend,
+            parse_method=parse_method,
+            language=lang,
+            inline_formula_enable=inline_formula_enable,
+            server_url=server_url,
+            **kwargs,
+        )
+        pdf_info = middle_json["pdf_info"]
+        # f_draw_span_bbox = not _vlm_ocr_enable
+        f_draw_span_bbox = False
+        _process_output(
+            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
+            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
+            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
+            f_make_md_mode, middle_json, infer_result, is_pipeline=False
+        )
+async def _async_process_hybrid(
+        output_dir,
+        pdf_file_names,
+        pdf_bytes_list,
+        h_lang_list,
+        parse_method,
+        inline_formula_enable,
+        backend,
+        f_draw_layout_bbox,
+        f_draw_span_bbox,
+        f_dump_md,
+        f_dump_middle_json,
+        f_dump_model_output,
+        f_dump_orig_pdf,
+        f_dump_content_list,
+        f_make_md_mode,
+        server_url=None,
+        **kwargs,
+):
+    from mineru.backend.hybrid.hybrid_analyze import aio_doc_analyze as aio_hybrid_doc_analyze
+    """异步处理hybrid后端逻辑"""
+    if not backend.endswith("client"):
+        server_url = None
+    for idx, (pdf_bytes, lang) in enumerate(zip(pdf_bytes_list, h_lang_list)):
+        pdf_file_name = pdf_file_names[idx]
+        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, f"hybrid_{parse_method}")
+        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)
+        middle_json, infer_result, _vlm_ocr_enable = await aio_hybrid_doc_analyze(
+            pdf_bytes,
+            image_writer=image_writer,
+            backend=backend,
+            parse_method=parse_method,
+            language=lang,
+            inline_formula_enable=inline_formula_enable,
+            server_url=server_url,
+            **kwargs,
+        )
+        pdf_info = middle_json["pdf_info"]
+        # f_draw_span_bbox = not _vlm_ocr_enable
+        f_draw_span_bbox = False
+        _process_output(
+            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
+            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
+            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
+            f_make_md_mode, middle_json, infer_result, is_pipeline=False
+        )
 def do_parse(
         output_dir,
         pdf_file_names: list[str],
@@ -340,18 +447,40 @@ def do_parse(
         if backend.startswith("vlm-"):
             backend = backend[4:]
-        if backend == "vllm-async-engine":
-            raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
+            if backend == "vllm-async-engine":
+                raise Exception("vlm-vllm-async-engine backend is not supported in sync mode, please use vlm-vllm-engine backend")
-        os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
-        os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            if backend == "auto-engine":
+                backend = get_vlm_engine(inference_engine='auto', is_async=False)
-        _process_vlm(
-            output_dir, pdf_file_names, pdf_bytes_list, backend,
-            f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
-            f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
-            server_url, **kwargs,
-        )
+            os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
+            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            _process_vlm(
+                output_dir, pdf_file_names, pdf_bytes_list, backend,
+                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
+                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
+                server_url, **kwargs,
+            )
+        elif backend.startswith("hybrid-"):
+            backend = backend[7:]
+            if backend == "vllm-async-engine":
+                raise Exception(
+                    "hybrid-vllm-async-engine backend is not supported in sync mode, please use hybrid-vllm-engine backend")
+            if backend == "auto-engine":
+                backend = get_vlm_engine(inference_engine='auto', is_async=False)
+            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
+            _process_hybrid(
+                output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
+                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
+                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
+                server_url, **kwargs,
+            )
 async def aio_do_parse(
@@ -391,19 +520,39 @@ async def aio_do_parse(
         if backend.startswith("vlm-"):
             backend = backend[4:]
-        if backend == "vllm-engine":
-            raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
+            if backend == "vllm-engine":
+                raise Exception("vlm-vllm-engine backend is not supported in async mode, please use vlm-vllm-async-engine backend")
-        os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
-        os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            if backend == "auto-engine":
+                backend = get_vlm_engine(inference_engine='auto', is_async=True)
-        await _async_process_vlm(
-            output_dir, pdf_file_names, pdf_bytes_list, backend,
-            f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
-            f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
-            server_url, **kwargs,
-        )
+            os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable)
+            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            await _async_process_vlm(
+                output_dir, pdf_file_names, pdf_bytes_list, backend,
+                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
+                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
+                server_url, **kwargs,
+            )
+        elif backend.startswith("hybrid-"):
+            backend = backend[7:]
+            if backend == "vllm-engine":
+                raise Exception("hybrid-vllm-engine backend is not supported in async mode, please use hybrid-vllm-async-engine backend")
+            if backend == "auto-engine":
+                backend = get_vlm_engine(inference_engine='auto', is_async=True)
+            os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable)
+            os.environ['MINERU_VLM_FORMULA_ENABLE'] = "true"
+            await _async_process_hybrid(
+                output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, parse_method, formula_enable, backend,
+                f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json,
+                f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode,
+                server_url, **kwargs,
+            )
 if __name__ == "__main__":

mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl

mineru 2.6.8__py3-none-any.whl → 2.7.1__py3-none-any.whl