PyPI - mineru - Versions diffs - 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl - Mend

mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +3 -3
mineru/backend/vlm/model_output_to_middle_json.py +123 -0
mineru/backend/vlm/vlm_analyze.py +105 -16
mineru/backend/vlm/vlm_magic_model.py +201 -135
mineru/backend/vlm/vlm_middle_json_mkcontent.py +52 -11
mineru/cli/client.py +6 -5
mineru/cli/common.py +17 -16
mineru/cli/fast_api.py +9 -7
mineru/cli/gradio_app.py +15 -16
mineru/cli/vlm_vllm_server.py +4 -0
mineru/model/table/rec/unet_table/main.py +8 -0
mineru/model/vlm_vllm_model/__init__.py +0 -0
mineru/model/vlm_vllm_model/server.py +59 -0
mineru/resources/header.html +10 -2
mineru/utils/draw_bbox.py +32 -10
mineru/utils/enum_class.py +16 -2
mineru/utils/guess_suffix_or_lang.py +20 -0
mineru/utils/span_block_fix.py +4 -2
mineru/version.py +1 -1
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/METADATA +70 -25
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/RECORD +25 -38
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/entry_points.txt +1 -1
mineru/backend/vlm/base_predictor.py +0 -186
mineru/backend/vlm/hf_predictor.py +0 -217
mineru/backend/vlm/predictor.py +0 -111
mineru/backend/vlm/sglang_client_predictor.py +0 -443
mineru/backend/vlm/sglang_engine_predictor.py +0 -246
mineru/backend/vlm/token_to_middle_json.py +0 -122
mineru/backend/vlm/utils.py +0 -40
mineru/cli/vlm_sglang_server.py +0 -4
mineru/model/vlm_hf_model/__init__.py +0 -9
mineru/model/vlm_hf_model/configuration_mineru2.py +0 -38
mineru/model/vlm_hf_model/image_processing_mineru2.py +0 -269
mineru/model/vlm_hf_model/modeling_mineru2.py +0 -449
mineru/model/vlm_sglang_model/__init__.py +0 -14
mineru/model/vlm_sglang_model/engine.py +0 -264
mineru/model/vlm_sglang_model/image_processor.py +0 -213
mineru/model/vlm_sglang_model/logit_processor.py +0 -90
mineru/model/vlm_sglang_model/model.py +0 -453
mineru/model/vlm_sglang_model/server.py +0 -75
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/WHEEL +0 -0
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/licenses/LICENSE.md +0 -0
{mineru-2.2.2.dist-info → mineru-2.5.1.dist-info}/top_level.txt +0 -0

mineru/backend/vlm/vlm_magic_model.py CHANGED Viewed

@@ -3,46 +3,37 @@ from typing import Literal
 from loguru import logger
-from mineru.utils.enum_class import ContentType, BlockType, SplitFlag
-from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
-from mineru.utils.format_utils import block_content_to_html
+from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
+from mineru.utils.enum_class import ContentType, BlockType
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
 from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_distance_v3
 class MagicModel:
-    def __init__(self, token: str, width, height):
-        self.token = token
-        # 使用正则表达式查找所有块
-        pattern = (
-            r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
-        )
-        block_infos = re.findall(pattern, token, re.DOTALL)
+    def __init__(self, page_blocks: list, width, height):
+        self.page_blocks = page_blocks
         blocks = []
         self.all_spans = []
         # 解析每个块
-        for index, block_info in enumerate(block_infos):
-            block_bbox = block_info[0].strip()
+        for index, block_info in enumerate(page_blocks):
+            block_bbox = block_info["bbox"]
             try:
-                x1, y1, x2, y2 = map(int, block_bbox.split())
+                x1, y1, x2, y2 = block_bbox
                 x_1, y_1, x_2, y_2 = (
-                    int(x1 * width / 1000),
-                    int(y1 * height / 1000),
-                    int(x2 * width / 1000),
-                    int(y2 * height / 1000),
+                    int(x1 * width),
+                    int(y1 * height),
+                    int(x2 * width),
+                    int(y2 * height),
                 )
                 if x_2 < x_1:
                     x_1, x_2 = x_2, x_1
                 if y_2 < y_1:
                     y_1, y_2 = y_2, y_1
                 block_bbox = (x_1, y_1, x_2, y_2)
-                block_type = block_info[1].strip()
-                block_content = block_info[2].strip()
-                # 如果bbox是0,0,999,999，且type为text，按notes增加表格处理
-                if x1 == 0 and y1 == 0 and x2 == 999 and y2 == 999 and block_type == "text":
-                    block_content = block_content_to_html(block_content)
+                block_type = block_info["type"]
+                block_content = block_info["content"]
+                block_angle = block_info["angle"]
                 # print(f"坐标: {block_bbox}")
                 # print(f"类型: {block_type}")
@@ -54,6 +45,9 @@ class MagicModel:
                 continue
             span_type = "unknown"
+            line_type = None
+            guess_lang = None
             if block_type in [
                 "text",
                 "title",
@@ -61,8 +55,15 @@ class MagicModel:
                 "image_footnote",
                 "table_caption",
                 "table_footnote",
-                "list",
-                "index",
+                "code_caption",
+                "ref_text",
+                "phonetic",
+                "header",
+                "footer",
+                "page_number",
+                "aside_text",
+                "page_footnote",
+                "list"
             ]:
                 span_type = ContentType.TEXT
             elif block_type in ["image"]:
@@ -71,6 +72,12 @@ class MagicModel:
             elif block_type in ["table"]:
                 block_type = BlockType.TABLE_BODY
                 span_type = ContentType.TABLE
+            elif block_type in ["code", "algorithm"]:
+                block_content = code_content_clean(block_content)
+                line_type = block_type
+                block_type = BlockType.CODE_BODY
+                span_type = ContentType.TEXT
+                guess_lang = guess_language_by_text(block_content)
             elif block_type in ["equation"]:
                 block_type = BlockType.INTERLINE_EQUATION
                 span_type = ContentType.INTERLINE_EQUATION
@@ -81,7 +88,7 @@ class MagicModel:
                     "type": span_type,
                 }
                 if span_type == ContentType.TABLE:
-                    span["html"] = block_content_to_html(block_content)
+                    span["html"] = block_content
             elif span_type in [ContentType.INTERLINE_EQUATION]:
                 span = {
                     "bbox": block_bbox,
@@ -89,7 +96,12 @@ class MagicModel:
                     "content": isolated_formula_clean(block_content),
                 }
             else:
-                if block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
+                if block_content:
+                    block_content = clean_content(block_content)
+                if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
                     # 生成包含文本和公式的span列表
                     spans = []
                     last_end = 0
@@ -136,25 +148,27 @@ class MagicModel:
                         "content": block_content,
                     }
+            # 处理span类型并添加到all_spans
             if isinstance(span, dict) and "bbox" in span:
                 self.all_spans.append(span)
-                line = {
-                    "bbox": block_bbox,
-                    "spans": [span],
-                }
+                spans = [span]
             elif isinstance(span, list):
                 self.all_spans.extend(span)
-                line = {
-                    "bbox": block_bbox,
-                    "spans": span,
-                }
+                spans = span
             else:
                 raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
+            # 构造line对象
+            if block_type in [BlockType.CODE_BODY]:
+                line = {"bbox": block_bbox, "spans": spans, "extra": {"type": line_type, "guess_lang": guess_lang}}
+            else:
+                line = {"bbox": block_bbox, "spans": spans}
             blocks.append(
                 {
                     "bbox": block_bbox,
                     "type": block_type,
+                    "angle": block_angle,
                     "lines": [line],
                     "index": index,
                 }
@@ -165,35 +179,87 @@ class MagicModel:
         self.interline_equation_blocks = []
         self.text_blocks = []
         self.title_blocks = []
+        self.code_blocks = []
+        self.discarded_blocks = []
+        self.ref_text_blocks = []
+        self.phonetic_blocks = []
+        self.list_blocks = []
         for block in blocks:
             if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
                 self.image_blocks.append(block)
             elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
                 self.table_blocks.append(block)
+            elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
+                self.code_blocks.append(block)
             elif block["type"] == BlockType.INTERLINE_EQUATION:
                 self.interline_equation_blocks.append(block)
             elif block["type"] == BlockType.TEXT:
                 self.text_blocks.append(block)
             elif block["type"] == BlockType.TITLE:
                 self.title_blocks.append(block)
+            elif block["type"] in [BlockType.REF_TEXT]:
+                self.ref_text_blocks.append(block)
+            elif block["type"] in [BlockType.PHONETIC]:
+                self.phonetic_blocks.append(block)
+            elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
+                self.discarded_blocks.append(block)
+            elif block["type"] == BlockType.LIST:
+                self.list_blocks.append(block)
             else:
                 continue
+        self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
+        self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
+        self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
+        self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
+        for code_block in self.code_blocks:
+            for block in code_block['blocks']:
+                if block['type'] == BlockType.CODE_BODY:
+                    if len(block["lines"]) > 0:
+                        line = block["lines"][0]
+                        code_block["sub_type"] = line["extra"]["type"]
+                        if code_block["sub_type"] in ["code"]:
+                            code_block["guess_lang"] = line["extra"]["guess_lang"]
+                        del line["extra"]
+                    else:
+                        code_block["sub_type"] = "code"
+                        code_block["guess_lang"] = "txt"
+        for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
+            block["type"] = BlockType.TEXT
+            self.text_blocks.append(block)
+    def get_list_blocks(self):
+        return self.list_blocks
     def get_image_blocks(self):
-        return fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
+        return self.image_blocks
     def get_table_blocks(self):
-        return fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
+        return self.table_blocks
+    def get_code_blocks(self):
+        return self.code_blocks
+    def get_ref_text_blocks(self):
+        return self.ref_text_blocks
+    def get_phonetic_blocks(self):
+        return self.phonetic_blocks
     def get_title_blocks(self):
-        return fix_title_blocks(self.title_blocks)
+        return self.title_blocks
     def get_text_blocks(self):
-        return fix_text_blocks(self.text_blocks)
+        return self.text_blocks
     def get_interline_equation_blocks(self):
         return self.interline_equation_blocks
+    def get_discarded_blocks(self):
+        return self.discarded_blocks
     def get_all_spans(self):
         return self.all_spans
@@ -202,48 +268,46 @@ def isolated_formula_clean(txt):
     latex = txt[:]
     if latex.startswith("\\["): latex = latex[2:]
     if latex.endswith("\\]"): latex = latex[:-2]
-    latex = latex_fix(latex.strip())
+    latex = latex.strip()
     return latex
-def latex_fix(latex):
-    # valid pairs:
-    # \left\{ ... \right\}
-    # \left( ... \right)
-    # \left| ... \right|
-    # \left\| ... \right\|
-    # \left[ ... \right]
-    LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
-    RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
-    left_count = len(LEFT_COUNT_PATTERN.findall(latex))  # 不匹配\lefteqn等
-    right_count = len(RIGHT_COUNT_PATTERN.findall(latex))  # 不匹配\rightarrow
-    if left_count != right_count:
-        for _ in range(2):
-            # replace valid pairs
-            latex = re.sub(r'\\left\\\{', "{", latex) # \left\{
-            latex = re.sub(r"\\left\|", "|", latex) # \left|
-            latex = re.sub(r"\\left\\\|", "|", latex) # \left\|
-            latex = re.sub(r"\\left\(", "(", latex) # \left(
-            latex = re.sub(r"\\left\[", "[", latex) # \left[
-            latex = re.sub(r"\\right\\\}", "}", latex) # \right\}
-            latex = re.sub(r"\\right\|", "|", latex) # \right|
-            latex = re.sub(r"\\right\\\|", "|", latex) # \right\|
-            latex = re.sub(r"\\right\)", ")", latex) # \right)
-            latex = re.sub(r"\\right\]", "]", latex) # \right]
-            latex = re.sub(r"\\right\.", "", latex) # \right.
-            # replace invalid pairs first
-            latex = re.sub(r'\\left\{', "{", latex)
-            latex = re.sub(r'\\right\}', "}", latex) # \left{ ... \right}
-            latex = re.sub(r'\\left\\\(', "(", latex)
-            latex = re.sub(r'\\right\\\)', ")", latex) # \left\( ... \right\)
-            latex = re.sub(r'\\left\\\[', "[", latex)
-            latex = re.sub(r'\\right\\\]', "]", latex) # \left\[ ... \right\]
+def code_content_clean(content):
+    """清理代码内容，移除Markdown代码块的开始和结束标记"""
+    if not content:
+        return ""
+    lines = content.splitlines()
+    start_idx = 0
+    end_idx = len(lines)
+    # 处理开头的三个反引号
+    if lines and lines[0].startswith("```"):
+        start_idx = 1
+    # 处理结尾的三个反引号
+    if lines and end_idx > start_idx and lines[end_idx - 1].strip() == "```":
+        end_idx -= 1
+    # 只有在有内容时才进行join操作
+    if start_idx < end_idx:
+        return "\n".join(lines[start_idx:end_idx]).strip()
+    return ""
-    return latex
+def clean_content(content):
+    if content and content.count("\\[") == content.count("\\]") and content.count("\\[") > 0:
+        # Function to handle each match
+        def replace_pattern(match):
+            # Extract content between \[ and \]
+            inner_content = match.group(1)
+            return f"[{inner_content}]"
+        # Find all patterns of \[x\] and apply replacement
+        pattern = r'\\\[(.*?)\\\]'
+        content = re.sub(pattern, replace_pattern, content)
+    return content
 def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_type):
@@ -252,7 +316,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
                     filter(
                         lambda x: x["type"] == subject_block_type,
                         blocks,
@@ -265,7 +329,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
         return reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle":x["angle"]},
                     filter(
                         lambda x: x["type"] == object_block_type,
                         blocks,
@@ -281,7 +345,7 @@ def __tie_up_category_by_distance_v3(blocks, subject_block_type, object_block_ty
     )
-def get_type_blocks(blocks, block_type: Literal["image", "table"]):
+def get_type_blocks(blocks, block_type: Literal["image", "table", "code"]):
     with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
     with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
     ret = []
@@ -297,9 +361,13 @@ def get_type_blocks(blocks, block_type: Literal["image", "table"]):
     return ret
-def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
+def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
     need_fix_blocks = get_type_blocks(blocks, fix_type)
     fixed_blocks = []
+    not_include_blocks = []
+    processed_indices = set()
+    # 处理需要组织成two_layer结构的blocks
     for block in need_fix_blocks:
         body = block[f"{fix_type}_body"]
         caption_list = block[f"{fix_type}_caption_list"]
@@ -308,8 +376,12 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
         body["type"] = f"{fix_type}_body"
         for caption in caption_list:
             caption["type"] = f"{fix_type}_caption"
+            processed_indices.add(caption["index"])
         for footnote in footnote_list:
             footnote["type"] = f"{fix_type}_footnote"
+            processed_indices.add(footnote["index"])
+        processed_indices.add(body["index"])
         two_layer_block = {
             "type": fix_type,
@@ -323,58 +395,52 @@ def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
         fixed_blocks.append(two_layer_block)
-    return fixed_blocks
-def fix_title_blocks(blocks):
+    # 添加未处理的blocks
     for block in blocks:
-        if block["type"] == BlockType.TITLE:
-            title_content = merge_para_with_text(block)
-            title_level = count_leading_hashes(title_content)
-            block['level'] = title_level
-            for line in block['lines']:
-                for span in line['spans']:
-                    span['content'] = strip_leading_hashes(span['content'])
-                    break
+        if block["index"] not in processed_indices:
+            # 直接添加未处理的block
+            not_include_blocks.append(block)
+    return fixed_blocks, not_include_blocks
+def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
+    for list_block in list_blocks:
+        list_block["blocks"] = []
+        if "lines" in list_block:
+            del list_block["lines"]
+    temp_text_blocks = text_blocks + ref_text_blocks
+    need_remove_blocks = []
+    for block in temp_text_blocks:
+        for list_block in list_blocks:
+            if calculate_overlap_area_in_bbox1_area_ratio(block["bbox"], list_block["bbox"]) >= 0.8:
+                list_block["blocks"].append(block)
+                need_remove_blocks.append(block)
                 break
-    return blocks
-def count_leading_hashes(text):
-    match = re.match(r'^(#+)', text)
-    return len(match.group(1)) if match else 0
-def strip_leading_hashes(text):
-    # 去除开头的#和紧随其后的空格
-    return re.sub(r'^#+\s*', '', text)
-def fix_text_blocks(blocks):
-    i = 0
-    while i < len(blocks):
-        block = blocks[i]
-        last_line = block["lines"][-1]if block["lines"] else None
-        if last_line:
-            last_span = last_line["spans"][-1] if last_line["spans"] else None
-            if last_span and last_span['content'].endswith('<|txt_contd|>'):
-                last_span['content'] = last_span['content'][:-len('<|txt_contd|>')]
-                # 查找下一个未被清空的块
-                next_idx = i + 1
-                while next_idx < len(blocks) and blocks[next_idx].get(SplitFlag.LINES_DELETED, False):
-                    next_idx += 1
-                # 如果找到下一个有效块，则合并
-                if next_idx < len(blocks):
-                    next_block = blocks[next_idx]
-                    # 将下一个块的lines扩展到当前块的lines中
-                    block["lines"].extend(next_block["lines"])
-                    # 清空下一个块的lines
-                    next_block["lines"] = []
-                    # 在下一个块中添加标志
-                    next_block[SplitFlag.LINES_DELETED] = True
-                    # 不增加i，继续检查当前块（现在已包含下一个块的内容）
-                    continue
-        i += 1
-    return blocks
+    for block in need_remove_blocks:
+        if block in text_blocks:
+            text_blocks.remove(block)
+        elif block in ref_text_blocks:
+            ref_text_blocks.remove(block)
+    # 移除blocks为空的list_block
+    list_blocks = [lb for lb in list_blocks if lb["blocks"]]
+    for list_block in list_blocks:
+        # 统计list_block["blocks"]中所有block的type，用众数作为list_block的sub_type
+        type_count = {}
+        line_content = []
+        for sub_block in list_block["blocks"]:
+            sub_block_type = sub_block["type"]
+            if sub_block_type not in type_count:
+                type_count[sub_block_type] = 0
+            type_count[sub_block_type] += 1
+        if type_count:
+            list_block["sub_type"] = max(type_count, key=type_count.get)
+        else:
+            list_block["sub_type"] = "unknown"
+    return list_blocks, text_blocks, ref_text_blocks

mineru/backend/vlm/vlm_middle_json_mkcontent.py CHANGED Viewed

@@ -3,7 +3,6 @@ import os
 from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
 from mineru.utils.enum_class import MakeMode, BlockType, ContentType
 latex_delimiters_config = get_latex_delimiter_config()
 default_delimiters = {
@@ -50,8 +49,12 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
     for para_block in para_blocks:
         para_text = ''
         para_type = para_block['type']
-        if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
+        if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
             para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
+        elif para_type == BlockType.LIST:
+            for block in para_block['blocks']:
+                item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
+                para_text += f"{item_text}\n"
         elif para_type == BlockType.TITLE:
             title_level = get_title_level(para_block)
             para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
@@ -112,6 +115,18 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TABLE_FOOTNOTE:
                         para_text += '\n' + merge_para_with_text(block) + '  '
+        elif para_type == BlockType.CODE:
+            sub_type = para_block["sub_type"]
+            for block in para_block['blocks']:  # 1st.拼code_caption
+                if block['type'] == BlockType.CODE_CAPTION:
+                    para_text += merge_para_with_text(block) + '  \n'
+            for block in para_block['blocks']:  # 2nd.拼code_body
+                if block['type'] == BlockType.CODE_BODY:
+                    if sub_type == BlockType.CODE:
+                        guess_lang = para_block["guess_lang"]
+                        para_text += f"```{guess_lang}\n{merge_para_with_text(block)}\n```"
+                    elif sub_type == BlockType.ALGORITHM:
+                        para_text += merge_para_with_text(block)
         if para_text.strip() == '':
             continue
@@ -122,17 +137,33 @@ def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable,
     return page_markdown
 def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
-    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
+    if para_type in [
+        BlockType.TEXT,
+        BlockType.REF_TEXT,
+        BlockType.PHONETIC,
+        BlockType.HEADER,
+        BlockType.FOOTER,
+        BlockType.PAGE_NUMBER,
+        BlockType.ASIDE_TEXT,
+        BlockType.PAGE_FOOTNOTE,
+    ]:
         para_content = {
-            'type': ContentType.TEXT,
+            'type': para_type,
             'text': merge_para_with_text(para_block),
         }
+    elif para_type == BlockType.LIST:
+        para_content = {
+            'type': para_type,
+            'sub_type': para_block.get('sub_type', ''),
+            'list_items':[],
+        }
+        for block in para_block['blocks']:
+            item_text = merge_para_with_text(block)
+            if item_text.strip():
+                para_content['list_items'].append(item_text)
     elif para_type == BlockType.TITLE:
         title_level = get_title_level(para_block)
         para_content = {
@@ -178,15 +209,24 @@ def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
             if block['type'] == BlockType.TABLE_FOOTNOTE:
                 para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
+    elif para_type == BlockType.CODE:
+        para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
+        for block in para_block['blocks']:
+            if block['type'] == BlockType.CODE_BODY:
+                para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
+                if para_block["sub_type"] == BlockType.CODE:
+                    para_content["guess_lang"] = para_block["guess_lang"]
+            if block['type'] == BlockType.CODE_CAPTION:
+                para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
-    page_weight, page_height = page_size
+    page_width, page_height = page_size
     para_bbox = para_block.get('bbox')
     if para_bbox:
         x0, y0, x1, y1 = para_bbox
         para_content['bbox'] = [
-            int(x0 * 1000 / page_weight),
+            int(x0 * 1000 / page_width),
             int(y0 * 1000 / page_height),
-            int(x1 * 1000 / page_weight),
+            int(x1 * 1000 / page_width),
             int(y1 * 1000 / page_height),
         ]
@@ -205,6 +245,7 @@ def union_make(pdf_info_dict: list,
     output_content = []
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
+        paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
         if not paras_of_layout:
@@ -213,7 +254,7 @@ def union_make(pdf_info_dict: list,
             page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout:
+            for para_block in paras_of_layout+paras_of_discarded:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)

mineru/cli/client.py CHANGED Viewed

@@ -6,6 +6,7 @@ from loguru import logger
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.config_reader import get_device
+from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
 from mineru.utils.model_utils import get_vram
 from ..version import __version__
 from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
@@ -49,12 +50,12 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     '-b',
     '--backend',
     'backend',
-    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client']),
+    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
     help="""the backend for parsing pdf:
     pipeline: More general.
     vlm-transformers: More general.
-    vlm-sglang-engine: Faster(engine).
-    vlm-sglang-client: Faster(client).
+    vlm-vllm-engine: Faster(engine).
+    vlm-http-client: Faster(client).
     without method specified, pipeline will be used by default.""",
     default='pipeline',
 )
@@ -77,7 +78,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     'server_url',
     type=str,
     help="""
-    When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
+    When the backend is `vlm-http-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
     """,
     default=None,
 )
@@ -202,7 +203,7 @@ def main(
     if os.path.isdir(input_path):
         doc_path_list = []
         for doc_path in Path(input_path).glob('*'):
-            if doc_path.suffix in pdf_suffixes + image_suffixes:
+            if guess_suffix_by_path(doc_path) in pdf_suffixes + image_suffixes:
                 doc_path_list.append(doc_path)
         parse_doc(doc_path_list)
     else:

mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl

mineru 2.2.2__py3-none-any.whl → 2.5.1__py3-none-any.whl