PyPI - mineru - Versions diffs - 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl - Mend

mineru 2.6.8py3-none-any.whl → 2.7.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

mineru/backend/hybrid/__init__.py +1 -0
mineru/backend/hybrid/hybrid_analyze.py +526 -0
mineru/backend/hybrid/hybrid_magic_model.py +617 -0
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
mineru/backend/pipeline/batch_analyze.py +9 -1
mineru/backend/pipeline/model_init.py +96 -1
mineru/backend/pipeline/pipeline_analyze.py +6 -4
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
mineru/backend/vlm/utils.py +3 -1
mineru/backend/vlm/vlm_analyze.py +12 -12
mineru/backend/vlm/vlm_magic_model.py +24 -89
mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
mineru/cli/client.py +17 -17
mineru/cli/common.py +170 -20
mineru/cli/fast_api.py +39 -13
mineru/cli/gradio_app.py +232 -206
mineru/model/mfd/yolo_v8.py +12 -6
mineru/model/mfr/unimernet/Unimernet.py +71 -3
mineru/resources/header.html +5 -1
mineru/utils/boxbase.py +23 -0
mineru/utils/char_utils.py +55 -0
mineru/utils/engine_utils.py +74 -0
mineru/utils/enum_class.py +18 -1
mineru/utils/magic_model_utils.py +85 -2
mineru/utils/span_pre_proc.py +5 -3
mineru/utils/table_merge.py +5 -21
mineru/version.py +1 -1
mineru-2.7.0.dist-info/METADATA +433 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
mineru-2.6.8.dist-info/METADATA +0 -954
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0

mineru/backend/hybrid/hybrid_magic_model.py ADDED Viewed

@@ -0,0 +1,617 @@
+import re
+from typing import Literal
+from loguru import logger
+from mineru.utils.boxbase import calculate_overlap_area_in_bbox1_area_ratio
+from mineru.utils.enum_class import ContentType, BlockType, NotExtractType
+from mineru.utils.guess_suffix_or_lang import guess_language_by_text
+from mineru.utils.magic_model_utils import reduct_overlap, tie_up_category_by_index
+from mineru.utils.span_block_fix import fix_text_block
+from mineru.utils.span_pre_proc import txt_spans_extract
+not_extract_list = [item.value for item in NotExtractType]
+class MagicModel:
+    def __init__(self,
+        page_blocks: list,
+        page_inline_formula,
+        page_ocr_res,
+        page,
+        scale,
+        page_pil_img,
+        width,
+        height,
+        _ocr_enable,
+        _vlm_ocr_enable,
+    ):
+        self.page_blocks = page_blocks
+        self.page_inline_formula = page_inline_formula
+        self.page_ocr_res = page_ocr_res
+        self.width = width
+        self.height = height
+        blocks = []
+        self.all_spans = []
+        page_text_inline_formula_spans = []
+        if not _vlm_ocr_enable:
+            for inline_formula in page_inline_formula:
+                inline_formula["bbox"] = self.cal_real_bbox(inline_formula["bbox"])
+                inline_formula_latex = inline_formula.pop("latex", "")
+                if inline_formula_latex:
+                    page_text_inline_formula_spans.append({
+                        "bbox": inline_formula["bbox"],
+                        "type": ContentType.INLINE_EQUATION,
+                        "content": inline_formula_latex,
+                        "score": inline_formula["score"],
+                    })
+            for ocr_res in page_ocr_res:
+                ocr_res["bbox"] = self.cal_real_bbox(ocr_res["bbox"])
+                if ocr_res['category_id'] == 15:
+                    page_text_inline_formula_spans.append({
+                        "bbox": ocr_res["bbox"],
+                        "type": ContentType.TEXT,
+                        "content": ocr_res["text"],
+                        "score": ocr_res["score"],
+                    })
+            if not _ocr_enable:
+                virtual_block = [0, 0, width, height, None, None, None, "text"]
+                page_text_inline_formula_spans = txt_spans_extract(page, page_text_inline_formula_spans, page_pil_img, scale, [virtual_block],[])
+        # 解析每个块
+        for index, block_info in enumerate(page_blocks):
+            try:
+                block_bbox = self.cal_real_bbox(block_info["bbox"])
+                block_type = block_info["type"]
+                block_content = block_info["content"]
+                block_angle = block_info["angle"]
+                # print(f"坐标: {block_bbox}")
+                # print(f"类型: {block_type}")
+                # print(f"内容: {block_content}")
+                # print("-" * 50)
+            except Exception as e:
+                # 如果解析失败，可能是因为格式不正确，跳过这个块
+                logger.warning(f"Invalid block format: {block_info}, error: {e}")
+                continue
+            span_type = "unknown"
+            code_block_sub_type = None
+            guess_lang = None
+            if block_type in [
+                "text",
+                "title",
+                "image_caption",
+                "image_footnote",
+                "table_caption",
+                "table_footnote",
+                "code_caption",
+                "ref_text",
+                "phonetic",
+                "header",
+                "footer",
+                "page_number",
+                "aside_text",
+                "page_footnote",
+                "list"
+            ]:
+                span_type = ContentType.TEXT
+            elif block_type in ["image"]:
+                block_type = BlockType.IMAGE_BODY
+                span_type = ContentType.IMAGE
+            elif block_type in ["table"]:
+                block_type = BlockType.TABLE_BODY
+                span_type = ContentType.TABLE
+            elif block_type in ["code", "algorithm"]:
+                block_content = code_content_clean(block_content)
+                code_block_sub_type = block_type
+                block_type = BlockType.CODE_BODY
+                span_type = ContentType.TEXT
+                guess_lang = guess_language_by_text(block_content)
+            elif block_type in ["equation"]:
+                block_type = BlockType.INTERLINE_EQUATION
+                span_type = ContentType.INTERLINE_EQUATION
+            #  code 和 algorithm 类型的块，如果内容中包含行内公式，则需要将块类型切换为algorithm
+            switch_code_to_algorithm = False
+            span = None
+            if span_type in ["image", "table"]:
+                span = {
+                    "bbox": block_bbox,
+                    "type": span_type,
+                }
+                if span_type == ContentType.TABLE:
+                    span["html"] = block_content
+            elif span_type in [ContentType.INTERLINE_EQUATION]:
+                span = {
+                    "bbox": block_bbox,
+                    "type": span_type,
+                    "content": isolated_formula_clean(block_content),
+                }
+            elif _vlm_ocr_enable or block_type not in not_extract_list:
+                #  vlm_ocr_enable模式下，所有文本块都直接使用block的内容
+                #  非vlm_ocr_enable模式下，非提取块需要使用span填充方式
+                if block_content:
+                    block_content = clean_content(block_content)
+                if block_content and block_content.count("\\(") == block_content.count("\\)") and block_content.count("\\(") > 0:
+                    switch_code_to_algorithm = True
+                    # 生成包含文本和公式的span列表
+                    spans = []
+                    last_end = 0
+                    # 查找所有公式
+                    for match in re.finditer(r'\\\((.+?)\\\)', block_content):
+                        start, end = match.span()
+                        # 添加公式前的文本
+                        if start > last_end:
+                            text_before = block_content[last_end:start]
+                            if text_before.strip():
+                                spans.append({
+                                    "bbox": block_bbox,
+                                    "type": ContentType.TEXT,
+                                    "content": text_before
+                                })
+                        # 添加公式（去除\(和\)）
+                        formula = match.group(1)
+                        spans.append({
+                            "bbox": block_bbox,
+                            "type": ContentType.INLINE_EQUATION,
+                            "content": formula.strip()
+                        })
+                        last_end = end
+                    # 添加最后一个公式后的文本
+                    if last_end < len(block_content):
+                        text_after = block_content[last_end:]
+                        if text_after.strip():
+                            spans.append({
+                                "bbox": block_bbox,
+                                "type": ContentType.TEXT,
+                                "content": text_after
+                            })
+                    span = spans
+                else:
+                    span = {
+                        "bbox": block_bbox,
+                        "type": span_type,
+                        "content": block_content,
+                    }
+            if (
+                    span_type in ["image", "table", ContentType.INTERLINE_EQUATION]
+                    or (_vlm_ocr_enable or block_type not in not_extract_list)
+            ):
+                if span is None:
+                    continue
+                # 处理span类型并添加到all_spans
+                if isinstance(span, dict) and "bbox" in span:
+                    self.all_spans.append(span)
+                    spans = [span]
+                elif isinstance(span, list):
+                    self.all_spans.extend(span)
+                    spans = span
+                else:
+                    raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")
+                # 构造line对象
+                if block_type in [BlockType.CODE_BODY]:
+                    if switch_code_to_algorithm and code_block_sub_type == "code":
+                        code_block_sub_type = "algorithm"
+                    line = {"bbox": block_bbox, "spans": spans,
+                            "extra": {"type": code_block_sub_type, "guess_lang": guess_lang}}
+                else:
+                    line = {"bbox": block_bbox, "spans": spans}
+                block = {
+                    "bbox": block_bbox,
+                    "type": block_type,
+                    "angle": block_angle,
+                    "lines": [line],
+                    "index": index,
+                }
+            else:  #  使用span填充方式
+                block_spans = []
+                for span in page_text_inline_formula_spans:
+                    if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                        block_spans.append(span)
+                # 从spans删除已经放入block_spans中的span
+                if len(block_spans) > 0:
+                    for span in block_spans:
+                        page_text_inline_formula_spans.remove(span)
+                block = {
+                    "bbox": block_bbox,
+                    "type": block_type,
+                    "angle": block_angle,
+                    "spans": block_spans,
+                    "index": index,
+                }
+                block = fix_text_block(block)
+            blocks.append(block)
+        self.image_blocks = []
+        self.table_blocks = []
+        self.interline_equation_blocks = []
+        self.text_blocks = []
+        self.title_blocks = []
+        self.code_blocks = []
+        self.discarded_blocks = []
+        self.ref_text_blocks = []
+        self.phonetic_blocks = []
+        self.list_blocks = []
+        for block in blocks:
+            if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
+                self.image_blocks.append(block)
+            elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
+                self.table_blocks.append(block)
+            elif block["type"] in [BlockType.CODE_BODY, BlockType.CODE_CAPTION]:
+                self.code_blocks.append(block)
+            elif block["type"] == BlockType.INTERLINE_EQUATION:
+                self.interline_equation_blocks.append(block)
+            elif block["type"] == BlockType.TEXT:
+                self.text_blocks.append(block)
+            elif block["type"] == BlockType.TITLE:
+                self.title_blocks.append(block)
+            elif block["type"] in [BlockType.REF_TEXT]:
+                self.ref_text_blocks.append(block)
+            elif block["type"] in [BlockType.PHONETIC]:
+                self.phonetic_blocks.append(block)
+            elif block["type"] in [BlockType.HEADER, BlockType.FOOTER, BlockType.PAGE_NUMBER, BlockType.ASIDE_TEXT, BlockType.PAGE_FOOTNOTE]:
+                self.discarded_blocks.append(block)
+            elif block["type"] == BlockType.LIST:
+                self.list_blocks.append(block)
+            else:
+                continue
+        self.list_blocks, self.text_blocks, self.ref_text_blocks = fix_list_blocks(self.list_blocks, self.text_blocks, self.ref_text_blocks)
+        self.image_blocks, not_include_image_blocks = fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)
+        self.table_blocks, not_include_table_blocks = fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)
+        self.code_blocks, not_include_code_blocks = fix_two_layer_blocks(self.code_blocks, BlockType.CODE)
+        for code_block in self.code_blocks:
+            for block in code_block['blocks']:
+                if block['type'] == BlockType.CODE_BODY:
+                    if len(block["lines"]) > 0:
+                        line = block["lines"][0]
+                        code_block["sub_type"] = line["extra"]["type"]
+                        if code_block["sub_type"] in ["code"]:
+                            code_block["guess_lang"] = line["extra"]["guess_lang"]
+                        del line["extra"]
+                    else:
+                        code_block["sub_type"] = "code"
+                        code_block["guess_lang"] = "txt"
+        for block in not_include_image_blocks + not_include_table_blocks + not_include_code_blocks:
+            block["type"] = BlockType.TEXT
+            self.text_blocks.append(block)
+    def cal_real_bbox(self, bbox):
+        x1, y1, x2, y2 = bbox
+        x_1, y_1, x_2, y_2 = (
+            int(x1 * self.width),
+            int(y1 * self.height),
+            int(x2 * self.width),
+            int(y2 * self.height),
+        )
+        if x_2 < x_1:
+            x_1, x_2 = x_2, x_1
+        if y_2 < y_1:
+            y_1, y_2 = y_2, y_1
+        bbox = (x_1, y_1, x_2, y_2)
+        return bbox
+    def get_list_blocks(self):
+        return self.list_blocks
+    def get_image_blocks(self):
+        return self.image_blocks
+    def get_table_blocks(self):
+        return self.table_blocks
+    def get_code_blocks(self):
+        return self.code_blocks
+    def get_ref_text_blocks(self):
+        return self.ref_text_blocks
+    def get_phonetic_blocks(self):
+        return self.phonetic_blocks
+    def get_title_blocks(self):
+        return self.title_blocks
+    def get_text_blocks(self):
+        return self.text_blocks
+    def get_interline_equation_blocks(self):
+        return self.interline_equation_blocks
+    def get_discarded_blocks(self):
+        return self.discarded_blocks
+    def get_all_spans(self):
+        return self.all_spans
+def isolated_formula_clean(txt):
+    latex = txt[:]
+    if latex.startswith("\\["): latex = latex[2:]
+    if latex.endswith("\\]"): latex = latex[:-2]
+    latex = latex.strip()
+    return latex
+def code_content_clean(content):
+    """清理代码内容，移除Markdown代码块的开始和结束标记"""
+    if not content:
+        return ""
+    lines = content.splitlines()
+    start_idx = 0
+    end_idx = len(lines)
+    # 处理开头的三个反引号
+    if lines and lines[0].startswith("```"):
+        start_idx = 1
+    # 处理结尾的三个反引号
+    if lines and end_idx > start_idx and lines[end_idx - 1].strip() == "```":
+        end_idx -= 1
+    # 只有在有内容时才进行join操作
+    if start_idx < end_idx:
+        return "\n".join(lines[start_idx:end_idx]).strip()
+    return ""
+def clean_content(content):
+    if content and content.count("\\[") == content.count("\\]") and content.count("\\[") > 0:
+        # Function to handle each match
+        def replace_pattern(match):
+            # Extract content between \[ and \]
+            inner_content = match.group(1)
+            return f"[{inner_content}]"
+        # Find all patterns of \[x\] and apply replacement
+        pattern = r'\\\[(.*?)\\\]'
+        content = re.sub(pattern, replace_pattern, content)
+    return content
+def __tie_up_category_by_index(blocks, subject_block_type, object_block_type):
+    """基于index的主客体关联包装函数"""
+    # 定义获取主体和客体对象的函数
+    def get_subjects():
+        return reduct_overlap(
+            list(
+                map(
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
+                    filter(
+                        lambda x: x["type"] == subject_block_type,
+                        blocks,
+                    ),
+                )
+            )
+        )
+    def get_objects():
+        return reduct_overlap(
+            list(
+                map(
+                    lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"], "angle": x["angle"]},
+                    filter(
+                        lambda x: x["type"] == object_block_type,
+                        blocks,
+                    ),
+                )
+            )
+        )
+    # 调用通用方法
+    return tie_up_category_by_index(
+        get_subjects,
+        get_objects
+    )
+def get_type_blocks_by_index(blocks, block_type: Literal["image", "table", "code"]):
+    """使用基于index的匹配策略来组织blocks"""
+    with_captions = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_caption")
+    with_footnotes = __tie_up_category_by_index(blocks, f"{block_type}_body", f"{block_type}_footnote")
+    ret = []
+    for v in with_captions:
+        record = {
+            f"{block_type}_body": v["sub_bbox"],
+            f"{block_type}_caption_list": v["obj_bboxes"],
+        }
+        filter_idx = v["sub_idx"]
+        d = next(filter(lambda x: x["sub_idx"] == filter_idx, with_footnotes))
+        record[f"{block_type}_footnote_list"] = d["obj_bboxes"]
+        ret.append(record)
+    return ret
+def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table", "code"]):
+    need_fix_blocks = get_type_blocks_by_index(blocks, fix_type)
+    fixed_blocks = []
+    not_include_blocks = []
+    processed_indices = set()
+    # 特殊处理表格类型，确保标题在表格前，注脚在表格后
+    if fix_type in ["table", "image"]:
+        # 收集所有不合适的caption和footnote
+        misplaced_footnotes = []  # 存储(footnote, 原始block索引)
+        # 第一步：移除不符合位置要求的footnote
+        for block_idx, block in enumerate(need_fix_blocks):
+            body = block[f"{fix_type}_body"]
+            body_index = body["index"]
+            # 检查footnote应在body后或同位置
+            valid_footnotes = []
+            for footnote in block[f"{fix_type}_footnote_list"]:
+                if footnote["index"] >= body_index:
+                    valid_footnotes.append(footnote)
+                else:
+                    misplaced_footnotes.append((footnote, block_idx))
+            block[f"{fix_type}_footnote_list"] = valid_footnotes
+        # 第三步：重新分配不合规的footnote到合适的body
+        for footnote, original_block_idx in misplaced_footnotes:
+            footnote_index = footnote["index"]
+            best_block_idx = None
+            min_distance = float('inf')
+            # 寻找索引小于等于footnote_index的最近body
+            for idx, block in enumerate(need_fix_blocks):
+                body_index = block[f"{fix_type}_body"]["index"]
+                if body_index <= footnote_index and idx != original_block_idx:
+                    distance = footnote_index - body_index
+                    if distance < min_distance:
+                        min_distance = distance
+                        best_block_idx = idx
+            if best_block_idx is not None:
+                # 找到合适的body，添加到对应block的footnote_list
+                need_fix_blocks[best_block_idx][f"{fix_type}_footnote_list"].append(footnote)
+            else:
+                # 没找到合适的body，作为普通block处理
+                not_include_blocks.append(footnote)
+        # 第四步:将每个block的caption_list和footnote_list中不连续index的元素提出来作为普通block处理
+        for block in need_fix_blocks:
+            caption_list = block[f"{fix_type}_caption_list"]
+            footnote_list = block[f"{fix_type}_footnote_list"]
+            body_index = block[f"{fix_type}_body"]["index"]
+            # 处理caption_list (从body往前看,caption在body之前)
+            if caption_list:
+                # 按index降序排列,从最接近body的开始检查
+                caption_list.sort(key=lambda x: x["index"], reverse=True)
+                filtered_captions = [caption_list[0]]
+                for i in range(1, len(caption_list)):
+                    prev_index = caption_list[i - 1]["index"]
+                    curr_index = caption_list[i]["index"]
+                    # 检查是否连续
+                    if curr_index == prev_index - 1:
+                        filtered_captions.append(caption_list[i])
+                    else:
+                        # 检查gap中是否只有body_index
+                        gap_indices = set(range(curr_index + 1, prev_index))
+                        if gap_indices == {body_index}:
+                            # gap中只有body_index,不算真正的gap
+                            filtered_captions.append(caption_list[i])
+                        else:
+                            # 出现真正的gap,后续所有caption都作为普通block
+                            not_include_blocks.extend(caption_list[i:])
+                            break
+                # 恢复升序
+                filtered_captions.reverse()
+                block[f"{fix_type}_caption_list"] = filtered_captions
+            # 处理footnote_list (从body往后看,footnote在body之后)
+            if footnote_list:
+                # 按index升序排列,从最接近body的开始检查
+                footnote_list.sort(key=lambda x: x["index"])
+                filtered_footnotes = [footnote_list[0]]
+                for i in range(1, len(footnote_list)):
+                    # 检查是否与前一个footnote连续
+                    if footnote_list[i]["index"] == footnote_list[i - 1]["index"] + 1:
+                        filtered_footnotes.append(footnote_list[i])
+                    else:
+                        # 出现gap,后续所有footnote都作为普通block
+                        not_include_blocks.extend(footnote_list[i:])
+                        break
+                block[f"{fix_type}_footnote_list"] = filtered_footnotes
+    # 构建两层结构blocks
+    for block in need_fix_blocks:
+        body = block[f"{fix_type}_body"]
+        caption_list = block[f"{fix_type}_caption_list"]
+        footnote_list = block[f"{fix_type}_footnote_list"]
+        body["type"] = f"{fix_type}_body"
+        for caption in caption_list:
+            caption["type"] = f"{fix_type}_caption"
+            processed_indices.add(caption["index"])
+        for footnote in footnote_list:
+            footnote["type"] = f"{fix_type}_footnote"
+            processed_indices.add(footnote["index"])
+        processed_indices.add(body["index"])
+        two_layer_block = {
+            "type": fix_type,
+            "bbox": body["bbox"],
+            "blocks": [body],
+            "index": body["index"],
+        }
+        two_layer_block["blocks"].extend([*caption_list, *footnote_list])
+        # 对blocks按index排序
+        two_layer_block["blocks"].sort(key=lambda x: x["index"])
+        fixed_blocks.append(two_layer_block)
+    # 添加未处理的blocks
+    for block in blocks:
+        block.pop("type", None)
+        if block["index"] not in processed_indices and block not in not_include_blocks:
+            not_include_blocks.append(block)
+    return fixed_blocks, not_include_blocks
+def fix_list_blocks(list_blocks, text_blocks, ref_text_blocks):
+    for list_block in list_blocks:
+        list_block["blocks"] = []
+        if "lines" in list_block:
+            del list_block["lines"]
+    temp_text_blocks = text_blocks + ref_text_blocks
+    need_remove_blocks = []
+    for block in temp_text_blocks:
+        for list_block in list_blocks:
+            if calculate_overlap_area_in_bbox1_area_ratio(block["bbox"], list_block["bbox"]) >= 0.8:
+                list_block["blocks"].append(block)
+                need_remove_blocks.append(block)
+                break
+    for block in need_remove_blocks:
+        if block in text_blocks:
+            text_blocks.remove(block)
+        elif block in ref_text_blocks:
+            ref_text_blocks.remove(block)
+    # 移除blocks为空的list_block
+    list_blocks = [lb for lb in list_blocks if lb["blocks"]]
+    for list_block in list_blocks:
+        # 统计list_block["blocks"]中所有block的type，用众数作为list_block的sub_type
+        type_count = {}
+        for sub_block in list_block["blocks"]:
+            sub_block_type = sub_block["type"]
+            if sub_block_type not in type_count:
+                type_count[sub_block_type] = 0
+            type_count[sub_block_type] += 1
+        if type_count:
+            list_block["sub_type"] = max(type_count, key=type_count.get)
+        else:
+            list_block["sub_type"] = "unknown"
+    return list_blocks, text_blocks, ref_text_blocks

mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

mineru 2.6.8py3-none-any.whl → 2.7.0py3-none-any.whl