PyPI - mineru - Versions diffs - 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl - Mend

mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

mineru/backend/hybrid/__init__.py +1 -0
mineru/backend/hybrid/hybrid_analyze.py +526 -0
mineru/backend/hybrid/hybrid_magic_model.py +617 -0
mineru/backend/hybrid/hybrid_model_output_to_middle_json.py +212 -0
mineru/backend/pipeline/batch_analyze.py +9 -1
mineru/backend/pipeline/model_init.py +96 -1
mineru/backend/pipeline/pipeline_analyze.py +6 -4
mineru/backend/pipeline/pipeline_middle_json_mkcontent.py +32 -41
mineru/backend/vlm/utils.py +3 -1
mineru/backend/vlm/vlm_analyze.py +12 -12
mineru/backend/vlm/vlm_magic_model.py +24 -89
mineru/backend/vlm/vlm_middle_json_mkcontent.py +112 -12
mineru/cli/client.py +17 -17
mineru/cli/common.py +170 -20
mineru/cli/fast_api.py +39 -13
mineru/cli/gradio_app.py +232 -206
mineru/model/mfd/yolo_v8.py +12 -6
mineru/model/mfr/unimernet/Unimernet.py +71 -3
mineru/resources/header.html +5 -1
mineru/utils/boxbase.py +23 -0
mineru/utils/char_utils.py +55 -0
mineru/utils/engine_utils.py +74 -0
mineru/utils/enum_class.py +18 -1
mineru/utils/magic_model_utils.py +85 -2
mineru/utils/span_pre_proc.py +5 -3
mineru/utils/table_merge.py +5 -21
mineru/version.py +1 -1
mineru-2.7.0.dist-info/METADATA +433 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/RECORD +33 -27
mineru-2.6.8.dist-info/METADATA +0 -954
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/WHEEL +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/entry_points.txt +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/licenses/LICENSE.md +0 -0
{mineru-2.6.8.dist-info → mineru-2.7.0.dist-info}/top_level.txt +0 -0

mineru/model/mfr/unimernet/Unimernet.py CHANGED Viewed

@@ -2,6 +2,8 @@ import torch
 from torch.utils.data import DataLoader, Dataset
 from tqdm import tqdm
+from mineru.utils.boxbase import calculate_iou
 class MathDataset(Dataset):
     def __init__(self, image_paths, transform=None):
@@ -31,11 +33,64 @@ class UnimernetModel(object):
             self.model = self.model.to(dtype=torch.float16)
         self.model.eval()
+    @staticmethod
+    def _filter_boxes_by_iou(xyxy, conf, cla, iou_threshold=0.8):
+        """Remove overlapping boxes, keeping the higher-confidence box of each pair.
+        Boxes are visited in their given order; whenever two still-kept boxes have
+        an IoU strictly greater than ``iou_threshold``, the one with the lower
+        confidence is discarded (on equal confidence the earlier box is kept).
+        Args:
+            xyxy: box coordinate tensor, one row per box
+            conf: confidence tensor, one entry per box
+            cla: class tensor, one entry per box
+            iou_threshold: IoU above which two boxes count as duplicates, default 0.8
+        Returns:
+            The filtered (xyxy, conf, cla) tensors; the input tensors themselves
+            are returned when no box was removed.
+        """
+        if len(xyxy) == 0:
+            return xyxy, conf, cla
+        # Convert to plain Python lists once, rather than per pair inside the
+        # quadratic comparison loop below.
+        boxes = xyxy.cpu().tolist()
+        scores = conf.cpu().tolist()
+        n = len(boxes)
+        keep = [True] * n
+        for i in range(n):
+            if not keep[i]:
+                continue
+            for j in range(i + 1, n):
+                if not keep[j]:
+                    continue
+                if calculate_iou(boxes[i], boxes[j]) > iou_threshold:
+                    # Keep whichever box has the higher confidence.
+                    if scores[i] >= scores[j]:
+                        keep[j] = False
+                    else:
+                        keep[i] = False
+                        break  # box i was dropped, stop comparing it
+        keep_indices = [i for i, kept in enumerate(keep) if kept]
+        if len(keep_indices) == n:
+            return xyxy, conf, cla
+        keep_indices = torch.tensor(keep_indices, dtype=torch.long)
+        return xyxy[keep_indices], conf[keep_indices], cla[keep_indices]
     def predict(self, mfd_res, image):
         formula_list = []
         mf_image_list = []
+        # 对检测框进行IOU去重，保留置信度较高的框
+        xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
+            mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
+        )
         for xyxy, conf, cla in zip(
-            mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()
+            xyxy_filtered.cpu(), conf_filtered.cpu(), cla_filtered.cpu()
         ):
             xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
             new_item = {
@@ -61,7 +116,13 @@ class UnimernetModel(object):
             res["latex"] = latex
         return formula_list
-    def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
+    def batch_predict(
+            self,
+            images_mfd_res: list,
+            images: list,
+            batch_size: int = 64,
+            interline_enable: bool = True,
+    ) -> list:
         images_formula_list = []
         mf_image_list = []
         backfill_list = []
@@ -73,9 +134,16 @@ class UnimernetModel(object):
             image = images[image_index]
             formula_list = []
+            # 对检测框进行IOU去重，保留置信度较高的框
+            xyxy_filtered, conf_filtered, cla_filtered = self._filter_boxes_by_iou(
+                mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
+            )
             for idx, (xyxy, conf, cla) in enumerate(zip(
-                    mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
+                    xyxy_filtered, conf_filtered, cla_filtered
             )):
+                if not interline_enable and cla.item() == 1:
+                    continue  # Skip interline regions if not enabled
                 xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
                 new_item = {
                     "category_id": 13 + int(cla.item()),

mineru/resources/header.html CHANGED Viewed

@@ -66,7 +66,11 @@
         color: #fafafa;
         opacity: 0.8;
       ">
-      A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
+        A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.<br>
+        If you found our project helpful, please give us a ⭐️ to support us!
+        <a href="https://github.com/opendatalab/MinerU" style="display: inline-flex; align-items: center;">
+          <img src="https://img.shields.io/github/stars/opendatalab/MinerU.svg" alt="stars" style="vertical-align: middle; position: relative; top: 5px;">
+        </a>
     </p>
     <style>
       .link-block {

mineru/utils/boxbase.py CHANGED Viewed

@@ -74,6 +74,29 @@ def bbox_distance(bbox1, bbox2):
     return 0.0
+def bbox_center_distance(bbox1, bbox2):
+    """Return the Euclidean distance between the center points of two boxes.
+    Args:
+        bbox1 (tuple): first box, given as (x1, y1, x2, y2)
+        bbox2 (tuple): second box, given as (x1, y1, x2, y2)
+    Returns:
+        float: distance between the two box centers
+    """
+    left_a, top_a, right_a, bottom_a = bbox1
+    left_b, top_b, right_b, bottom_b = bbox2
+    # Offset between the two centers along each axis.
+    delta_x = (left_a + right_a) / 2 - (left_b + right_b) / 2
+    delta_y = (top_a + bottom_a) / 2 - (top_b + bottom_b) / 2
+    return math.sqrt(delta_x ** 2 + delta_y ** 2)
 def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
     """通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例
     如果比例大于ratio，则返回小的那个bbox, 否则返回None."""

mineru/utils/char_utils.py ADDED Viewed

@@ -0,0 +1,55 @@
+#  Copyright (c) Opendatalab. All rights reserved.
+import re
+def is_hyphen_at_line_end(line):
+    """Tell whether a line ends with letters immediately followed by a hyphen.
+    Whitespace after the hyphen is tolerated.
+    Args:
+    line (str): The line of text to inspect.
+    Returns:
+    bool: True when the line ends with one or more ASCII letters and a hyphen, False otherwise.
+    """
+    # Letters, then a hyphen, then optional whitespace up to the end of the line.
+    match = re.search(r'[A-Za-z]+-\s*$', line)
+    return match is not None
+def full_to_half_exclude_marks(text: str) -> str:
+    """Convert full-width letters and digits to half-width, leaving all other characters as they are.
+    Args:
+        text: String that may contain full-width characters
+    Returns:
+        String with full-width A-Z, a-z and 0-9 replaced by their ASCII counterparts
+    """
+    def _narrow(char):
+        code = ord(char)
+        is_upper = 0xFF21 <= code <= 0xFF3A  # full-width A-Z
+        is_lower = 0xFF41 <= code <= 0xFF5A  # full-width a-z
+        is_digit = 0xFF10 <= code <= 0xFF19  # full-width 0-9
+        if is_upper or is_lower or is_digit:
+            return chr(code - 0xFEE0)  # shift into the ASCII range
+        return char
+    return ''.join(_narrow(char) for char in text)
+def full_to_half(text: str) -> str:
+    """Convert full-width letters, digits and punctuation to their half-width forms.
+    Args:
+        text: String that may contain full-width characters
+    Returns:
+        String in which every character in U+FF01..U+FF5E is replaced by its ASCII counterpart
+    """
+    # Code points FF01-FF5E sit exactly 0xFEE0 above their ASCII equivalents.
+    return ''.join(
+        chr(ord(char) - 0xFEE0) if 0xFF01 <= ord(char) <= 0xFF5E else char
+        for char in text
+    )

mineru/utils/engine_utils.py ADDED Viewed

@@ -0,0 +1,74 @@
+#  Copyright (c) Opendatalab. All rights reserved.
+from loguru import logger
+from mineru.utils.check_sys_env import is_mac_os_version_supported, is_windows_environment, is_mac_environment, \
+    is_linux_environment
+def get_vlm_engine(inference_engine: str, is_async: bool = False) -> str:
+    """
+    Resolve the VLM inference engine name, picking one automatically when asked.
+    Args:
+        inference_engine: an engine name, or 'auto' to choose based on the operating system
+        is_async: forwarded to the Linux selection, where it picks the async vllm variant
+    Returns:
+        The selected engine name after formatting by _format_engine_name
+    """
+    engine = inference_engine
+    if engine == 'auto':
+        # Platforms are probed in a fixed order: Windows, Linux, then macOS.
+        if is_windows_environment():
+            engine = _select_windows_engine()
+        elif is_linux_environment():
+            engine = _select_linux_engine(is_async)
+        elif is_mac_environment():
+            engine = _select_mac_engine()
+        else:
+            logger.warning("Unknown operating system, falling back to transformers")
+            engine = 'transformers'
+    engine_label = _format_engine_name(engine)
+    logger.info(f"Using {engine_label} as the inference engine for VLM.")
+    return engine_label
+def _select_windows_engine() -> str:
+    """Choose the engine on Windows: lmdeploy when it can be imported, otherwise transformers."""
+    try:
+        import lmdeploy
+    except ImportError:
+        return 'transformers'
+    else:
+        return 'lmdeploy'
+def _select_linux_engine(is_async: bool) -> str:
+    """Choose the engine on Linux: vllm first, then lmdeploy, then transformers."""
+    try:
+        import vllm
+    except ImportError:
+        pass  # vllm unavailable, try the next candidate
+    else:
+        return 'vllm-async' if is_async else 'vllm'
+    try:
+        import lmdeploy
+    except ImportError:
+        return 'transformers'
+    return 'lmdeploy'
+def _select_mac_engine() -> str:
+    """Choose the engine on macOS: mlx when mlx_vlm imports and the OS version check passes, otherwise transformers."""
+    try:
+        from mlx_vlm import load as mlx_load
+        # The version check stays inside the try so an ImportError it raises is handled the same way.
+        version_ok = is_mac_os_version_supported()
+    except ImportError:
+        return 'transformers'
+    return 'mlx' if version_ok else 'transformers'
+def _format_engine_name(engine: str) -> str:
+    """Normalize an engine name: append '-engine' to every name except 'transformers'."""
+    return engine if engine == 'transformers' else f"{engine}-engine"

mineru/utils/enum_class.py CHANGED Viewed

@@ -1,3 +1,5 @@
+from enum import Enum
 class BlockType:
     IMAGE = 'image'
     TABLE = 'table'
@@ -112,4 +114,19 @@ class SplitFlag:
 class ImageType:
     PIL = 'pil_img'
-    BASE64 = 'base64_img'
+    BASE64 = 'base64_img'
+class NotExtractType(Enum):
+    """Enumeration whose member values are the matching BlockType constants.
+    NOTE(review): the name suggests these are block types excluded from some
+    extraction step -- confirm against the callers.
+    """
+    TEXT = BlockType.TEXT
+    TITLE = BlockType.TITLE
+    HEADER = BlockType.HEADER
+    FOOTER = BlockType.FOOTER
+    PAGE_NUMBER = BlockType.PAGE_NUMBER
+    PAGE_FOOTNOTE = BlockType.PAGE_FOOTNOTE
+    REF_TEXT = BlockType.REF_TEXT
+    TABLE_CAPTION = BlockType.TABLE_CAPTION
+    IMAGE_CAPTION = BlockType.IMAGE_CAPTION
+    TABLE_FOOTNOTE = BlockType.TABLE_FOOTNOTE
+    IMAGE_FOOTNOTE = BlockType.IMAGE_FOOTNOTE
+    CODE_CAPTION = BlockType.CODE_CAPTION

mineru/utils/magic_model_utils.py CHANGED Viewed

@@ -2,7 +2,7 @@
 包含两个MagicModel类中重复使用的方法和逻辑
 """
 from typing import List, Dict, Any, Callable
-from mineru.utils.boxbase import bbox_distance, is_in
+from mineru.utils.boxbase import bbox_distance, bbox_center_distance, is_in
 def reduct_overlap(bboxes: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
@@ -165,4 +165,87 @@ def tie_up_category_by_distance_v3(
             }
         )
-    return ret
+    return ret
+def tie_up_category_by_index(
+        get_subjects_func: Callable,
+        get_objects_func: Callable,
+        extract_subject_func: Callable = None,
+        extract_object_func: Callable = None
+):
+    """
+    Associate object items with subject items by comparing their "index" values.
+    Each object is attached to the subject whose index is closest to its own;
+    when several subjects are equally close, the one with the smallest bbox
+    center distance to the object is chosen.
+    Args:
+        get_subjects_func: callable returning the subject items
+        get_objects_func: callable returning the object items
+        extract_subject_func: optional callable mapping a subject to the value stored
+            under "sub_bbox" (the subject itself when omitted)
+        extract_object_func: optional callable mapping an object to the value appended
+            to "obj_bboxes" (the object itself when omitted)
+    Returns:
+        A list with one dict per subject ("sub_bbox", "obj_bboxes", "sub_idx"),
+        ordered by the subject's position in the subject list
+    """
+    subjects = get_subjects_func()
+    objects = get_objects_func()
+    # Fall back to identity when no custom extractor is supplied.
+    subject_view = extract_subject_func if extract_subject_func is not None else (lambda x: x)
+    object_view = extract_object_func if extract_object_func is not None else (lambda x: x)
+    # One association record per subject, keyed by its position in the list.
+    associations = {
+        pos: {
+            "sub_bbox": subject_view(subject),
+            "obj_bboxes": [],
+            "sub_idx": pos,
+        }
+        for pos, subject in enumerate(subjects)
+    }
+    for obj in objects:
+        if len(subjects) == 0:
+            # Nothing to attach this object to.
+            continue
+        obj_index = obj["index"]
+        smallest_gap = float("inf")
+        candidates = []
+        # Collect every subject that shares the smallest index gap.
+        for pos, subject in enumerate(subjects):
+            gap = abs(obj_index - subject["index"])
+            if gap < smallest_gap:
+                smallest_gap = gap
+                candidates = [pos]
+            elif gap == smallest_gap:
+                candidates.append(pos)
+        winner = candidates[0]
+        if len(candidates) > 1:
+            # Several subjects are equally close by index: break the tie on center distance.
+            closest = float("inf")
+            for pos in candidates:
+                dist = bbox_center_distance(obj["bbox"], subjects[pos]["bbox"])
+                if dist < closest:
+                    closest = dist
+                    winner = pos
+        associations[winner]["obj_bboxes"].append(object_view(obj))
+    # Return the records as a list ordered by subject position.
+    return sorted(associations.values(), key=lambda entry: entry["sub_idx"])

mineru/utils/span_pre_proc.py CHANGED Viewed

@@ -1,5 +1,6 @@
 # Copyright (c) Opendatalab. All rights reserved.
 import collections
+import math
 import re
 import statistics
@@ -128,8 +129,9 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
     page_all_lines = []
     for block in page_dict['blocks']:
         for line in block['lines']:
-            if 0 < abs(line['rotation']) < 90:
-                # 旋转角度在0-90度之间的行，直接跳过
+            rotation_degrees = math.degrees(line['rotation'])
+            # 旋转角度不为0, 90, 180, 270的行，直接跳过（rotation_degrees的值可能不为整数）
+            if not any(abs(rotation_degrees - angle) < 0.1 for angle in [0, 90, 180, 270]):
                 continue
             page_all_lines.append(line)
             for span in line['spans']:
@@ -159,7 +161,7 @@ def txt_spans_extract(pdf_page, spans, pil_img, scale, all_bboxes, all_discarded
                 if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]:
                     continue
                 if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
-                    if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3:
+                    if span['height'] > median_span_height * 2.3 and span['height'] > span['width'] * 2.3:
                         vertical_spans.append(span)
                     elif block in all_bboxes:
                         useful_spans.append(span)

mineru/utils/table_merge.py CHANGED Viewed

@@ -1,35 +1,17 @@
 # Copyright (c) Opendatalab. All rights reserved.
+from copy import deepcopy
 from loguru import logger
 from bs4 import BeautifulSoup
 from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
+from mineru.utils.char_utils import full_to_half
 from mineru.utils.enum_class import BlockType, SplitFlag
 CONTINUATION_MARKERS = ["(续)", "(续表)", "(continued)", "(cont.)"]
-def full_to_half(text: str) -> str:
-    """Convert full-width characters to half-width characters using code point manipulation.
-    Args:
-        text: String containing full-width characters
-    Returns:
-        String with full-width characters converted to half-width
-    """
-    result = []
-    for char in text:
-        code = ord(char)
-        # Full-width letters, numbers and punctuation (FF01-FF5E)
-        if 0xFF01 <= code <= 0xFF5E:
-            result.append(chr(code - 0xFEE0))  # Shift to ASCII range
-        else:
-            result.append(char)
-    return ''.join(result)
 def calculate_table_total_columns(soup):
     """计算表格的总列数，通过分析整个表格结构来处理rowspan和colspan
@@ -296,6 +278,8 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
         current_cols: 当前总列数
         reference_row: 参考行对象
     """
+    reference_row_copy = deepcopy(reference_row)
     for i in range(start_idx, end_idx):
         row = rows[i]
         cells = row.find_all(["td", "th"])
@@ -307,7 +291,7 @@ def adjust_table_rows_colspan(rows, start_idx, end_idx,
             continue
         # 检查是否与参考行结构匹配
-        if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row):
+        if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row_copy):
             # 尝试应用参考结构
             if len(cells) <= len(reference_structure):
                 for j, cell in enumerate(cells):

mineru/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "2.6.8"
1	+ __version__ = "2.7.0"

mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl

mineru 2.6.8__py3-none-any.whl → 2.7.0__py3-none-any.whl