PyPI - mineru - Versions diffs - 2.6.1__py3-none-any.whl → 2.6.3__py3-none-any.whl - Mend

mineru 2.6.1__py3-none-any.whl → 2.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

mineru/backend/pipeline/batch_analyze.py CHANGED Viewed

@@ -281,28 +281,20 @@ class BatchAnalyze:
                 # 按分辨率分组并同时完成padding
                 # RESOLUTION_GROUP_STRIDE = 32
-                RESOLUTION_GROUP_STRIDE = 64  # 定义分辨率分组的步进值
+                RESOLUTION_GROUP_STRIDE = 64
                 resolution_groups = defaultdict(list)
                 for crop_info in lang_crop_list:
                     cropped_img = crop_info[0]
                     h, w = cropped_img.shape[:2]
-                    # 使用更大的分组容差，减少分组数量
-                    # 将尺寸标准化到32的倍数
-                    normalized_h = ((h + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE  # 向上取整到32的倍数
-                    normalized_w = ((w + RESOLUTION_GROUP_STRIDE) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
-                    group_key = (normalized_h, normalized_w)
+                    # 直接计算目标尺寸并用作分组键
+                    target_h = ((h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
+                    target_w = ((w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
+                    group_key = (target_h, target_w)
                     resolution_groups[group_key].append(crop_info)
                 # 对每个分辨率组进行批处理
-                for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
-                    # 计算目标尺寸（组内最大尺寸，向上取整到32的倍数）
-                    max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
-                    max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
-                    target_h = ((max_h + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
-                    target_w = ((max_w + RESOLUTION_GROUP_STRIDE - 1) // RESOLUTION_GROUP_STRIDE) * RESOLUTION_GROUP_STRIDE
+                for (target_h, target_w), group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):
                     # 对所有图像进行padding到统一尺寸
                     batch_images = []
                     for crop_info in group_crops:
@@ -310,49 +302,34 @@ class BatchAnalyze:
                         h, w = img.shape[:2]
                         # 创建目标尺寸的白色背景
                         padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
-                        # 将原图像粘贴到左上角
                         padded_img[:h, :w] = img
                         batch_images.append(padded_img)
                     # 批处理检测
-                    det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)  # 增加批处理大小
-                    # logger.debug(f"OCR-det batch: {det_batch_size} images, target size: {target_h}x{target_w}")
+                    det_batch_size = min(len(batch_images), self.batch_ratio * OCR_DET_BASE_BATCH_SIZE)
                     batch_results = ocr_model.text_detector.batch_predict(batch_images, det_batch_size)
                     # 处理批处理结果
-                    for i, (crop_info, (dt_boxes, elapse)) in enumerate(zip(group_crops, batch_results)):
+                    for crop_info, (dt_boxes, _) in zip(group_crops, batch_results):
                         bgr_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info
                         if dt_boxes is not None and len(dt_boxes) > 0:
-                            # 直接应用原始OCR流程中的关键处理步骤
-                            # 1. 排序检测框
-                            if len(dt_boxes) > 0:
-                                dt_boxes_sorted = sorted_boxes(dt_boxes)
-                            else:
-                                dt_boxes_sorted = []
-                            # 2. 合并相邻检测框
-                            if dt_boxes_sorted:
-                                dt_boxes_merged = merge_det_boxes(dt_boxes_sorted)
-                            else:
-                                dt_boxes_merged = []
-                            # 3. 根据公式位置更新检测框（关键步骤！）
-                            if dt_boxes_merged and adjusted_mfdetrec_res:
-                                dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
-                            else:
-                                dt_boxes_final = dt_boxes_merged
-                            # 构造OCR结果格式
-                            ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
-                            if ocr_res:
+                            # 处理检测框
+                            dt_boxes_sorted = sorted_boxes(dt_boxes)
+                            dt_boxes_merged = merge_det_boxes(dt_boxes_sorted) if dt_boxes_sorted else []
+                            # 根据公式位置更新检测框
+                            dt_boxes_final = (update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
+                                              if dt_boxes_merged and adjusted_mfdetrec_res
+                                              else dt_boxes_merged)
+                            if dt_boxes_final:
+                                ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]
                                 ocr_result_list = get_ocr_result_list(
                                     ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], bgr_image, _lang
                                 )
                                 ocr_res_list_dict['layout_res'].extend(ocr_result_list)
         else:
             # 原始单张处理模式
             for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):

mineru/backend/pipeline/model_init.py CHANGED Viewed

@@ -8,7 +8,7 @@ from ...model.layout.doclayoutyolo import DocLayoutYOLOModel
 from ...model.mfd.yolo_v8 import YOLOv8MFDModel
 from ...model.mfr.unimernet.Unimernet import UnimernetModel
 from ...model.mfr.pp_formulanet_plus_m.predict_formula import FormulaRecognizer
-from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
+from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
 from ...model.ori_cls.paddle_ori_cls import PaddleOrientationClsModel
 from ...model.table.cls.paddle_table_cls import PaddleTableClsModel
 # from ...model.table.rec.RapidTable import RapidTableModel

mineru/backend/pipeline/model_json_to_middle_json.py CHANGED Viewed

@@ -148,7 +148,7 @@ def page_model_info_to_page_info(page_model_info, image_dict, page, image_writer
     fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans)
     """如果当前页面没有有效的bbox则跳过"""
-    if len(all_bboxes) == 0:
+    if len(all_bboxes) == 0 and len(fix_discarded_blocks) == 0:
         return None
     """对image/table/interline_equation截图"""

mineru/backend/pipeline/pipeline_middle_json_mkcontent.py CHANGED Viewed

@@ -191,11 +191,20 @@ def merge_para_with_text(para_block):
 def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
     para_type = para_block['type']
     para_content = {}
-    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
+    if para_type in [
+        BlockType.TEXT,
+        BlockType.LIST,
+        BlockType.INDEX,
+    ]:
         para_content = {
             'type': ContentType.TEXT,
             'text': merge_para_with_text(para_block),
         }
+    elif para_type == BlockType.DISCARDED:
+        para_content = {
+            'type': para_type,
+            'text': merge_para_with_text(para_block),
+        }
     elif para_type == BlockType.TITLE:
         para_content = {
             'type': ContentType.TEXT,
@@ -268,15 +277,19 @@ def union_make(pdf_info_dict: list,
     output_content = []
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
+        paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
-        if not paras_of_layout:
-            continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+            if not paras_of_layout:
+                continue
             page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout:
+            para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
+            if not para_blocks:
+                continue
+            for para_block in para_blocks:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 if para_content:
                     output_content.append(para_content)

mineru/backend/vlm/vlm_analyze.py CHANGED Viewed

@@ -8,6 +8,7 @@ from .utils import enable_custom_logits_processors, set_default_gpu_memory_utili
 from .model_output_to_middle_json import result_to_middle_json
 from ...data.data_reader_writer import DataWriter
 from mineru.utils.pdf_image_tools import load_images_from_pdf
+from ...utils.check_mac_env import is_mac_os_version_supported
 from ...utils.config_reader import get_device
 from ...utils.enum_class import ImageType
@@ -47,7 +48,7 @@ class ModelSingleton:
             for param in ["batch_size", "max_concurrency", "http_timeout"]:
                 if param in kwargs:
                     del kwargs[param]
-            if backend in ['transformers', 'vllm-engine', "vllm-async-engine"] and not model_path:
+            if backend in ['transformers', 'vllm-engine', "vllm-async-engine", "mlx-engine"] and not model_path:
                 model_path = auto_download_and_get_model_root_path("/","vlm")
                 if backend == "transformers":
                     try:
@@ -75,6 +76,15 @@ class ModelSingleton:
                     )
                     if batch_size == 0:
                         batch_size = set_default_batch_size()
+                elif backend == "mlx-engine":
+                    mlx_supported = is_mac_os_version_supported()
+                    if not mlx_supported:
+                        raise EnvironmentError("mlx-engine backend is only supported on macOS 13.5+ with Apple Silicon.")
+                    try:
+                        from mlx_vlm import load as mlx_load
+                    except ImportError:
+                        raise ImportError("Please install mlx-vlm to use the mlx-engine backend.")
+                    model, processor = mlx_load(model_path)
                 else:
                     if os.getenv('OMP_NUM_THREADS') is None:
                         os.environ["OMP_NUM_THREADS"] = "1"

mineru/backend/vlm/vlm_middle_json_mkcontent.py CHANGED Viewed

@@ -248,13 +248,16 @@ def union_make(pdf_info_dict: list,
         paras_of_discarded = page_info.get('discarded_blocks')
         page_idx = page_info.get('page_idx')
         page_size = page_info.get('page_size')
-        if not paras_of_layout:
-            continue
         if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
+            if not paras_of_layout:
+                continue
             page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.CONTENT_LIST:
-            for para_block in paras_of_layout+paras_of_discarded:
+            para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
+            if not para_blocks:
+                continue
+            for para_block in para_blocks:
                 para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
                 output_content.append(para_content)

mineru/cli/client.py CHANGED Viewed

@@ -4,6 +4,7 @@ import click
 from pathlib import Path
 from loguru import logger
+from mineru.utils.check_mac_env import is_mac_os_version_supported
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.config_reader import get_device
 from mineru.utils.guess_suffix_or_lang import guess_suffix_by_path
@@ -11,6 +12,11 @@ from mineru.utils.model_utils import get_vram
 from ..version import __version__
 from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
+backends = ['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']
+if is_mac_os_version_supported():
+    backends.append("vlm-mlx-engine")
 @click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
 @click.pass_context
 @click.version_option(__version__,
@@ -38,25 +44,28 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     '--method',
     'method',
     type=click.Choice(['auto', 'txt', 'ocr']),
-    help="""the method for parsing pdf:
-    auto: Automatically determine the method based on the file type.
-    txt: Use text extraction method.
-    ocr: Use OCR method for image-based PDFs.
+    help="""\b
+    the method for parsing pdf:
+      auto: Automatically determine the method based on the file type.
+      txt: Use text extraction method.
+      ocr: Use OCR method for image-based PDFs.
     Without method specified, 'auto' will be used by default.
-    Adapted only for the case where the backend is set to "pipeline".""",
+    Adapted only for the case where the backend is set to 'pipeline'.""",
     default='auto',
 )
 @click.option(
     '-b',
     '--backend',
     'backend',
-    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-vllm-engine', 'vlm-http-client']),
-    help="""the backend for parsing pdf:
-    pipeline: More general.
-    vlm-transformers: More general.
-    vlm-vllm-engine: Faster(engine).
-    vlm-http-client: Faster(client).
-    without method specified, pipeline will be used by default.""",
+    type=click.Choice(backends),
+    help="""\b
+    the backend for parsing pdf:
+      pipeline: More general.
+      vlm-transformers: More general, but slower.
+      vlm-mlx-engine: Faster than transformers.
+      vlm-vllm-engine: Faster(engine).
+      vlm-http-client: Faster(client).
+    Without method specified, pipeline will be used by default.""",
     default='pipeline',
 )
 @click.option(
@@ -66,7 +75,7 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka', 'th', 'el',
                        'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
     help="""
-    Input the languages in the pdf (if known) to improve OCR accuracy.  Optional.
+    Input the languages in the pdf (if known) to improve OCR accuracy.
     Without languages specified, 'ch' will be used by default.
     Adapted only for the case where the backend is set to "pipeline".
     """,
@@ -119,7 +128,8 @@ from .common import do_parse, read_fn, pdf_suffixes, image_suffixes
     '--device',
     'device_mode',
     type=str,
-    help='Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps". Adapted only for the case where the backend is set to "pipeline". ',
+    help="""Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps".
+         Adapted only for the case where the backend is set to "pipeline" and "vlm-transformers". """,
     default=None,
 )
 @click.option(

mineru/cli/gradio_app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from gradio_pdf import PDF
 from loguru import logger
 from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes
+from mineru.utils.check_mac_env import is_mac_os_version_supported
 from mineru.utils.cli_parser import arg_parse
 from mineru.utils.hash_utils import str_sha256
@@ -273,7 +274,7 @@ def to_pdf(file_path):
 # 更新界面函数
 def update_interface(backend_choice):
-    if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine"]:
+    if backend_choice in ["vlm-transformers", "vlm-vllm-async-engine", "vlm-mlx-engine"]:
         return gr.update(visible=False), gr.update(visible=False)
     elif backend_choice in ["vlm-http-client"]:
         return gr.update(visible=True), gr.update(visible=False)
@@ -381,6 +382,8 @@ def main(ctx,
                         preferred_option = "vlm-vllm-async-engine"
                     else:
                         drop_list = ["pipeline", "vlm-transformers", "vlm-http-client"]
+                        if is_mac_os_version_supported():
+                            drop_list.append("vlm-mlx-engine")
                         preferred_option = "pipeline"
                     backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option)
                 with gr.Row(visible=False) as client_options:

mineru/cli/models_download.py CHANGED Viewed

@@ -21,7 +21,7 @@ def download_and_modify_json(url, local_filename, modifications):
     if os.path.exists(local_filename):
         data = json.load(open(local_filename))
         config_version = data.get('config_version', '0.0.0')
-        if config_version < '1.3.0':
+        if config_version < '1.3.1':
             data = download_json(url)
     else:
         data = download_json(url)

mineru/model/ocr/{paddleocr2pytorch/pytorch_paddle.py → pytorch_paddle.py} RENAMED Viewed

@@ -134,7 +134,7 @@ def get_model_params(lang, config):
         raise Exception (f'Language {lang} not supported')
-root_dir = os.path.join(Path(__file__).resolve().parent.parent.parent, 'utils')
+root_dir = os.path.join(Path(__file__).resolve().parent.parent, 'utils')
 class PytorchPaddleOCR(TextSystem):

mineru/model/table/rec/RapidTable.py CHANGED Viewed

@@ -11,7 +11,7 @@ from rapid_table import ModelType, RapidTable, RapidTableInput
 from rapid_table.utils import RapidTableOutput
 from tqdm import tqdm
-from mineru.model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
+from mineru.model.ocr.pytorch_paddle import PytorchPaddleOCR
 from mineru.utils.enum_class import ModelPath
 from mineru.utils.models_download_utils import auto_download_and_get_model_root_path

mineru 2.6.1__py3-none-any.whl → 2.6.3__py3-none-any.whl

mineru 2.6.1__py3-none-any.whl → 2.6.3__py3-none-any.whl