magic-pdf 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (30)
  1. magic_pdf/dict2md/ocr_mkcontent.py +24 -0
  2. magic_pdf/filter/__init__.py +1 -1
  3. magic_pdf/filter/pdf_classify_by_type.py +6 -4
  4. magic_pdf/filter/pdf_meta_scan.py +4 -4
  5. magic_pdf/libs/boxbase.py +5 -2
  6. magic_pdf/libs/draw_bbox.py +14 -2
  7. magic_pdf/libs/language.py +9 -0
  8. magic_pdf/libs/pdf_check.py +11 -1
  9. magic_pdf/libs/version.py +1 -1
  10. magic_pdf/model/batch_analyze.py +103 -99
  11. magic_pdf/model/doc_analyze_by_custom_model.py +87 -36
  12. magic_pdf/model/magic_model.py +161 -4
  13. magic_pdf/model/pdf_extract_kit.py +23 -28
  14. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +4 -3
  15. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +7 -3
  16. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +1 -1
  17. magic_pdf/model/sub_modules/model_init.py +34 -19
  18. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -26
  19. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +25 -6
  20. magic_pdf/pdf_parse_union_core_v2.py +176 -61
  21. magic_pdf/post_proc/llm_aided.py +55 -24
  22. magic_pdf/pre_proc/ocr_dict_merge.py +14 -2
  23. magic_pdf/pre_proc/ocr_span_list_modify.py +1 -1
  24. magic_pdf/resources/model_config/model_configs.yaml +2 -2
  25. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/METADATA +36 -19
  26. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/RECORD +30 -30
  27. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/LICENSE.md +0 -0
  28. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/WHEEL +0 -0
  29. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/entry_points.txt +0 -0
  30. {magic_pdf-1.0.1.dist-info → magic_pdf-1.2.0.dist-info}/top_level.txt +0 -0
magic_pdf/dict2md/ocr_mkcontent.py CHANGED
@@ -126,11 +126,35 @@ def detect_language(text):
         return 'empty'
 
 
+def full_to_half(text: str) -> str:
+    """Convert full-width characters to half-width characters using code point manipulation.
+
+    Args:
+        text: String containing full-width characters
+
+    Returns:
+        String with full-width characters converted to half-width
+    """
+    result = []
+    for char in text:
+        code = ord(char)
+        # Full-width ASCII variants (FF01-FF5E)
+        if 0xFF01 <= code <= 0xFF5E:
+            result.append(chr(code - 0xFEE0))  # Shift to ASCII range
+        # Full-width space
+        elif code == 0x3000:
+            result.append(' ')
+        else:
+            result.append(char)
+    return ''.join(result)
+
+
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:
         for span in line['spans']:
             if span['type'] in [ContentType.Text]:
+                span['content'] = full_to_half(span['content'])
                 block_text += span['content']
     block_lang = detect_lang(block_text)
 
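The new full_to_half pass normalizes full-width ASCII (U+FF01–U+FF5E, shifted down by 0xFEE0) and the ideographic space (U+3000) before span text is merged. A quick usage sketch, assuming the function is imported from the file shown above:

    from magic_pdf.dict2md.ocr_mkcontent import full_to_half

    print(full_to_half('ＡＢＣ　１２３！'))  # -> 'ABC 123!' (full-width letters, U+3000 space, digits, '！')
    print(full_to_half('plain ascii'))       # unchanged: code points outside the mapped ranges pass through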
magic_pdf/filter/__init__.py CHANGED
@@ -23,7 +23,7 @@ def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
         pdf_meta['image_info_per_page'],
         pdf_meta['text_len_per_page'],
         pdf_meta['imgs_per_page'],
-        pdf_meta['text_layout_per_page'],
+        # pdf_meta['text_layout_per_page'],
         pdf_meta['invalid_chars'],
     )
     if is_text_pdf:
magic_pdf/filter/pdf_classify_by_type.py CHANGED
@@ -305,7 +305,8 @@ def classify_by_img_narrow_strips(page_width, page_height, img_sz_list):
 
 
 def classify(total_page: int, page_width, page_height, img_sz_list: list, text_len_list: list, img_num_list: list,
-             text_layout_list: list, invalid_chars: bool):
+             # text_layout_list: list,
+             invalid_chars: bool):
     """
    Image and page dimensions here are in pts
    :param total_page:
@@ -321,7 +322,7 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         'by_text_len': classify_by_text_len(text_len_list, total_page),
         'by_avg_words': classify_by_avg_words(text_len_list),
         'by_img_num': classify_by_img_num(img_sz_list, img_num_list),
-        'by_text_layout': classify_by_text_layout(text_layout_list),
+        # 'by_text_layout': classify_by_text_layout(text_layout_list),
         'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list),
         'by_invalid_chars': invalid_chars,
     }
@@ -332,9 +333,10 @@ def classify(total_page: int, page_width, page_height, img_sz_list: list, text_l
         return False, results
     else:
         logger.warning(
-            f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']},"
+            f"OCR needed based on classification result, by_image_area: {results['by_image_area']},"
             f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']},"
-            f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']},"
+            # f" by_text_layout: {results['by_text_layout']},"
+            f" by_img_narrow_strips: {results['by_img_narrow_strips']},"
             f" by_invalid_chars: {results['by_invalid_chars']}",
             file=sys.stderr)  # This case helps quickly spot unusual PDFs so the classification algorithm can be tuned for them
         return False, results
magic_pdf/filter/pdf_meta_scan.py CHANGED
@@ -356,9 +356,9 @@ def pdf_meta_scan(pdf_bytes: bytes):
     # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}")
     text_len_per_page = get_pdf_textlen_per_page(doc)
     # logger.info(f"text_len_per_page: {text_len_per_page}")
-    text_layout_per_page = get_pdf_text_layout_per_page(doc)
+    # text_layout_per_page = get_pdf_text_layout_per_page(doc)
     # logger.info(f"text_layout_per_page: {text_layout_per_page}")
-    text_language = get_language(doc)
+    # text_language = get_language(doc)
     # logger.info(f"text_language: {text_language}")
     invalid_chars = check_invalid_chars(pdf_bytes)
     # logger.info(f"invalid_chars: {invalid_chars}")
@@ -372,8 +372,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         'page_height_pts': int(page_height_pts),
         'image_info_per_page': image_info_per_page,
         'text_len_per_page': text_len_per_page,
-        'text_layout_per_page': text_layout_per_page,
-        'text_language': text_language,
+        # 'text_layout_per_page': text_layout_per_page,
+        # 'text_language': text_language,
         # "svgs_per_page": svgs_per_page,
         'imgs_per_page': imgs_per_page,  # add per-page image-count list
         'junk_img_bojids': junk_img_bojids,  # add junk-image bojid list
magic_pdf/libs/boxbase.py CHANGED
@@ -185,10 +185,13 @@ def calculate_iou(bbox1, bbox2):
     bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
     bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
 
+    if any([bbox1_area == 0, bbox2_area == 0]):
+        return 0
+
     # Compute the intersection over union by taking the intersection area
     # and dividing it by the sum of both areas minus the intersection area
-    iou = intersection_area / float(bbox1_area + bbox2_area -
-                                    intersection_area)
+    iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area)
+
     return iou
 
 
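The new guard matters because two degenerate (zero-area) boxes would otherwise reach a 0/0 division. A standalone sketch of the guarded IoU; the intersection computation is restated here for illustration and assumed to match the part of calculate_iou outside this hunk:

    def iou(bbox1, bbox2):
        # Boxes are (x0, y0, x1, y1)
        x0, y0 = max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1])
        x1, y1 = min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
        intersection_area = max(0, x1 - x0) * max(0, y1 - y0)
        bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
        if bbox1_area == 0 or bbox2_area == 0:
            return 0  # zero-area box: without the guard, two empty boxes would divide 0 by 0
        return intersection_area / float(bbox1_area + bbox2_area - intersection_area)

    print(iou((0, 0, 10, 10), (5, 5, 15, 15)))  # 25 / 175 ≈ 0.143
    print(iou((0, 0, 10, 10), (3, 3, 3, 8)))    # 0: second box has zero width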
magic_pdf/libs/draw_bbox.py CHANGED
@@ -362,12 +362,24 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
     for page in pdf_info:
         page_line_list = []
         for block in page['preproc_blocks']:
-            if block['type'] in [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]:
+            if block['type'] in [BlockType.Text]:
                 for line in block['lines']:
                     bbox = line['bbox']
                     index = line['index']
                     page_line_list.append({'index': index, 'bbox': bbox})
-            if block['type'] in [BlockType.Image, BlockType.Table]:
+            elif block['type'] in [BlockType.Title, BlockType.InterlineEquation]:
+                if 'virtual_lines' in block:
+                    if len(block['virtual_lines']) > 0 and block['virtual_lines'][0].get('index', None) is not None:
+                        for line in block['virtual_lines']:
+                            bbox = line['bbox']
+                            index = line['index']
+                            page_line_list.append({'index': index, 'bbox': bbox})
+                else:
+                    for line in block['lines']:
+                        bbox = line['bbox']
+                        index = line['index']
+                        page_line_list.append({'index': index, 'bbox': bbox})
+            elif block['type'] in [BlockType.Image, BlockType.Table]:
                 for sub_block in block['blocks']:
                     if sub_block['type'] in [BlockType.ImageBody, BlockType.TableBody]:
                         if len(sub_block['virtual_lines']) > 0 and sub_block['virtual_lines'][0].get('index', None) is not None:
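The rewritten branch is easier to read as a selection rule: Text blocks keep their real lines; Title and InterlineEquation blocks now prefer indexed virtual_lines, fall back to real lines only when the virtual_lines key is absent, and contribute nothing when virtual_lines exists but carries no index. A hypothetical helper condensing that rule (pick_lines is not part of the package; illustration only):

    def pick_lines(block):
        if 'virtual_lines' in block:
            vlines = block['virtual_lines']
            if len(vlines) > 0 and vlines[0].get('index', None) is not None:
                return vlines  # indexed virtual lines win
            return []  # key present but unindexed: the hunk appends nothing in this case
        return block['lines']  # no virtual_lines key: fall back to real lines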
magic_pdf/libs/language.py CHANGED
@@ -12,12 +12,20 @@ if not os.getenv("FTLANG_CACHE"):
 from fast_langdetect import detect_language
 
 
+def remove_invalid_surrogates(text):
+    # Remove invalid UTF-16 surrogates
+    return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))
+
+
 def detect_lang(text: str) -> str:
 
     if len(text) == 0:
         return ""
 
     text = text.replace("\n", "")
+    text = remove_invalid_surrogates(text)
+
+    # print(text)
     try:
         lang_upper = detect_language(text)
     except:
@@ -37,3 +45,4 @@ if __name__ == '__main__':
     print(detect_lang("<html>This is a test</html>"))
     print(detect_lang("这个是中文测试。"))
     print(detect_lang("<html>这个是中文测试。</html>"))
+    print(detect_lang("〖\ud835\udc46\ud835〗这是个包含utf-16的中文测试"))
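The added test line is the motivation: lone UTF-16 surrogates (U+D800–U+DFFF) that survive PDF text extraction are not valid Unicode scalar values and cannot be UTF-8 encoded, which can break downstream detection, so detect_lang now strips them first. A standalone sketch of the same filter:

    # Mirrors remove_invalid_surrogates above; in a Python 3 str literal,
    # '\ud835' and '\udc46' remain two lone surrogate code points (they do not pair up)
    s = '〖\ud835\udc46\ud835〗这是个中文测试'
    cleaned = ''.join(c for c in s if not (0xD800 <= ord(c) <= 0xDFFF))
    print(cleaned)  # -> '〖〗这是个中文测试'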
magic_pdf/libs/pdf_check.py CHANGED
@@ -4,6 +4,7 @@ from loguru import logger
 import re
 from io import BytesIO
 from pdfminer.high_level import extract_text
+from pdfminer.layout import LAParams
 
 
 def calculate_sample_count(total_page: int):
@@ -41,7 +42,16 @@ def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
     sample_docs = extract_pages(src_pdf_bytes)
     sample_pdf_bytes = sample_docs.tobytes()
     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
+    laparams = LAParams(
+        line_overlap=0.5,
+        char_margin=2.0,
+        line_margin=0.5,
+        word_margin=0.1,
+        boxes_flow=None,
+        detect_vertical=False,
+        all_texts=False,
+    )
+    text = extract_text(pdf_file=sample_pdf_file_like_object, laparams=laparams)
     text = text.replace("\n", "")
     # logger.info(text)
     '''Garbled text extracted by pdfminer is characterized by (cid:xxx) tokens'''
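Two notes on this hunk. Passing boxes_flow=None disables pdfminer's advanced layout analysis (per the pdfminer.six LAParams documentation), which is cheaper when only the raw character stream matters. And the docstring points at the (cid:xxx) tokens pdfminer emits for glyphs it cannot map to Unicode; the counting logic lives outside this hunk, but it can be sketched roughly like this (looks_garbled and its threshold are hypothetical, not the package's actual code):

    import re

    def looks_garbled(text: str) -> bool:
        # pdfminer renders unmappable glyphs as '(cid:123)'-style tokens
        cid_count = len(re.findall(r'\(cid:\d+\)', text))
        return cid_count > 10  # illustrative threshold only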
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
-__version__ = "1.0.1"
+__version__ = "1.2.0"
magic_pdf/model/batch_analyze.py CHANGED
@@ -7,19 +7,19 @@ from loguru import logger
 from PIL import Image
 
 from magic_pdf.config.constants import MODEL_NAME
-from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE
-from magic_pdf.data.dataset import Dataset
-from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.config_reader import get_device
-from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
+# from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE
+# from magic_pdf.data.dataset import Dataset
+# from magic_pdf.libs.clean_memory import clean_memory
+# from magic_pdf.libs.config_reader import get_device
+# from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
 from magic_pdf.model.pdf_extract_kit import CustomPEKModel
 from magic_pdf.model.sub_modules.model_utils import (
     clean_vram, crop_img, get_res_list_from_layout_res)
 from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
     get_adjusted_mfdetrec_res, get_ocr_result_list)
-from magic_pdf.operators.models import InferenceResult
+# from magic_pdf.operators.models import InferenceResult
 
-YOLO_LAYOUT_BASE_BATCH_SIZE = 4
+YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
 MFR_BASE_BATCH_SIZE = 16
 
@@ -44,19 +44,20 @@ class BatchAnalyze:
         modified_images = []
         for image_index, image in enumerate(images):
             pil_img = Image.fromarray(image)
-            width, height = pil_img.size
-            if height > width:
-                input_res = {'poly': [0, 0, width, 0, width, height, 0, height]}
-                new_image, useful_list = crop_img(
-                    input_res, pil_img, crop_paste_x=width // 2, crop_paste_y=0
-                )
-                layout_images.append(new_image)
-                modified_images.append([image_index, useful_list])
-            else:
-                layout_images.append(pil_img)
+            # width, height = pil_img.size
+            # if height > width:
+            #     input_res = {'poly': [0, 0, width, 0, width, height, 0, height]}
+            #     new_image, useful_list = crop_img(
+            #         input_res, pil_img, crop_paste_x=width // 2, crop_paste_y=0
+            #     )
+            #     layout_images.append(new_image)
+            #     modified_images.append([image_index, useful_list])
+            # else:
+            layout_images.append(pil_img)
 
         images_layout_res += self.model.layout_model.batch_predict(
-            layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE
+            # layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE
+            layout_images, YOLO_LAYOUT_BASE_BATCH_SIZE
         )
 
         for image_index, useful_list in modified_images:
@@ -78,7 +79,8 @@ class BatchAnalyze:
         # Formula detection
         mfd_start_time = time.time()
         images_mfd_res = self.model.mfd_model.batch_predict(
-            images, self.batch_ratio * MFD_BASE_BATCH_SIZE
+            # images, self.batch_ratio * MFD_BASE_BATCH_SIZE
+            images, MFD_BASE_BATCH_SIZE
         )
         logger.info(
             f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}'
@@ -91,10 +93,12 @@ class BatchAnalyze:
             images,
             batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
         )
+        mfr_count = 0
         for image_index in range(len(images)):
             images_layout_res[image_index] += images_formula_list[image_index]
+            mfr_count += len(images_formula_list[image_index])
         logger.info(
-            f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {len(images)}'
+            f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {mfr_count}'
         )
 
         # Free VRAM
@@ -159,7 +163,7 @@ class BatchAnalyze:
             elif self.model.table_model_name == MODEL_NAME.TABLE_MASTER:
                 html_code = self.model.table_model.img2html(new_image)
             elif self.model.table_model_name == MODEL_NAME.RAPID_TABLE:
-                html_code, table_cell_bboxes, elapse = (
+                html_code, table_cell_bboxes, logic_points, elapse = (
                     self.model.table_model.predict(new_image)
                 )
                 run_time = time.time() - single_table_start_time
@@ -195,81 +199,81 @@ class BatchAnalyze:
         return images_layout_res
 
 
-def doc_batch_analyze(
-    dataset: Dataset,
-    ocr: bool = False,
-    show_log: bool = False,
-    start_page_id=0,
-    end_page_id=None,
-    lang=None,
-    layout_model=None,
-    formula_enable=None,
-    table_enable=None,
-    batch_ratio: int | None = None,
-) -> InferenceResult:
-    """Perform batch analysis on a document dataset.
-
-    Args:
-        dataset (Dataset): The dataset containing document pages to be analyzed.
-        ocr (bool, optional): Flag to enable OCR (Optical Character Recognition). Defaults to False.
-        show_log (bool, optional): Flag to enable logging. Defaults to False.
-        start_page_id (int, optional): The starting page ID for analysis. Defaults to 0.
-        end_page_id (int, optional): The ending page ID for analysis. Defaults to None, which means analyze till the last page.
-        lang (str, optional): Language for OCR. Defaults to None.
-        layout_model (optional): Layout model to be used for analysis. Defaults to None.
-        formula_enable (optional): Flag to enable formula detection. Defaults to None.
-        table_enable (optional): Flag to enable table detection. Defaults to None.
-        batch_ratio (int | None, optional): Ratio for batch processing. Defaults to None, which sets it to 1.
-
-    Raises:
-        CUDA_NOT_AVAILABLE: If CUDA is not available, raises an exception as batch analysis is not supported in CPU mode.
-
-    Returns:
-        InferenceResult: The result of the batch analysis containing the analyzed data and the dataset.
-    """
-
-    if not torch.cuda.is_available():
-        raise CUDA_NOT_AVAILABLE('batch analyze not support in CPU mode')
-
-    lang = None if lang == '' else lang
-    # TODO: auto detect batch size
-    batch_ratio = 1 if batch_ratio is None else batch_ratio
-    end_page_id = end_page_id if end_page_id else len(dataset)
-
-    model_manager = ModelSingleton()
-    custom_model: CustomPEKModel = model_manager.get_model(
-        ocr, show_log, lang, layout_model, formula_enable, table_enable
-    )
-    batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
-
-    model_json = []
-
-    # batch analyze
-    images = []
-    for index in range(len(dataset)):
-        if start_page_id <= index <= end_page_id:
-            page_data = dataset.get_page(index)
-            img_dict = page_data.get_image()
-            images.append(img_dict['img'])
-    analyze_result = batch_model(images)
-
-    for index in range(len(dataset)):
-        page_data = dataset.get_page(index)
-        img_dict = page_data.get_image()
-        page_width = img_dict['width']
-        page_height = img_dict['height']
-        if start_page_id <= index <= end_page_id:
-            result = analyze_result.pop(0)
-        else:
-            result = []
-
-        page_info = {'page_no': index, 'height': page_height, 'width': page_width}
-        page_dict = {'layout_dets': result, 'page_info': page_info}
-        model_json.append(page_dict)
-
-    # TODO: clean memory when gpu memory is not enough
-    clean_memory_start_time = time.time()
-    clean_memory(get_device())
-    logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}')
-
-    return InferenceResult(model_json, dataset)
+# def doc_batch_analyze(
+#     dataset: Dataset,
+#     ocr: bool = False,
+#     show_log: bool = False,
+#     start_page_id=0,
+#     end_page_id=None,
+#     lang=None,
+#     layout_model=None,
+#     formula_enable=None,
+#     table_enable=None,
+#     batch_ratio: int | None = None,
+# ) -> InferenceResult:
+#     """Perform batch analysis on a document dataset.
+#
+#     Args:
+#         dataset (Dataset): The dataset containing document pages to be analyzed.
+#         ocr (bool, optional): Flag to enable OCR (Optical Character Recognition). Defaults to False.
+#         show_log (bool, optional): Flag to enable logging. Defaults to False.
+#         start_page_id (int, optional): The starting page ID for analysis. Defaults to 0.
+#         end_page_id (int, optional): The ending page ID for analysis. Defaults to None, which means analyze till the last page.
+#         lang (str, optional): Language for OCR. Defaults to None.
+#         layout_model (optional): Layout model to be used for analysis. Defaults to None.
+#         formula_enable (optional): Flag to enable formula detection. Defaults to None.
+#         table_enable (optional): Flag to enable table detection. Defaults to None.
+#         batch_ratio (int | None, optional): Ratio for batch processing. Defaults to None, which sets it to 1.
+#
+#     Raises:
+#         CUDA_NOT_AVAILABLE: If CUDA is not available, raises an exception as batch analysis is not supported in CPU mode.
+#
+#     Returns:
+#         InferenceResult: The result of the batch analysis containing the analyzed data and the dataset.
+#     """
+#
+#     if not torch.cuda.is_available():
+#         raise CUDA_NOT_AVAILABLE('batch analyze not support in CPU mode')
+#
+#     lang = None if lang == '' else lang
+#     # TODO: auto detect batch size
+#     batch_ratio = 1 if batch_ratio is None else batch_ratio
+#     end_page_id = end_page_id if end_page_id else len(dataset)
+#
+#     model_manager = ModelSingleton()
+#     custom_model: CustomPEKModel = model_manager.get_model(
+#         ocr, show_log, lang, layout_model, formula_enable, table_enable
+#     )
+#     batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
+#
+#     model_json = []
+#
+#     # batch analyze
+#     images = []
+#     for index in range(len(dataset)):
+#         if start_page_id <= index <= end_page_id:
+#             page_data = dataset.get_page(index)
+#             img_dict = page_data.get_image()
+#             images.append(img_dict['img'])
+#     analyze_result = batch_model(images)
+#
+#     for index in range(len(dataset)):
+#         page_data = dataset.get_page(index)
+#         img_dict = page_data.get_image()
+#         page_width = img_dict['width']
+#         page_height = img_dict['height']
+#         if start_page_id <= index <= end_page_id:
+#             result = analyze_result.pop(0)
+#         else:
+#             result = []
+#
+#         page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+#         page_dict = {'layout_dets': result, 'page_info': page_info}
+#         model_json.append(page_dict)
+#
+#     # TODO: clean memory when gpu memory is not enough
+#     clean_memory_start_time = time.time()
+#     clean_memory(get_device())
+#     logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}')
+#
+#     return InferenceResult(model_json, dataset)
magic_pdf/model/doc_analyze_by_custom_model.py CHANGED
@@ -1,17 +1,22 @@
 import os
 import time
+import torch
 
+os.environ['FLAGS_npu_jit_compile'] = '0'  # disable paddle JIT compilation
+os.environ['FLAGS_use_stride_kernel'] = '0'
+os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'  # allow MPS ops to fall back
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations update checks
 # Disable paddle's signal handling
 import paddle
-from loguru import logger
-
 paddle.disable_signal_handler()
 
-os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # stop albumentations update checks
+from loguru import logger
+
+from magic_pdf.model.batch_analyze import BatchAnalyze
+from magic_pdf.model.sub_modules.model_utils import get_vram
 
 try:
     import torchtext
-
     if torchtext.__version__ >= '0.18.0':
         torchtext.disable_torchtext_deprecation_warning()
 except ImportError:
@@ -28,20 +33,6 @@ from magic_pdf.model.model_list import MODEL
 from magic_pdf.operators.models import InferenceResult
 
 
-def dict_compare(d1, d2):
-    return d1.items() == d2.items()
-
-
-def remove_duplicates_dicts(lst):
-    unique_dicts = []
-    for dict_item in lst:
-        if not any(
-            dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts
-        ):
-            unique_dicts.append(dict_item)
-    return unique_dicts
-
-
 class ModelSingleton:
     _instance = None
     _models = {}
@@ -154,33 +145,93 @@ def doc_analyze(
     table_enable=None,
 ) -> InferenceResult:
 
+    end_page_id = (
+        end_page_id
+        if end_page_id is not None and end_page_id >= 0
+        else len(dataset) - 1
+    )
+
     model_manager = ModelSingleton()
     custom_model = model_manager.get_model(
         ocr, show_log, lang, layout_model, formula_enable, table_enable
     )
 
+    batch_analyze = False
+    batch_ratio = 1
+    device = get_device()
+
+    npu_support = False
+    if str(device).startswith("npu"):
+        import torch_npu
+        if torch_npu.npu.is_available():
+            npu_support = True
+
+    if torch.cuda.is_available() and device != 'cpu' or npu_support:
+        gpu_memory = int(os.getenv("VIRTUAL_VRAM_SIZE", round(get_vram(device))))
+        if gpu_memory is not None and gpu_memory >= 8:
+
+            if gpu_memory >= 40:
+                batch_ratio = 32
+            elif gpu_memory >= 20:
+                batch_ratio = 16
+            elif gpu_memory >= 16:
+                batch_ratio = 8
+            elif gpu_memory >= 10:
+                batch_ratio = 4
+            else:
+                batch_ratio = 2
+
+            logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
+            batch_analyze = True
+
     model_json = []
     doc_analyze_start = time.time()
 
-    if end_page_id is None:
-        end_page_id = len(dataset)
-
-    for index in range(len(dataset)):
-        page_data = dataset.get_page(index)
-        img_dict = page_data.get_image()
-        img = img_dict['img']
-        page_width = img_dict['width']
-        page_height = img_dict['height']
-        if start_page_id <= index <= end_page_id:
-            page_start = time.time()
-            result = custom_model(img)
-            logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
-        else:
-            result = []
+    if batch_analyze:
+        # batch analyze
+        images = []
+        page_wh_list = []
+        for index in range(len(dataset)):
+            if start_page_id <= index <= end_page_id:
+                page_data = dataset.get_page(index)
+                img_dict = page_data.get_image()
+                images.append(img_dict['img'])
+                page_wh_list.append((img_dict['width'], img_dict['height']))
+        batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
+        analyze_result = batch_model(images)
+
+        for index in range(len(dataset)):
+            if start_page_id <= index <= end_page_id:
+                result = analyze_result.pop(0)
+                page_width, page_height = page_wh_list.pop(0)
+            else:
+                result = []
+                page_height = 0
+                page_width = 0
+
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
+            page_dict = {'layout_dets': result, 'page_info': page_info}
+            model_json.append(page_dict)
 
-        page_info = {'page_no': index, 'height': page_height, 'width': page_width}
-        page_dict = {'layout_dets': result, 'page_info': page_info}
-        model_json.append(page_dict)
+    else:
+        # single analyze
+
+        for index in range(len(dataset)):
+            page_data = dataset.get_page(index)
+            img_dict = page_data.get_image()
+            img = img_dict['img']
+            page_width = img_dict['width']
+            page_height = img_dict['height']
+            if start_page_id <= index <= end_page_id:
+                page_start = time.time()
+                result = custom_model(img)
+                logger.info(f'-----page_id : {index}, page total time: {round(time.time() - page_start, 2)}-----')
+            else:
+                result = []
+
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
+            page_dict = {'layout_dets': result, 'page_info': page_info}
+            model_json.append(page_dict)
 
     gc_start = time.time()
     clean_memory(get_device())
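The VRAM-to-batch_ratio ladder above drives the new auto-batching: batching switches on at 8 GB (VIRTUAL_VRAM_SIZE can override detection), the ratio scales the MFR batch size, and the earlier batch_analyze.py hunks pin the layout and MFD batch sizes back to their base values. Restated as a pure function for clarity (pick_batch_ratio is illustrative, not a package API):

    def pick_batch_ratio(gpu_memory_gb: int) -> int:
        # Mirrors the thresholds in doc_analyze above
        if gpu_memory_gb < 8:
            return 1   # batch analysis stays disabled
        if gpu_memory_gb >= 40:
            return 32
        if gpu_memory_gb >= 20:
            return 16
        if gpu_memory_gb >= 16:
            return 8
        if gpu_memory_gb >= 10:
            return 4
        return 2       # 8-9 GB

    assert pick_batch_ratio(24) == 16
    assert pick_batch_ratio(12) == 4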