PyPI - magic-pdf - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

magic_pdf/model/batch_analyze.py CHANGED Viewed

@@ -1,23 +1,14 @@
 import time
 import cv2
-import numpy as np
-import torch
 from loguru import logger
-from PIL import Image
+from tqdm import tqdm
 from magic_pdf.config.constants import MODEL_NAME
-# from magic_pdf.config.exceptions import CUDA_NOT_AVAILABLE
-# from magic_pdf.data.dataset import Dataset
-# from magic_pdf.libs.clean_memory import clean_memory
-# from magic_pdf.libs.config_reader import get_device
-# from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
-from magic_pdf.model.pdf_extract_kit import CustomPEKModel
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.model.sub_modules.model_utils import (
     clean_vram, crop_img, get_res_list_from_layout_res)
-from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import (
+from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
     get_adjusted_mfdetrec_res, get_ocr_result_list)
-# from magic_pdf.operators.models import InferenceResult
 YOLO_LAYOUT_BASE_BATCH_SIZE = 1
 MFD_BASE_BATCH_SIZE = 1
@@ -25,14 +16,25 @@ MFR_BASE_BATCH_SIZE = 16
 class BatchAnalyze:
-    def __init__(self, model: CustomPEKModel, batch_ratio: int):
-        self.model = model
+    def __init__(self, model_manager, batch_ratio: int, show_log, layout_model, formula_enable, table_enable):
+        self.model_manager = model_manager
         self.batch_ratio = batch_ratio
-    def __call__(self, images: list) -> list:
+        self.show_log = show_log
+        self.layout_model = layout_model
+        self.formula_enable = formula_enable
+        self.table_enable = table_enable
+    def __call__(self, images_with_extra_info: list) -> list:
+        if len(images_with_extra_info) == 0:
+            return []
         images_layout_res = []
         layout_start_time = time.time()
+        _, fst_ocr, fst_lang = images_with_extra_info[0]
+        self.model = self.model_manager.get_model(fst_ocr, self.show_log, fst_lang, self.layout_model, self.formula_enable, self.table_enable)
+        images = [image for image, _, _ in images_with_extra_info]
         if self.model.layout_model_name == MODEL_NAME.LAYOUTLMv3:
             # layoutlmv3
             for image in images:
@@ -41,39 +43,17 @@ class BatchAnalyze:
         elif self.model.layout_model_name == MODEL_NAME.DocLayout_YOLO:
             # doclayout_yolo
             layout_images = []
-            modified_images = []
             for image_index, image in enumerate(images):
-                pil_img = Image.fromarray(image)
-                # width, height = pil_img.size
-                # if height > width:
-                #     input_res = {'poly': [0, 0, width, 0, width, height, 0, height]}
-                #     new_image, useful_list = crop_img(
-                #         input_res, pil_img, crop_paste_x=width // 2, crop_paste_y=0
-                #     )
-                #     layout_images.append(new_image)
-                #     modified_images.append([image_index, useful_list])
-                # else:
-                layout_images.append(pil_img)
+                layout_images.append(image)
             images_layout_res += self.model.layout_model.batch_predict(
                 # layout_images, self.batch_ratio * YOLO_LAYOUT_BASE_BATCH_SIZE
                 layout_images, YOLO_LAYOUT_BASE_BATCH_SIZE
             )
-            for image_index, useful_list in modified_images:
-                for res in images_layout_res[image_index]:
-                    for i in range(len(res['poly'])):
-                        if i % 2 == 0:
-                            res['poly'][i] = (
-                                res['poly'][i] - useful_list[0] + useful_list[2]
-                            )
-                        else:
-                            res['poly'][i] = (
-                                res['poly'][i] - useful_list[1] + useful_list[3]
-                            )
-        logger.info(
-            f'layout time: {round(time.time() - layout_start_time, 2)}, image num: {len(images)}'
-        )
+        # logger.info(
+        #     f'layout time: {round(time.time() - layout_start_time, 2)}, image num: {len(images)}'
+        # )
         if self.model.apply_formula:
             # 公式检测
@@ -82,9 +62,9 @@ class BatchAnalyze:
                 # images, self.batch_ratio * MFD_BASE_BATCH_SIZE
                 images, MFD_BASE_BATCH_SIZE
             )
-            logger.info(
-                f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}'
-            )
+            # logger.info(
+            #     f'mfd time: {round(time.time() - mfd_start_time, 2)}, image num: {len(images)}'
+            # )
             # 公式识别
             mfr_start_time = time.time()
@@ -97,183 +77,177 @@ class BatchAnalyze:
             for image_index in range(len(images)):
                 images_layout_res[image_index] += images_formula_list[image_index]
                 mfr_count += len(images_formula_list[image_index])
-            logger.info(
-                f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {mfr_count}'
-            )
+            # logger.info(
+            #     f'mfr time: {round(time.time() - mfr_start_time, 2)}, image num: {mfr_count}'
+            # )
         # 清理显存
-        clean_vram(self.model.device, vram_threshold=8)
+        # clean_vram(self.model.device, vram_threshold=8)
-        ocr_time = 0
-        ocr_count = 0
-        table_time = 0
-        table_count = 0
-        # reference: magic_pdf/model/doc_analyze_by_custom_model.py:doc_analyze
+        ocr_res_list_all_page = []
+        table_res_list_all_page = []
         for index in range(len(images)):
+            _, ocr_enable, _lang = images_with_extra_info[index]
             layout_res = images_layout_res[index]
-            pil_img = Image.fromarray(images[index])
+            np_array_img = images[index]
             ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                 get_res_list_from_layout_res(layout_res)
             )
-            # ocr识别
-            ocr_start = time.time()
+            ocr_res_list_all_page.append({'ocr_res_list':ocr_res_list,
+                                          'lang':_lang,
+                                          'ocr_enable':ocr_enable,
+                                          'np_array_img':np_array_img,
+                                          'single_page_mfdetrec_res':single_page_mfdetrec_res,
+                                          'layout_res':layout_res,
+                                          })
+            for table_res in table_res_list:
+                table_img, _ = crop_img(table_res, np_array_img)
+                table_res_list_all_page.append({'table_res':table_res,
+                                                'lang':_lang,
+                                                'table_img':table_img,
+                                              })
+        # 文本框检测
+        det_start = time.time()
+        det_count = 0
+        # for ocr_res_list_dict in ocr_res_list_all_page:
+        for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
             # Process each area that requires OCR processing
-            for res in ocr_res_list:
+            _lang = ocr_res_list_dict['lang']
+            # Get OCR results for this language's images
+            atom_model_manager = AtomModelSingleton()
+            ocr_model = atom_model_manager.get_atom_model(
+                atom_model_name='ocr',
+                ocr_show_log=False,
+                det_db_box_thresh=0.3,
+                lang=_lang
+            )
+            for res in ocr_res_list_dict['ocr_res_list']:
                 new_image, useful_list = crop_img(
-                    res, pil_img, crop_paste_x=50, crop_paste_y=50
+                    res, ocr_res_list_dict['np_array_img'], crop_paste_x=50, crop_paste_y=50
                 )
                 adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
-                    single_page_mfdetrec_res, useful_list
+                    ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                 )
-                # OCR recognition
-                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
-                if self.model.apply_ocr:
-                    ocr_res = self.model.ocr_model.ocr(
-                        new_image, mfd_res=adjusted_mfdetrec_res
-                    )[0]
-                else:
-                    ocr_res = self.model.ocr_model.ocr(
-                        new_image, mfd_res=adjusted_mfdetrec_res, rec=False
-                    )[0]
+                # OCR-det
+                new_image = cv2.cvtColor(new_image, cv2.COLOR_RGB2BGR)
+                ocr_res = ocr_model.ocr(
+                    new_image, mfd_res=adjusted_mfdetrec_res, rec=False
+                )[0]
                 # Integration results
                 if ocr_res:
-                    ocr_result_list = get_ocr_result_list(ocr_res, useful_list)
-                    layout_res.extend(ocr_result_list)
-            ocr_time += time.time() - ocr_start
-            ocr_count += len(ocr_res_list)
-            # 表格识别 table recognition
-            if self.model.apply_table:
-                table_start = time.time()
-                for res in table_res_list:
-                    new_image, _ = crop_img(res, pil_img)
-                    single_table_start_time = time.time()
-                    html_code = None
-                    if self.model.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
-                        with torch.no_grad():
-                            table_result = self.model.table_model.predict(
-                                new_image, 'html'
-                            )
-                            if len(table_result) > 0:
-                                html_code = table_result[0]
-                    elif self.model.table_model_name == MODEL_NAME.TABLE_MASTER:
-                        html_code = self.model.table_model.img2html(new_image)
-                    elif self.model.table_model_name == MODEL_NAME.RAPID_TABLE:
-                        html_code, table_cell_bboxes, logic_points, elapse = (
-                            self.model.table_model.predict(new_image)
-                        )
-                    run_time = time.time() - single_table_start_time
-                    if run_time > self.model.table_max_time:
-                        logger.warning(
-                            f'table recognition processing exceeds max time {self.model.table_max_time}s'
-                        )
-                    # 判断是否返回正常
-                    if html_code:
-                        expected_ending = html_code.strip().endswith(
-                            '</html>'
-                        ) or html_code.strip().endswith('</table>')
-                        if expected_ending:
-                            res['html'] = html_code
-                        else:
-                            logger.warning(
-                                'table recognition processing fails, not found expected HTML table end'
-                            )
+                    ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
+                    ocr_res_list_dict['layout_res'].extend(ocr_result_list)
+            det_count += len(ocr_res_list_dict['ocr_res_list'])
+        # logger.info(f'ocr-det time: {round(time.time()-det_start, 2)}, image num: {det_count}')
+        # 表格识别 table recognition
+        if self.model.apply_table:
+            table_start = time.time()
+            table_count = 0
+            # for table_res_list_dict in table_res_list_all_page:
+            for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
+                _lang = table_res_dict['lang']
+                atom_model_manager = AtomModelSingleton()
+                ocr_engine = atom_model_manager.get_atom_model(
+                    atom_model_name='ocr',
+                    ocr_show_log=False,
+                    det_db_box_thresh=0.5,
+                    det_db_unclip_ratio=1.6,
+                    lang=_lang
+                )
+                table_model = atom_model_manager.get_atom_model(
+                    atom_model_name='table',
+                    table_model_name='rapid_table',
+                    table_model_path='',
+                    table_max_time=400,
+                    device='cpu',
+                    ocr_engine=ocr_engine,
+                    table_sub_model_name='slanet_plus'
+                )
+                html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])
+                # 判断是否返回正常
+                if html_code:
+                    expected_ending = html_code.strip().endswith(
+                        '</html>'
+                    ) or html_code.strip().endswith('</table>')
+                    if expected_ending:
+                        table_res_dict['table_res']['html'] = html_code
                     else:
                         logger.warning(
-                            'table recognition processing fails, not get html return'
+                            'table recognition processing fails, not found expected HTML table end'
                         )
-                table_time += time.time() - table_start
-                table_count += len(table_res_list)
+                else:
+                    logger.warning(
+                        'table recognition processing fails, not get html return'
+                    )
+            # logger.info(f'table time: {round(time.time() - table_start, 2)}, image num: {len(table_res_list_all_page)}')
-        if self.model.apply_ocr:
-            logger.info(f'ocr time: {round(ocr_time, 2)}, image num: {ocr_count}')
-        else:
-            logger.info(f'det time: {round(ocr_time, 2)}, image num: {ocr_count}')
-        if self.model.apply_table:
-            logger.info(f'table time: {round(table_time, 2)}, image num: {table_count}')
+        # Create dictionaries to store items by language
+        need_ocr_lists_by_lang = {}  # Dict of lists for each language
+        img_crop_lists_by_lang = {}  # Dict of lists for each language
-        return images_layout_res
+        for layout_res in images_layout_res:
+            for layout_res_item in layout_res:
+                if layout_res_item['category_id'] in [15]:
+                    if 'np_img' in layout_res_item and 'lang' in layout_res_item:
+                        lang = layout_res_item['lang']
+                        # Initialize lists for this language if not exist
+                        if lang not in need_ocr_lists_by_lang:
+                            need_ocr_lists_by_lang[lang] = []
+                            img_crop_lists_by_lang[lang] = []
+                        # Add to the appropriate language-specific lists
+                        need_ocr_lists_by_lang[lang].append(layout_res_item)
+                        img_crop_lists_by_lang[lang].append(layout_res_item['np_img'])
+                        # Remove the fields after adding to lists
+                        layout_res_item.pop('np_img')
+                        layout_res_item.pop('lang')
+        if len(img_crop_lists_by_lang) > 0:
+            # Process OCR by language
+            rec_time = 0
+            rec_start = time.time()
+            total_processed = 0
-# def doc_batch_analyze(
-#     dataset: Dataset,
-#     ocr: bool = False,
-#     show_log: bool = False,
-#     start_page_id=0,
-#     end_page_id=None,
-#     lang=None,
-#     layout_model=None,
-#     formula_enable=None,
-#     table_enable=None,
-#     batch_ratio: int | None = None,
-# ) -> InferenceResult:
-#     """Perform batch analysis on a document dataset.
-#
-#     Args:
-#         dataset (Dataset): The dataset containing document pages to be analyzed.
-#         ocr (bool, optional): Flag to enable OCR (Optical Character Recognition). Defaults to False.
-#         show_log (bool, optional): Flag to enable logging. Defaults to False.
-#         start_page_id (int, optional): The starting page ID for analysis. Defaults to 0.
-#         end_page_id (int, optional): The ending page ID for analysis. Defaults to None, which means analyze till the last page.
-#         lang (str, optional): Language for OCR. Defaults to None.
-#         layout_model (optional): Layout model to be used for analysis. Defaults to None.
-#         formula_enable (optional): Flag to enable formula detection. Defaults to None.
-#         table_enable (optional): Flag to enable table detection. Defaults to None.
-#         batch_ratio (int | None, optional): Ratio for batch processing. Defaults to None, which sets it to 1.
-#
-#     Raises:
-#         CUDA_NOT_AVAILABLE: If CUDA is not available, raises an exception as batch analysis is not supported in CPU mode.
-#
-#     Returns:
-#         InferenceResult: The result of the batch analysis containing the analyzed data and the dataset.
-#     """
-#
-#     if not torch.cuda.is_available():
-#         raise CUDA_NOT_AVAILABLE('batch analyze not support in CPU mode')
-#
-#     lang = None if lang == '' else lang
-#     # TODO: auto detect batch size
-#     batch_ratio = 1 if batch_ratio is None else batch_ratio
-#     end_page_id = end_page_id if end_page_id else len(dataset)
-#
-#     model_manager = ModelSingleton()
-#     custom_model: CustomPEKModel = model_manager.get_model(
-#         ocr, show_log, lang, layout_model, formula_enable, table_enable
-#     )
-#     batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
-#
-#     model_json = []
-#
-#     # batch analyze
-#     images = []
-#     for index in range(len(dataset)):
-#         if start_page_id <= index <= end_page_id:
-#             page_data = dataset.get_page(index)
-#             img_dict = page_data.get_image()
-#             images.append(img_dict['img'])
-#     analyze_result = batch_model(images)
-#
-#     for index in range(len(dataset)):
-#         page_data = dataset.get_page(index)
-#         img_dict = page_data.get_image()
-#         page_width = img_dict['width']
-#         page_height = img_dict['height']
-#         if start_page_id <= index <= end_page_id:
-#             result = analyze_result.pop(0)
-#         else:
-#             result = []
-#
-#         page_info = {'page_no': index, 'height': page_height, 'width': page_width}
-#         page_dict = {'layout_dets': result, 'page_info': page_info}
-#         model_json.append(page_dict)
-#
-#     # TODO: clean memory when gpu memory is not enough
-#     clean_memory_start_time = time.time()
-#     clean_memory(get_device())
-#     logger.info(f'clean memory time: {round(time.time() - clean_memory_start_time, 2)}')
-#
-#     return InferenceResult(model_json, dataset)
+            # Process each language separately
+            for lang, img_crop_list in img_crop_lists_by_lang.items():
+                if len(img_crop_list) > 0:
+                    # Get OCR results for this language's images
+                    atom_model_manager = AtomModelSingleton()
+                    ocr_model = atom_model_manager.get_atom_model(
+                        atom_model_name='ocr',
+                        ocr_show_log=False,
+                        det_db_box_thresh=0.3,
+                        lang=lang
+                    )
+                    ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
+                    # Verify we have matching counts
+                    assert len(ocr_res_list) == len(
+                        need_ocr_lists_by_lang[lang]), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_lists_by_lang[lang])} for lang: {lang}'
+                    # Process OCR results for this language
+                    for index, layout_res_item in enumerate(need_ocr_lists_by_lang[lang]):
+                        ocr_text, ocr_score = ocr_res_list[index]
+                        layout_res_item['text'] = ocr_text
+                        layout_res_item['score'] = float(round(ocr_score, 2))
+                    total_processed += len(img_crop_list)
+            rec_time += time.time() - rec_start
+            # logger.info(f'ocr-rec time: {round(rec_time, 2)}, total images processed: {total_processed}')
+        return images_layout_res

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl