magic-pdf 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. magic_pdf/config/__init__.py +0 -0
  2. magic_pdf/config/enums.py +7 -0
  3. magic_pdf/config/exceptions.py +32 -0
  4. magic_pdf/data/__init__.py +0 -0
  5. magic_pdf/data/data_reader_writer/__init__.py +12 -0
  6. magic_pdf/data/data_reader_writer/base.py +51 -0
  7. magic_pdf/data/data_reader_writer/filebase.py +59 -0
  8. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +137 -0
  9. magic_pdf/data/data_reader_writer/s3.py +69 -0
  10. magic_pdf/data/dataset.py +194 -0
  11. magic_pdf/data/io/__init__.py +0 -0
  12. magic_pdf/data/io/base.py +42 -0
  13. magic_pdf/data/io/http.py +37 -0
  14. magic_pdf/data/io/s3.py +114 -0
  15. magic_pdf/data/read_api.py +95 -0
  16. magic_pdf/data/schemas.py +15 -0
  17. magic_pdf/data/utils.py +32 -0
  18. magic_pdf/dict2md/ocr_mkcontent.py +74 -234
  19. magic_pdf/libs/Constants.py +21 -8
  20. magic_pdf/libs/MakeContentConfig.py +1 -0
  21. magic_pdf/libs/boxbase.py +35 -0
  22. magic_pdf/libs/clean_memory.py +10 -0
  23. magic_pdf/libs/config_reader.py +53 -23
  24. magic_pdf/libs/draw_bbox.py +150 -65
  25. magic_pdf/libs/ocr_content_type.py +2 -0
  26. magic_pdf/libs/version.py +1 -1
  27. magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
  28. magic_pdf/model/magic_model.py +331 -15
  29. magic_pdf/model/pdf_extract_kit.py +164 -80
  30. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +8 -1
  31. magic_pdf/model/ppTableModel.py +2 -2
  32. magic_pdf/model/pp_structure_v2.py +5 -2
  33. magic_pdf/model/v3/__init__.py +0 -0
  34. magic_pdf/model/v3/helpers.py +125 -0
  35. magic_pdf/para/para_split_v3.py +296 -0
  36. magic_pdf/pdf_parse_by_ocr.py +6 -3
  37. magic_pdf/pdf_parse_by_txt.py +6 -3
  38. magic_pdf/pdf_parse_union_core_v2.py +644 -0
  39. magic_pdf/pipe/AbsPipe.py +5 -1
  40. magic_pdf/pipe/OCRPipe.py +10 -4
  41. magic_pdf/pipe/TXTPipe.py +10 -4
  42. magic_pdf/pipe/UNIPipe.py +16 -7
  43. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
  44. magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
  45. magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
  46. magic_pdf/resources/model_config/model_configs.yaml +5 -13
  47. magic_pdf/tools/cli.py +14 -1
  48. magic_pdf/tools/common.py +18 -8
  49. magic_pdf/user_api.py +25 -6
  50. magic_pdf/utils/__init__.py +0 -0
  51. magic_pdf/utils/annotations.py +11 -0
  52. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/LICENSE.md +1 -0
  53. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/METADATA +120 -75
  54. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/RECORD +57 -33
  55. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/WHEEL +0 -0
  56. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/entry_points.txt +0 -0
  57. {magic_pdf-0.8.1.dist-info → magic_pdf-0.9.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/pdf_extract_kit.py

@@ -1,11 +1,14 @@
 from loguru import logger
 import os
 import time
-
+from pathlib import Path
+import shutil
 from magic_pdf.libs.Constants import *
+from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.model.model_list import AtomicModel

 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable albumentations update check
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
 try:
     import cv2
     import yaml
@@ -23,6 +26,7 @@ try:
     from unimernet.common.config import Config
     import unimernet.tasks as tasks
     from unimernet.processors import load_processor
+    from doclayout_yolo import YOLOv10

 except ImportError as e:
     logger.exception(e)
@@ -32,21 +36,26 @@ except ImportError as e:
     exit(1)

 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
-from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
+# from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
 from magic_pdf.model.ppTableModel import ppTableModel


 def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
-    if table_model_type == STRUCT_EQTABLE:
-        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
-    else:
+    if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
+        # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+        logger.error("StructEqTable is under upgrade, the current version does not support it.")
+        exit(1)
+    elif table_model_type == MODEL_NAME.TABLE_MASTER:
         config = {
             "model_dir": model_path,
             "device": _device_
         }
         table_model = ppTableModel(config)
+    else:
+        logger.error("table model type not allow")
+        exit(1)
     return table_model
@@ -58,12 +67,13 @@ def mfd_model_init(weight):
 def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
     args = argparse.Namespace(cfg_path=cfg_path, options=None)
     cfg = Config(args)
-    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
+    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.pth")
     cfg.config.model.model_config.model_name = weight_dir
     cfg.config.model.tokenizer_config.path = weight_dir
     task = tasks.setup_task(cfg)
     model = task.build_model(cfg)
-    model = model.to(_device_)
+    model.to(_device_)
+    model.eval()
     vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
     mfr_transform = transforms.Compose([vis_processor, ])
     return [model, mfr_transform]
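`mfr_model_init` now puts the UniMERNet model into eval mode after moving it to the target device, and a later hunk wraps `generate` in `torch.no_grad()`. This is the standard PyTorch inference pairing; a minimal self-contained illustration of what each call buys:

```python
import torch
import torch.nn as nn

# eval() switches dropout (and batch norm) to inference behavior;
# no_grad() stops autograd from recording the forward pass.
model = nn.Sequential(nn.Linear(4, 4), nn.Dropout(p=0.5))
model.eval()

x = torch.ones(1, 4)
with torch.no_grad():
    y1, y2 = model(x), model(x)

assert torch.equal(y1, y2)   # dropout disabled, outputs deterministic
assert not y1.requires_grad  # no autograd graph was built
```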
@@ -74,8 +84,16 @@ def layout_model_init(weight, config_file, device):
     return model


-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
-    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
+def doclayout_yolo_model_init(weight):
+    model = YOLOv10(weight)
+    return model
+
+
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=1.8):
+    if lang is not None:
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
+    else:
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
     return model
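`ocr_model_init` forwards `lang` to `ModifiedPaddleOCR` only when it is explicitly set, leaving PaddleOCR's own default language selection in place otherwise; the `CustomPaddleModel` change near the end of this diff follows the same pattern. The pattern in isolation, as a small sketch:

```python
# Conditional keyword forwarding: pass `lang` through only when given,
# otherwise let the downstream constructor use its own default.
def ocr_kwargs(show_log=False, det_db_box_thresh=0.3, lang=None,
               use_dilation=True, det_db_unclip_ratio=1.8):
    kwargs = dict(show_log=show_log, det_db_box_thresh=det_db_box_thresh,
                  use_dilation=use_dilation,
                  det_db_unclip_ratio=det_db_unclip_ratio)
    if lang is not None:
        kwargs["lang"] = lang
    return kwargs

assert "lang" not in ocr_kwargs()
assert ocr_kwargs(lang="en")["lang"] == "en"
```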
@@ -108,19 +126,27 @@ class AtomModelSingleton:
         return cls._instance

     def get_atom_model(self, atom_model_name: str, **kwargs):
-        if atom_model_name not in self._models:
-            self._models[atom_model_name] = atom_model_init(model_name=atom_model_name, **kwargs)
-        return self._models[atom_model_name]
+        lang = kwargs.get("lang", None)
+        layout_model_name = kwargs.get("layout_model_name", None)
+        key = (atom_model_name, layout_model_name, lang)
+        if key not in self._models:
+            self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs)
+        return self._models[key]


 def atom_model_init(model_name: str, **kwargs):

     if model_name == AtomicModel.Layout:
-        atom_model = layout_model_init(
-            kwargs.get("layout_weights"),
-            kwargs.get("layout_config_file"),
-            kwargs.get("device")
-        )
+        if kwargs.get("layout_model_name") == MODEL_NAME.LAYOUTLMv3:
+            atom_model = layout_model_init(
+                kwargs.get("layout_weights"),
+                kwargs.get("layout_config_file"),
+                kwargs.get("device")
+            )
+        elif kwargs.get("layout_model_name") == MODEL_NAME.DocLayout_YOLO:
+            atom_model = doclayout_yolo_model_init(
+                kwargs.get("doclayout_yolo_weights"),
+            )
     elif model_name == AtomicModel.MFD:
         atom_model = mfd_model_init(
             kwargs.get("mfd_weights")
@@ -134,11 +160,12 @@ def atom_model_init(model_name: str, **kwargs):
     elif model_name == AtomicModel.OCR:
         atom_model = ocr_model_init(
             kwargs.get("ocr_show_log"),
-            kwargs.get("det_db_box_thresh")
+            kwargs.get("det_db_box_thresh"),
+            kwargs.get("lang")
         )
     elif model_name == AtomicModel.Table:
         atom_model = table_model_init(
-            kwargs.get("table_model_type"),
+            kwargs.get("table_model_name"),
             kwargs.get("table_model_path"),
             kwargs.get("table_max_time"),
             kwargs.get("device")
@@ -150,6 +177,23 @@ def atom_model_init(model_name: str, **kwargs):
     return atom_model


+# Unified crop img logic
+def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+    crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
+    crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
+    # Create a white background with an additional width and height of 50
+    crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
+    crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
+    return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
+
+    # Crop image
+    crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
+    cropped_img = input_pil_img.crop(crop_box)
+    return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
+    return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+    return return_image, return_list
+
+
 class CustomPEKModel:

     def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
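`crop_img` is now a module-level helper rather than a closure inside `__call__` (the hunk that removes the old nested copy appears further down). Given a detection dict whose `poly` holds the four box corners, it pastes the crop onto a white canvas padded by `crop_paste_x`/`crop_paste_y` on each side. A small usage sketch with made-up coordinates, assuming the `crop_img` defined above is in scope:

```python
from PIL import Image

page = Image.new('RGB', (200, 100), 'gray')       # stand-in page image
res = {'poly': [10, 10, 80, 10, 80, 40, 10, 40]}  # TL, TR, BR, BL corners

# with 25px of paste padding, the canvas is 50px wider and taller
# than the detected box itself
img, (px, py, xmin, ymin, xmax, ymax, w, h) = crop_img(res, page,
                                                       crop_paste_x=25,
                                                       crop_paste_y=25)
assert (w, h) == (80 - 10 + 50, 40 - 10 + 50)
assert img.size == (w, h)
```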
@@ -169,22 +213,35 @@ class CustomPEKModel:
         with open(config_path, "r", encoding='utf-8') as f:
             self.configs = yaml.load(f, Loader=yaml.FullLoader)
         # initialize parsing config
-        self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
-        self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+
+        # layout config
+        self.layout_config = kwargs.get("layout_config")
+        self.layout_model_name = self.layout_config.get("model", MODEL_NAME.DocLayout_YOLO)
+
+        # formula config
+        self.formula_config = kwargs.get("formula_config")
+        self.mfd_model_name = self.formula_config.get("mfd_model", MODEL_NAME.YOLO_V8_MFD)
+        self.mfr_model_name = self.formula_config.get("mfr_model", MODEL_NAME.UniMerNet_v2_Small)
+        self.apply_formula = self.formula_config.get("enable", True)
+
         # table config
-        self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
-        self.apply_table = self.table_config.get("is_table_recog_enable", False)
+        self.table_config = kwargs.get("table_config")
+        self.apply_table = self.table_config.get("enable", False)
         self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
-        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
+        self.table_model_name = self.table_config.get("model", MODEL_NAME.TABLE_MASTER)
+
+        # ocr config
         self.apply_ocr = ocr
+        self.lang = kwargs.get("lang", None)
+
         logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
+            "DocAnalysis init, this may take some times, layout_model: {}, apply_formula: {}, apply_ocr: {}, "
+            "apply_table: {}, table_model: {}, lang: {}".format(
+                self.layout_model_name, self.apply_formula, self.apply_ocr, self.apply_table, self.table_model_name, self.lang
             )
         )
-        assert self.apply_layout, "DocAnalysis must contain layout model."
         # initialize parsing scheme
-        self.device = kwargs.get("device", self.configs["config"]["device"])
+        self.device = kwargs.get("device", "cpu")
         logger.info("using device: {}".format(self.device))
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
         logger.info("using models_dir: {}".format(models_dir))
@@ -193,17 +250,16 @@ class CustomPEKModel:

        # initialize formula recognition
        if self.apply_formula:
+
            # initialize formula detection model
-            # self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
            self.mfd_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFD,
-                mfd_weights=str(os.path.join(models_dir, self.configs["weights"]["mfd"]))
+                mfd_weights=str(os.path.join(models_dir, self.configs["weights"][self.mfd_model_name]))
            )
+
            # initialize formula parsing model
-            mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"]["mfr"]))
+            mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"][self.mfr_model_name]))
            mfr_cfg_path = str(os.path.join(model_config_dir, "UniMERNet", "demo.yaml"))
-            # self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
-            # self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
            self.mfr_model, self.mfr_transform = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFR,
                mfr_weight_dir=mfr_weight_dir,
@@ -212,17 +268,20 @@ class CustomPEKModel:
            )

        # initialize layout model
-        # self.layout_model = Layoutlmv3_Predictor(
-        #     str(os.path.join(models_dir, self.configs['weights']['layout'])),
-        #     str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
-        #     device=self.device
-        # )
-        self.layout_model = atom_model_manager.get_atom_model(
-            atom_model_name=AtomicModel.Layout,
-            layout_weights=str(os.path.join(models_dir, self.configs['weights']['layout'])),
-            layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
-            device=self.device
-        )
+        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
+            self.layout_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Layout,
+                layout_model_name=MODEL_NAME.LAYOUTLMv3,
+                layout_weights=str(os.path.join(models_dir, self.configs['weights'][self.layout_model_name])),
+                layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+                device=self.device
+            )
+        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
+            self.layout_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Layout,
+                layout_model_name=MODEL_NAME.DocLayout_YOLO,
+                doclayout_yolo_weights=str(os.path.join(models_dir, self.configs['weights'][self.layout_model_name]))
+            )
        # initialize ocr
        if self.apply_ocr:

@@ -230,37 +289,67 @@ class CustomPEKModel:
            self.ocr_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.OCR,
                ocr_show_log=show_log,
-                det_db_box_thresh=0.3
+                det_db_box_thresh=0.3,
+                lang=self.lang
            )
        # init table model
        if self.apply_table:
-            table_model_dir = self.configs["weights"][self.table_model_type]
-            # self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
-            #                                     max_time=self.table_max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_name]
            self.table_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.Table,
-                table_model_type=self.table_model_type,
+                table_model_name=self.table_model_name,
                table_model_path=str(os.path.join(models_dir, table_model_dir)),
                table_max_time=self.table_max_time,
                device=self.device
            )

+            home_directory = Path.home()
+            det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
+            rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
+            det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
+            rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
+
+            if not os.path.exists(det_dest_dir):
+                shutil.copytree(det_source, det_dest_dir)
+            if not os.path.exists(rec_dest_dir):
+                shutil.copytree(rec_source, rec_dest_dir)
+
        logger.info('DocAnalysis init done!')

    def __call__(self, image):

+        page_start = time.time()
+
        latex_filling_list = []
        mf_image_list = []

        # layout detection
        layout_start = time.time()
-        layout_res = self.layout_model(image, ignore_catids=[])
+        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
+            # layoutlmv3
+            layout_res = self.layout_model(image, ignore_catids=[])
+        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
+            # doclayout_yolo
+            layout_res = []
+            doclayout_yolo_res = self.layout_model.predict(image, imgsz=1024, conf=0.25, iou=0.45, verbose=True, device=self.device)[0]
+            for xyxy, conf, cla in zip(doclayout_yolo_res.boxes.xyxy.cpu(), doclayout_yolo_res.boxes.conf.cpu(), doclayout_yolo_res.boxes.cls.cpu()):
+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+                new_item = {
+                    'category_id': int(cla.item()),
+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                    'score': round(float(conf.item()), 3),
+                }
+                layout_res.append(new_item)
        layout_cost = round(time.time() - layout_start, 2)
-        logger.info(f"layout detection cost: {layout_cost}")
+        logger.info(f"layout detection time: {layout_cost}")
+
+        pil_img = Image.fromarray(image)

        if self.apply_formula:
            # formula detection
-            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            mfd_start = time.time()
+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True, device=self.device)[0]
+            logger.info(f"mfd time: {round(time.time() - mfd_start, 2)}")
            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
                new_item = {
@@ -271,7 +360,7 @@ class CustomPEKModel:
                }
                layout_res.append(new_item)
                latex_filling_list.append(new_item)
-                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
                mf_image_list.append(bbox_img)

            # formula recognition
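Both the DocLayout-YOLO branch and the MFD loop above flatten a detector's `[xmin, ymin, xmax, ymax]` box into the 8-value clockwise `poly` that the rest of the pipeline consumes:

```python
def xyxy_to_poly(xmin, ymin, xmax, ymax):
    # clockwise from the top-left corner: TL, TR, BR, BL
    return [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]

assert xyxy_to_poly(10, 20, 30, 40) == [10, 20, 30, 20, 30, 40, 10, 40]
```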
@@ -281,7 +370,8 @@ class CustomPEKModel:
            mfr_res = []
            for mf_img in dataloader:
                mf_img = mf_img.to(self.device)
-                output = self.mfr_model.generate({'image': mf_img})
+                with torch.no_grad():
+                    output = self.mfr_model.generate({'image': mf_img})
                mfr_res.extend(output['pred_str'])
            for res, latex in zip(latex_filling_list, mfr_res):
                res['latex'] = latex_rm_whitespace(latex)
@@ -303,23 +393,14 @@ class CustomPEKModel:
            elif int(res['category_id']) in [5]:
                table_res_list.append(res)

-        # Unified crop img logic
-        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
-            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
-            crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
-            # Create a white background with an additional width and height of 50
-            crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
-            crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
-            return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
-
-            # Crop image
-            crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
-            cropped_img = input_pil_img.crop(crop_box)
-            return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
-            return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
-            return return_image, return_list
-
-        pil_img = Image.fromarray(image)
+        if torch.cuda.is_available():
+            properties = torch.cuda.get_device_properties(self.device)
+            total_memory = properties.total_memory / (1024 ** 3)  # convert bytes to GB
+            if total_memory <= 10:
+                gc_start = time.time()
+                clean_memory()
+                gc_time = round(time.time() - gc_start, 2)
+                logger.info(f"gc time: {gc_time}")

        # ocr recognition
        if self.apply_ocr:
@@ -369,7 +450,7 @@ class CustomPEKModel:
                    })

            ocr_cost = round(time.time() - ocr_start, 2)
-            logger.info(f"ocr cost: {ocr_cost}")
+            logger.info(f"ocr time: {ocr_cost}")

        # table recognition
        if self.apply_table:
@@ -377,17 +458,17 @@ class CustomPEKModel:
            for res in table_res_list:
                new_image, _ = crop_img(res, pil_img)
                single_table_start_time = time.time()
-                logger.info("------------------table recognition processing begins-----------------")
+                # logger.info("------------------table recognition processing begins-----------------")
                latex_code = None
                html_code = None
-                if self.table_model_type == STRUCT_EQTABLE:
+                if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                    with torch.no_grad():
                        latex_code = self.table_model.image2latex(new_image)[0]
                else:
                    html_code = self.table_model.img2html(new_image)

                run_time = time.time() - single_table_start_time
-                logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                # logger.info(f"------------table recognition processing ends within {run_time}s-----")
                if run_time > self.table_max_time:
                    logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                # check whether the result returned is valid
@@ -398,12 +479,15 @@ class CustomPEKModel:
                    if expected_ending:
                        res["latex"] = latex_code
                    else:
-                        logger.warning(f"------------table recognition processing fails----------")
+                        logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
                elif html_code:
                    res["html"] = html_code
                else:
-                    logger.warning(f"------------table recognition processing fails----------")
-            table_cost = round(time.time() - table_start, 2)
-            logger.info(f"table cost: {table_cost}")
+                    logger.warning(f"table recognition processing fails, not get latex or html return")
+            logger.info(f"table time: {round(time.time() - table_start, 2)}")
+
+        logger.info(f"-----page total time: {round(time.time() - page_start, 2)}-----")

        return layout_res
+
+
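`clean_memory` comes from the new `magic_pdf/libs/clean_memory.py` (+10 lines in the file list above) and is invoked once per page when the CUDA device has 10 GB of memory or less. Its diff is not shown here; a plausible sketch of such a helper, stated as an assumption rather than the package's actual implementation:

```python
import gc
import torch

def clean_memory():
    # Assumed behavior: drop cached CUDA allocations, then force a
    # Python GC pass; the real clean_memory may differ in detail.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()
```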
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py

@@ -1,5 +1,12 @@
-from struct_eqtable.model import StructTable
+from loguru import logger
+
+try:
+    from struct_eqtable.model import StructTable
+except ImportError:
+    logger.error("StructEqTable is under upgrade, the current version does not support it.")
 from pypandoc import convert_text
+
+
 class StructTableModel:
     def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
         # init
magic_pdf/model/ppTableModel.py

@@ -52,11 +52,11 @@ class ppTableModel(object):
         rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
         rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
         device = kwargs.get("device", "cpu")
-        use_gpu = True if device == "cuda" else False
+        use_gpu = True if device.startswith("cuda") else False
         config = {
             "use_gpu": use_gpu,
             "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
-            "table_algorithm": TABLE_MASTER,
+            "table_algorithm": "TableMaster",
             "table_model_dir": table_model_dir,
             "table_char_dict_path": table_char_dict_path,
             "det_model_dir": det_model_dir,
magic_pdf/model/pp_structure_v2.py

@@ -18,8 +18,11 @@ def region_to_bbox(region):


 class CustomPaddleModel:
-    def __init__(self, ocr: bool = False, show_log: bool = False):
-        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+    def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
+        if lang is not None:
+            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
+        else:
+            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

     def __call__(self, img):
         try:
magic_pdf/model/v3/__init__.py (file without changes)
magic_pdf/model/v3/helpers.py

@@ -0,0 +1,125 @@
+from collections import defaultdict
+from typing import List, Dict
+
+import torch
+from transformers import LayoutLMv3ForTokenClassification
+
+MAX_LEN = 510
+CLS_TOKEN_ID = 0
+UNK_TOKEN_ID = 3
+EOS_TOKEN_ID = 2
+
+
+class DataCollator:
+    def __call__(self, features: List[dict]) -> Dict[str, torch.Tensor]:
+        bbox = []
+        labels = []
+        input_ids = []
+        attention_mask = []
+
+        # clip bbox and labels to max length, build input_ids and attention_mask
+        for feature in features:
+            _bbox = feature["source_boxes"]
+            if len(_bbox) > MAX_LEN:
+                _bbox = _bbox[:MAX_LEN]
+            _labels = feature["target_index"]
+            if len(_labels) > MAX_LEN:
+                _labels = _labels[:MAX_LEN]
+            _input_ids = [UNK_TOKEN_ID] * len(_bbox)
+            _attention_mask = [1] * len(_bbox)
+            assert len(_bbox) == len(_labels) == len(_input_ids) == len(_attention_mask)
+            bbox.append(_bbox)
+            labels.append(_labels)
+            input_ids.append(_input_ids)
+            attention_mask.append(_attention_mask)
+
+        # add CLS and EOS tokens
+        for i in range(len(bbox)):
+            bbox[i] = [[0, 0, 0, 0]] + bbox[i] + [[0, 0, 0, 0]]
+            labels[i] = [-100] + labels[i] + [-100]
+            input_ids[i] = [CLS_TOKEN_ID] + input_ids[i] + [EOS_TOKEN_ID]
+            attention_mask[i] = [1] + attention_mask[i] + [1]
+
+        # padding to max length
+        max_len = max(len(x) for x in bbox)
+        for i in range(len(bbox)):
+            bbox[i] = bbox[i] + [[0, 0, 0, 0]] * (max_len - len(bbox[i]))
+            labels[i] = labels[i] + [-100] * (max_len - len(labels[i]))
+            input_ids[i] = input_ids[i] + [EOS_TOKEN_ID] * (max_len - len(input_ids[i]))
+            attention_mask[i] = attention_mask[i] + [0] * (
+                max_len - len(attention_mask[i])
+            )
+
+        ret = {
+            "bbox": torch.tensor(bbox),
+            "attention_mask": torch.tensor(attention_mask),
+            "labels": torch.tensor(labels),
+            "input_ids": torch.tensor(input_ids),
+        }
+        # set label > MAX_LEN to -100, because original labels may be > MAX_LEN
+        ret["labels"][ret["labels"] > MAX_LEN] = -100
+        # set label > 0 to label-1, because original labels are 1-indexed
+        ret["labels"][ret["labels"] > 0] -= 1
+        return ret
+
+
+def boxes2inputs(boxes: List[List[int]]) -> Dict[str, torch.Tensor]:
+    bbox = [[0, 0, 0, 0]] + boxes + [[0, 0, 0, 0]]
+    input_ids = [CLS_TOKEN_ID] + [UNK_TOKEN_ID] * len(boxes) + [EOS_TOKEN_ID]
+    attention_mask = [1] + [1] * len(boxes) + [1]
+    return {
+        "bbox": torch.tensor([bbox]),
+        "attention_mask": torch.tensor([attention_mask]),
+        "input_ids": torch.tensor([input_ids]),
+    }
+
+
+def prepare_inputs(
+    inputs: Dict[str, torch.Tensor], model: LayoutLMv3ForTokenClassification
+) -> Dict[str, torch.Tensor]:
+    ret = {}
+    for k, v in inputs.items():
+        v = v.to(model.device)
+        if torch.is_floating_point(v):
+            v = v.to(model.dtype)
+        ret[k] = v
+    return ret
+
+
+def parse_logits(logits: torch.Tensor, length: int) -> List[int]:
+    """
+    parse logits to orders
+
+    :param logits: logits from model
+    :param length: input length
+    :return: orders
+    """
+    logits = logits[1 : length + 1, :length]
+    orders = logits.argsort(descending=False).tolist()
+    ret = [o.pop() for o in orders]
+    while True:
+        order_to_idxes = defaultdict(list)
+        for idx, order in enumerate(ret):
+            order_to_idxes[order].append(idx)
+        # filter idxes len > 1
+        order_to_idxes = {k: v for k, v in order_to_idxes.items() if len(v) > 1}
+        if not order_to_idxes:
+            break
+        # filter
+        for order, idxes in order_to_idxes.items():
+            # find original logits of idxes
+            idxes_to_logit = {}
+            for idx in idxes:
+                idxes_to_logit[idx] = logits[idx, order]
+            idxes_to_logit = sorted(
+                idxes_to_logit.items(), key=lambda x: x[1], reverse=True
+            )
+            # keep the highest logit as order, set others to next candidate
+            for idx, _ in idxes_to_logit[1:]:
+                ret[idx] = orders[idx].pop()
+
+    return ret
+
+
+def check_duplicate(a: List[int]) -> bool:
+    return len(a) != len(set(a))
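Taken together, `boxes2inputs`, `prepare_inputs`, and `parse_logits` run reading-order prediction over layout boxes with a LayoutLMv3 token-classification head: each box becomes one `UNK` token, and `parse_logits` greedily resolves duplicate order assignments by logit score. A hedged end-to-end sketch (the checkpoint name is illustrative, not something this package pins):

```python
import torch
from transformers import LayoutLMv3ForTokenClassification

# Illustrative reading-order checkpoint; substitute the LayoutLMv3
# weights you actually use.
model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")

# Boxes are [x0, y0, x1, y1], normalized to a 0-1000 page grid.
boxes = [[38, 40, 320, 68], [38, 80, 320, 108], [360, 40, 640, 68]]
inputs = prepare_inputs(boxes2inputs(boxes), model)
with torch.no_grad():
    logits = model(**inputs).logits[0]  # (seq_len, num_labels)
reading_order = parse_logits(logits, len(boxes))
assert not check_duplicate(reading_order)  # each position assigned once
```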