PyPI - magic-pdf - Versions diffs - 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl - Mend

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

magic_pdf/config/__init__.py +0 -0
magic_pdf/config/enums.py +7 -0
magic_pdf/config/exceptions.py +32 -0
magic_pdf/data/__init__.py +0 -0
magic_pdf/data/data_reader_writer/__init__.py +12 -0
magic_pdf/data/data_reader_writer/base.py +51 -0
magic_pdf/data/data_reader_writer/filebase.py +59 -0
magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
magic_pdf/data/data_reader_writer/s3.py +73 -0
magic_pdf/data/dataset.py +194 -0
magic_pdf/data/io/__init__.py +6 -0
magic_pdf/data/io/base.py +42 -0
magic_pdf/data/io/http.py +37 -0
magic_pdf/data/io/s3.py +114 -0
magic_pdf/data/read_api.py +95 -0
magic_pdf/data/schemas.py +19 -0
magic_pdf/data/utils.py +32 -0
magic_pdf/dict2md/ocr_mkcontent.py +106 -244
magic_pdf/libs/Constants.py +21 -8
magic_pdf/libs/MakeContentConfig.py +1 -0
magic_pdf/libs/boxbase.py +35 -0
magic_pdf/libs/clean_memory.py +10 -0
magic_pdf/libs/config_reader.py +53 -23
magic_pdf/libs/draw_bbox.py +150 -65
magic_pdf/libs/ocr_content_type.py +2 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
magic_pdf/model/magic_model.py +331 -15
magic_pdf/model/pdf_extract_kit.py +170 -83
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
magic_pdf/model/ppTableModel.py +8 -6
magic_pdf/model/pp_structure_v2.py +5 -2
magic_pdf/model/v3/__init__.py +0 -0
magic_pdf/model/v3/helpers.py +125 -0
magic_pdf/para/para_split_v3.py +322 -0
magic_pdf/pdf_parse_by_ocr.py +6 -3
magic_pdf/pdf_parse_by_txt.py +6 -3
magic_pdf/pdf_parse_union_core_v2.py +644 -0
magic_pdf/pipe/AbsPipe.py +5 -1
magic_pdf/pipe/OCRPipe.py +10 -4
magic_pdf/pipe/TXTPipe.py +10 -4
magic_pdf/pipe/UNIPipe.py +16 -7
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
magic_pdf/resources/model_config/model_configs.yaml +5 -13
magic_pdf/tools/cli.py +14 -1
magic_pdf/tools/common.py +18 -8
magic_pdf/user_api.py +25 -6
magic_pdf/utils/__init__.py +0 -0
magic_pdf/utils/annotations.py +11 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -1,11 +1,14 @@
 from loguru import logger
 import os
 import time
+from pathlib import Path
+import shutil
 from magic_pdf.libs.Constants import *
+from magic_pdf.libs.clean_memory import clean_memory
 from magic_pdf.model.model_list import AtomicModel
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
 try:
     import cv2
     import yaml
@@ -23,6 +26,7 @@ try:
     from unimernet.common.config import Config
     import unimernet.tasks as tasks
     from unimernet.processors import load_processor
+    from doclayout_yolo import YOLOv10
 except ImportError as e:
     logger.exception(e)
@@ -32,21 +36,24 @@ except ImportError as e:
     exit(1)
 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
 from magic_pdf.model.ppTableModel import ppTableModel
 def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
-    if table_model_type == STRUCT_EQTABLE:
-        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
-    else:
+    if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
+        table_model = StructTableModel(model_path, max_time=max_time)
+    elif table_model_type == MODEL_NAME.TABLE_MASTER:
         config = {
             "model_dir": model_path,
             "device": _device_
         }
         table_model = ppTableModel(config)
+    else:
+        logger.error("table model type not allow")
+        exit(1)
     return table_model
@@ -58,12 +65,13 @@ def mfd_model_init(weight):
 def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
     args = argparse.Namespace(cfg_path=cfg_path, options=None)
     cfg = Config(args)
-    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
+    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.pth")
     cfg.config.model.model_config.model_name = weight_dir
     cfg.config.model.tokenizer_config.path = weight_dir
     task = tasks.setup_task(cfg)
     model = task.build_model(cfg)
-    model = model.to(_device_)
+    model.to(_device_)
+    model.eval()
     vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
     mfr_transform = transforms.Compose([vis_processor, ])
     return [model, mfr_transform]
@@ -74,8 +82,16 @@ def layout_model_init(weight, config_file, device):
     return model
-def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
-    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
+def doclayout_yolo_model_init(weight):
+    model = YOLOv10(weight)
+    return model
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=1.8):
+    if lang is not None:
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, lang=lang, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
+    else:
+        model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh, use_dilation=use_dilation, det_db_unclip_ratio=det_db_unclip_ratio)
     return model
@@ -108,19 +124,27 @@ class AtomModelSingleton:
         return cls._instance
     def get_atom_model(self, atom_model_name: str, **kwargs):
-        if atom_model_name not in self._models:
-            self._models[atom_model_name] = atom_model_init(model_name=atom_model_name, **kwargs)
-        return self._models[atom_model_name]
+        lang = kwargs.get("lang", None)
+        layout_model_name = kwargs.get("layout_model_name", None)
+        key = (atom_model_name, layout_model_name, lang)
+        if key not in self._models:
+            self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs)
+        return self._models[key]
 def atom_model_init(model_name: str, **kwargs):
     if model_name == AtomicModel.Layout:
-        atom_model = layout_model_init(
-            kwargs.get("layout_weights"),
-            kwargs.get("layout_config_file"),
-            kwargs.get("device")
-        )
+        if kwargs.get("layout_model_name") == MODEL_NAME.LAYOUTLMv3:
+            atom_model = layout_model_init(
+                kwargs.get("layout_weights"),
+                kwargs.get("layout_config_file"),
+                kwargs.get("device")
+            )
+        elif kwargs.get("layout_model_name") == MODEL_NAME.DocLayout_YOLO:
+            atom_model = doclayout_yolo_model_init(
+                kwargs.get("doclayout_yolo_weights"),
+            )
     elif model_name == AtomicModel.MFD:
         atom_model = mfd_model_init(
             kwargs.get("mfd_weights")
@@ -134,11 +158,12 @@ def atom_model_init(model_name: str, **kwargs):
     elif model_name == AtomicModel.OCR:
         atom_model = ocr_model_init(
             kwargs.get("ocr_show_log"),
-            kwargs.get("det_db_box_thresh")
+            kwargs.get("det_db_box_thresh"),
+            kwargs.get("lang")
         )
     elif model_name == AtomicModel.Table:
         atom_model = table_model_init(
-            kwargs.get("table_model_type"),
+            kwargs.get("table_model_name"),
             kwargs.get("table_model_path"),
             kwargs.get("table_max_time"),
             kwargs.get("device")
@@ -150,6 +175,23 @@ def atom_model_init(model_name: str, **kwargs):
     return atom_model
+#  Unified crop img logic
+def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+    crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
+    crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
+    # Create a white background with an additional width and height of 50
+    crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
+    crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
+    return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
+    # Crop image
+    crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
+    cropped_img = input_pil_img.crop(crop_box)
+    return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
+    return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+    return return_image, return_list
 class CustomPEKModel:
     def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
@@ -169,22 +211,35 @@ class CustomPEKModel:
         with open(config_path, "r", encoding='utf-8') as f:
             self.configs = yaml.load(f, Loader=yaml.FullLoader)
         # 初始化解析配置
-        self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
-        self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        # layout config
+        self.layout_config = kwargs.get("layout_config")
+        self.layout_model_name = self.layout_config.get("model", MODEL_NAME.DocLayout_YOLO)
+        # formula config
+        self.formula_config = kwargs.get("formula_config")
+        self.mfd_model_name = self.formula_config.get("mfd_model", MODEL_NAME.YOLO_V8_MFD)
+        self.mfr_model_name = self.formula_config.get("mfr_model", MODEL_NAME.UniMerNet_v2_Small)
+        self.apply_formula = self.formula_config.get("enable", True)
         # table config
-        self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
-        self.apply_table = self.table_config.get("is_table_recog_enable", False)
+        self.table_config = kwargs.get("table_config")
+        self.apply_table = self.table_config.get("enable", False)
         self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
-        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
+        self.table_model_name = self.table_config.get("model", MODEL_NAME.TABLE_MASTER)
+        # ocr config
         self.apply_ocr = ocr
+        self.lang = kwargs.get("lang", None)
         logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
+            "DocAnalysis init, this may take some times, layout_model: {}, apply_formula: {}, apply_ocr: {}, "
+            "apply_table: {}, table_model: {}, lang: {}".format(
+                self.layout_model_name, self.apply_formula, self.apply_ocr, self.apply_table, self.table_model_name, self.lang
             )
         )
-        assert self.apply_layout, "DocAnalysis must contain layout model."
         # 初始化解析方案
-        self.device = kwargs.get("device", self.configs["config"]["device"])
+        self.device = kwargs.get("device", "cpu")
         logger.info("using device: {}".format(self.device))
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
         logger.info("using models_dir: {}".format(models_dir))
@@ -193,17 +248,16 @@ class CustomPEKModel:
         # 初始化公式识别
         if self.apply_formula:
             # 初始化公式检测模型
-            # self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
             self.mfd_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.MFD,
-                mfd_weights=str(os.path.join(models_dir, self.configs["weights"]["mfd"]))
+                mfd_weights=str(os.path.join(models_dir, self.configs["weights"][self.mfd_model_name]))
             )
             # 初始化公式解析模型
-            mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"]["mfr"]))
+            mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"][self.mfr_model_name]))
             mfr_cfg_path = str(os.path.join(model_config_dir, "UniMERNet", "demo.yaml"))
-            # self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
-            # self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
             self.mfr_model, self.mfr_transform = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.MFR,
                 mfr_weight_dir=mfr_weight_dir,
@@ -212,17 +266,20 @@ class CustomPEKModel:
             )
         # 初始化layout模型
-        # self.layout_model = Layoutlmv3_Predictor(
-        #     str(os.path.join(models_dir, self.configs['weights']['layout'])),
-        #     str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
-        #     device=self.device
-        # )
-        self.layout_model = atom_model_manager.get_atom_model(
-            atom_model_name=AtomicModel.Layout,
-            layout_weights=str(os.path.join(models_dir, self.configs['weights']['layout'])),
-            layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
-            device=self.device
-        )
+        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
+            self.layout_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Layout,
+                layout_model_name=MODEL_NAME.LAYOUTLMv3,
+                layout_weights=str(os.path.join(models_dir, self.configs['weights'][self.layout_model_name])),
+                layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+                device=self.device
+            )
+        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
+            self.layout_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Layout,
+                layout_model_name=MODEL_NAME.DocLayout_YOLO,
+                doclayout_yolo_weights=str(os.path.join(models_dir, self.configs['weights'][self.layout_model_name]))
+            )
         # 初始化ocr
         if self.apply_ocr:
@@ -230,37 +287,67 @@ class CustomPEKModel:
             self.ocr_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.OCR,
                 ocr_show_log=show_log,
-                det_db_box_thresh=0.3
+                det_db_box_thresh=0.3,
+                lang=self.lang
             )
         # init table model
         if self.apply_table:
-            table_model_dir = self.configs["weights"][self.table_model_type]
-            # self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
-            #                                     max_time=self.table_max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_name]
             self.table_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.Table,
-                table_model_type=self.table_model_type,
+                table_model_name=self.table_model_name,
                 table_model_path=str(os.path.join(models_dir, table_model_dir)),
                 table_max_time=self.table_max_time,
                 device=self.device
             )
+            home_directory = Path.home()
+            det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
+            rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
+            det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
+            rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
+            if not os.path.exists(det_dest_dir):
+                shutil.copytree(det_source, det_dest_dir)
+            if not os.path.exists(rec_dest_dir):
+                shutil.copytree(rec_source, rec_dest_dir)
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
+        page_start = time.time()
         latex_filling_list = []
         mf_image_list = []
         # layout检测
         layout_start = time.time()
-        layout_res = self.layout_model(image, ignore_catids=[])
+        if self.layout_model_name == MODEL_NAME.LAYOUTLMv3:
+            # layoutlmv3
+            layout_res = self.layout_model(image, ignore_catids=[])
+        elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
+            # doclayout_yolo
+            layout_res = []
+            doclayout_yolo_res = self.layout_model.predict(image, imgsz=1024, conf=0.25, iou=0.45, verbose=True, device=self.device)[0]
+            for xyxy, conf, cla in zip(doclayout_yolo_res.boxes.xyxy.cpu(), doclayout_yolo_res.boxes.conf.cpu(), doclayout_yolo_res.boxes.cls.cpu()):
+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+                new_item = {
+                    'category_id': int(cla.item()),
+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                    'score': round(float(conf.item()), 3),
+                }
+                layout_res.append(new_item)
         layout_cost = round(time.time() - layout_start, 2)
-        logger.info(f"layout detection cost: {layout_cost}")
+        logger.info(f"layout detection time: {layout_cost}")
+        pil_img = Image.fromarray(image)
         if self.apply_formula:
             # 公式检测
-            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            mfd_start = time.time()
+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True, device=self.device)[0]
+            logger.info(f"mfd time: {round(time.time() - mfd_start, 2)}")
             for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
                 xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
                 new_item = {
@@ -271,7 +358,7 @@ class CustomPEKModel:
                 }
                 layout_res.append(new_item)
                 latex_filling_list.append(new_item)
-                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                bbox_img = pil_img.crop((xmin, ymin, xmax, ymax))
                 mf_image_list.append(bbox_img)
             # 公式识别
@@ -281,7 +368,8 @@ class CustomPEKModel:
             mfr_res = []
             for mf_img in dataloader:
                 mf_img = mf_img.to(self.device)
-                output = self.mfr_model.generate({'image': mf_img})
+                with torch.no_grad():
+                    output = self.mfr_model.generate({'image': mf_img})
                 mfr_res.extend(output['pred_str'])
             for res, latex in zip(latex_filling_list, mfr_res):
                 res['latex'] = latex_rm_whitespace(latex)
@@ -303,23 +391,14 @@ class CustomPEKModel:
             elif int(res['category_id']) in [5]:
                 table_res_list.append(res)
-        #  Unified crop img logic
-        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
-            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
-            crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
-            # Create a white background with an additional width and height of 50
-            crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
-            crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
-            return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
-            # Crop image
-            crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
-            cropped_img = input_pil_img.crop(crop_box)
-            return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
-            return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
-            return return_image, return_list
-        pil_img = Image.fromarray(image)
+        if torch.cuda.is_available() and self.device != 'cpu':
+            properties = torch.cuda.get_device_properties(self.device)
+            total_memory = properties.total_memory / (1024 ** 3)  # 将字节转换为 GB
+            if total_memory <= 10:
+                gc_start = time.time()
+                clean_memory()
+                gc_time = round(time.time() - gc_start, 2)
+                logger.info(f"gc time: {gc_time}")
         # ocr识别
         if self.apply_ocr:
@@ -369,7 +448,7 @@ class CustomPEKModel:
                         })
             ocr_cost = round(time.time() - ocr_start, 2)
-            logger.info(f"ocr cost: {ocr_cost}")
+            logger.info(f"ocr time: {ocr_cost}")
         # 表格识别 table recognition
         if self.apply_table:
@@ -377,33 +456,41 @@ class CustomPEKModel:
             for res in table_res_list:
                 new_image, _ = crop_img(res, pil_img)
                 single_table_start_time = time.time()
-                logger.info("------------------table recognition processing begins-----------------")
+                # logger.info("------------------table recognition processing begins-----------------")
                 latex_code = None
                 html_code = None
-                if self.table_model_type == STRUCT_EQTABLE:
+                if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                     with torch.no_grad():
-                        latex_code = self.table_model.image2latex(new_image)[0]
+                        table_result = self.table_model.predict(new_image, "html")
+                        if len(table_result) > 0:
+                            html_code = table_result[0]
                 else:
                     html_code = self.table_model.img2html(new_image)
                 run_time = time.time() - single_table_start_time
-                logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                # logger.info(f"------------table recognition processing ends within {run_time}s-----")
                 if run_time > self.table_max_time:
                     logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                 # 判断是否返回正常
                 if latex_code:
-                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
-                        'end{table}')
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
                     if expected_ending:
                         res["latex"] = latex_code
                     else:
-                        logger.warning(f"------------table recognition processing fails----------")
+                        logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
                 elif html_code:
-                    res["html"] = html_code
+                    expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
+                    if expected_ending:
+                        res["html"] = html_code
+                    else:
+                        logger.warning(f"table recognition processing fails, not found expected HTML table end")
                 else:
-                    logger.warning(f"------------table recognition processing fails----------")
-            table_cost = round(time.time() - table_start, 2)
-            logger.info(f"table cost: {table_cost}")
+                    logger.warning(f"table recognition processing fails, not get latex or html return")
+            logger.info(f"table time: {round(time.time() - table_start, 2)}")
+        logger.info(f"-----page total time: {round(time.time() - page_start, 2)}-----")
         return layout_res

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py CHANGED Viewed

@@ -1,21 +1,45 @@
-from struct_eqtable.model import StructTable
-from pypandoc import convert_text
+import re
+import torch
+from struct_eqtable import build_model
 class StructTableModel:
-    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
+    def __init__(self, model_path, max_new_tokens=1024, max_time=60):
         # init
-        self.model_path = model_path
-        self.max_new_tokens = max_new_tokens # maximum output tokens length
-        self.max_time = max_time # timeout for processing in seconds
-        if device == 'cuda':
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
+        assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
+        self.model = build_model(
+            model_ckpt=model_path,
+            max_new_tokens=max_new_tokens,
+            max_time=max_time,
+            lmdeploy=False,
+            flash_attn=False,
+            batch_size=1,
+        ).cuda()
+        self.default_format = "html"
+    def predict(self, images, output_format=None, **kwargs):
+        if output_format is None:
+            output_format = self.default_format
         else:
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
+            if output_format not in ['latex', 'markdown', 'html']:
+                raise ValueError(f"Output format {output_format} is not supported.")
+        results = self.model(
+            images, output_format=output_format
+        )
+        if output_format == "html":
+            results = [self.minify_html(html) for html in results]
-    def image2latex(self, image) -> str:
-        table_latex = self.model.forward(image)
-        return table_latex
+        return results
-    def image2html(self, image) -> str:
-        table_latex = self.image2latex(image)
-        table_html = convert_text(table_latex, 'html', format='latex')
-        return table_html
+    def minify_html(self, html):
+        # 移除多余的空白字符
+        html = re.sub(r'\s+', ' ', html)
+        # 移除行尾的空白字符
+        html = re.sub(r'\s*>\s*', '>', html)
+        # 移除标签前的空白字符
+        html = re.sub(r'\s*<\s*', '<', html)
+        return html.strip()

magic_pdf/model/ppTableModel.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import cv2
 from paddleocr.ppstructure.table.predict_table import TableSystem
 from paddleocr.ppstructure.utility import init_args
 from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
         - HTML (str): A string representing the HTML structure with content of the table.
         """
         if isinstance(image, Image.Image):
-            image = np.array(image)
+            image = np.asarray(image)
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
         pred_res, _ = self.table_sys(image)
         pred_html = pred_res["html"]
-        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
-                                                                                               "") + "</table></td>\n"
-        return res
+        # res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace(
+        # "</table></body></html>","") + "</table></td>\n"
+        return pred_html
     def parse_args(self, **kwargs):
         parser = init_args()
@@ -52,11 +54,11 @@ class ppTableModel(object):
         rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
         rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
         device = kwargs.get("device", "cpu")
-        use_gpu = True if device == "cuda" else False
+        use_gpu = True if device.startswith("cuda") else False
         config = {
             "use_gpu": use_gpu,
             "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
-            "table_algorithm": TABLE_MASTER,
+            "table_algorithm": "TableMaster",
             "table_model_dir": table_model_dir,
             "table_char_dict_path": table_char_dict_path,
             "det_model_dir": det_model_dir,

magic_pdf/model/pp_structure_v2.py CHANGED Viewed

@@ -18,8 +18,11 @@ def region_to_bbox(region):
 class CustomPaddleModel:
-    def __init__(self, ocr: bool = False, show_log: bool = False):
-        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+    def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
+        if lang is not None:
+            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
+        else:
+            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
     def __call__(self, img):
         try:

magic_pdf/model/v3/__init__.py ADDED Viewed

File without changes

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl