PyPI - magic-pdf - Versions diffs - 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

magic_pdf/dict2md/ocr_mkcontent.py +134 -76
magic_pdf/integrations/__init__.py +0 -0
magic_pdf/integrations/rag/__init__.py +0 -0
magic_pdf/integrations/rag/api.py +82 -0
magic_pdf/integrations/rag/type.py +82 -0
magic_pdf/integrations/rag/utils.py +285 -0
magic_pdf/layout/layout_sort.py +472 -283
magic_pdf/libs/Constants.py +27 -1
magic_pdf/libs/boxbase.py +169 -149
magic_pdf/libs/draw_bbox.py +113 -87
magic_pdf/libs/ocr_content_type.py +21 -18
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
magic_pdf/model/magic_model.py +230 -161
magic_pdf/model/model_list.py +8 -0
magic_pdf/model/pdf_extract_kit.py +135 -22
magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
magic_pdf/model/ppTableModel.py +67 -0
magic_pdf/para/para_split_v2.py +76 -74
magic_pdf/pdf_parse_union_core.py +34 -6
magic_pdf/pipe/AbsPipe.py +4 -1
magic_pdf/pipe/OCRPipe.py +7 -4
magic_pdf/pipe/TXTPipe.py +7 -4
magic_pdf/pipe/UNIPipe.py +11 -6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
magic_pdf/resources/model_config/model_configs.yaml +3 -1
magic_pdf/tools/cli.py +56 -29
magic_pdf/tools/cli_dev.py +61 -64
magic_pdf/tools/common.py +57 -37
magic_pdf/user_api.py +17 -9
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0

magic_pdf/model/model_list.py CHANGED Viewed

@@ -1,3 +1,11 @@
 # String identifiers for the available model backends.
 # NOTE(review): where these values are consumed is not visible in this diff.
 class MODEL:
     Paddle = "pp_structure_v2"
     PEK = "pdf_extract_kit"
+# Names of the individual ("atomic") models. atom_model_init in
+# pdf_extract_kit.py dispatches on these string values, and
+# AtomModelSingleton uses them as cache keys.
+class AtomicModel:
+    Layout = "layout"  # built by layout_model_init
+    MFD = "mfd"  # formula detection, built by mfd_model_init
+    MFR = "mfr"  # formula recognition, built by mfr_model_init
+    OCR = "ocr"  # built by ocr_model_init
+    Table = "table"  # built by table_model_init

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -2,7 +2,8 @@ from loguru import logger
 import os
 import time
-from magic_pdf.libs.Constants import TABLE_MAX_TIME_VALUE
+from magic_pdf.libs.Constants import *
+from magic_pdf.model.model_list import AtomicModel
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # Stop albumentations from checking for updates
 try:
@@ -34,10 +35,18 @@ from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Pre
 from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
-def table_model_init(model_path, max_time, _device_='cpu'):
-    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+from magic_pdf.model.ppTableModel import ppTableModel
+def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
+    """
+    Build the table-recognition model selected by ``table_model_type``.
+    :param table_model_type: STRUCT_EQTABLE selects StructTableModel; any other value selects ppTableModel
+    :param model_path: Model path for StructTableModel, or the ``model_dir`` config entry for ppTableModel
+    :param max_time: Forwarded to StructTableModel only; not used on the ppTableModel path
+    :param _device_: Device identifier forwarded to the chosen model
+    :return: The constructed model instance
+    """
+    if table_model_type == STRUCT_EQTABLE:
+        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    else:
+        # NOTE(review): every value other than STRUCT_EQTABLE (including a
+        # misspelled type) silently falls through to ppTableModel.
+        config = {
+            "model_dir": model_path,
+            "device": _device_
+        }
+        table_model = ppTableModel(config)
     return table_model
@@ -56,7 +65,8 @@ def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
     model = task.build_model(cfg)
     model = model.to(_device_)
     vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
-    return model, vis_processor
+    mfr_transform = transforms.Compose([vis_processor, ])
+    return [model, mfr_transform]
 def layout_model_init(weight, config_file, device):
@@ -64,6 +74,11 @@ def layout_model_init(weight, config_file, device):
     return model
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
+    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
+    return model
 class MathDataset(Dataset):
     def __init__(self, image_paths, transform=None):
         self.image_paths = image_paths
@@ -83,6 +98,58 @@ class MathDataset(Dataset):
             return image
+class AtomModelSingleton:
+    _instance = None
+    _models = {}
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def get_atom_model(self, atom_model_name: str, **kwargs):
+        if atom_model_name not in self._models:
+            self._models[atom_model_name] = atom_model_init(model_name=atom_model_name, **kwargs)
+        return self._models[atom_model_name]
+def atom_model_init(model_name: str, **kwargs):
+    if model_name == AtomicModel.Layout:
+        atom_model = layout_model_init(
+            kwargs.get("layout_weights"),
+            kwargs.get("layout_config_file"),
+            kwargs.get("device")
+        )
+    elif model_name == AtomicModel.MFD:
+        atom_model = mfd_model_init(
+            kwargs.get("mfd_weights")
+        )
+    elif model_name == AtomicModel.MFR:
+        atom_model = mfr_model_init(
+            kwargs.get("mfr_weight_dir"),
+            kwargs.get("mfr_cfg_path"),
+            kwargs.get("device")
+        )
+    elif model_name == AtomicModel.OCR:
+        atom_model = ocr_model_init(
+            kwargs.get("ocr_show_log"),
+            kwargs.get("det_db_box_thresh")
+        )
+    elif model_name == AtomicModel.Table:
+        atom_model = table_model_init(
+            kwargs.get("table_model_type"),
+            kwargs.get("table_model_path"),
+            kwargs.get("table_max_time"),
+            kwargs.get("device")
+        )
+    else:
+        logger.error("model name not allow")
+        exit(1)
+    return atom_model
 class CustomPEKModel:
     def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
@@ -104,9 +171,11 @@ class CustomPEKModel:
         # 初始化解析配置
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        # table config
         self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
         self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
+        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
         self.apply_ocr = ocr
         logger.info(
             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
@@ -120,31 +189,62 @@ class CustomPEKModel:
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
         logger.info("using models_dir: {}".format(models_dir))
+        atom_model_manager = AtomModelSingleton()
         # 初始化公式识别
         if self.apply_formula:
             # 初始化公式检测模型
-            self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
+            # self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
+            self.mfd_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.MFD,
+                mfd_weights=str(os.path.join(models_dir, self.configs["weights"]["mfd"]))
+            )
             # 初始化公式解析模型
             mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"]["mfr"]))
             mfr_cfg_path = str(os.path.join(model_config_dir, "UniMERNet", "demo.yaml"))
-            self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
-            self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
+            # self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
+            # self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
+            self.mfr_model, self.mfr_transform = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.MFR,
+                mfr_weight_dir=mfr_weight_dir,
+                mfr_cfg_path=mfr_cfg_path,
+                device=self.device
+            )
         # 初始化layout模型
-        self.layout_model = Layoutlmv3_Predictor(
-            str(os.path.join(models_dir, self.configs['weights']['layout'])),
-            str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+        # self.layout_model = Layoutlmv3_Predictor(
+        #     str(os.path.join(models_dir, self.configs['weights']['layout'])),
+        #     str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+        #     device=self.device
+        # )
+        self.layout_model = atom_model_manager.get_atom_model(
+            atom_model_name=AtomicModel.Layout,
+            layout_weights=str(os.path.join(models_dir, self.configs['weights']['layout'])),
+            layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
             device=self.device
         )
         # 初始化ocr
         if self.apply_ocr:
-            self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
-        # init structeqtable
+            # self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
+            self.ocr_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.OCR,
+                ocr_show_log=show_log,
+                det_db_box_thresh=0.3
+            )
+        # init table model
         if self.apply_table:
-            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
-                                                max_time = self.table_max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_type]
+            # self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
+            #                                     max_time=self.table_max_time, _device_=self.device)
+            self.table_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Table,
+                table_model_type=self.table_model_type,
+                table_model_path=str(os.path.join(models_dir, table_model_dir)),
+                table_max_time=self.table_max_time,
+                device=self.device
+            )
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
@@ -278,16 +378,29 @@ class CustomPEKModel:
                 new_image, _ = crop_img(res, pil_img)
                 single_table_start_time = time.time()
                 logger.info("------------------table recognition processing begins-----------------")
-                with torch.no_grad():
-                    latex_code = self.table_model.image2latex(new_image)[0]
+                latex_code = None
+                html_code = None
+                if self.table_model_type == STRUCT_EQTABLE:
+                    with torch.no_grad():
+                        latex_code = self.table_model.image2latex(new_image)[0]
+                else:
+                    html_code = self.table_model.img2html(new_image)
                 run_time = time.time() - single_table_start_time
                 logger.info(f"------------table recognition processing ends within {run_time}s-----")
                 if run_time > self.table_max_time:
                     logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                 # 判断是否返回正常
-                expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
-                if latex_code and expected_ending:
-                    res["latex"] = latex_code
+                if latex_code:
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
+                        'end{table}')
+                    if expected_ending:
+                        res["latex"] = latex_code
+                    else:
+                        logger.warning(f"------------table recognition processing fails----------")
+                elif html_code:
+                    res["html"] = html_code
                 else:
                     logger.warning(f"------------table recognition processing fails----------")
             table_cost = round(time.time() - table_start, 2)

magic_pdf/model/pek_sub_modules/self_modify.py CHANGED Viewed

@@ -12,6 +12,7 @@ from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binari
 from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
+from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
 logger = get_logger()
@@ -162,6 +163,86 @@ def update_det_boxes(dt_boxes, mfd_res):
     return new_dt_boxes
+def merge_overlapping_spans(spans):
+    """
+    Merges overlapping spans on the same line.
+    :param spans: A list of span coordinates [(x1, y1, x2, y2), ...]
+    :return: A list of merged spans
+    """
+    # Return an empty list if the input spans list is empty
+    if not spans:
+        return []
+    # Sort spans by their starting x-coordinate
+    spans.sort(key=lambda x: x[0])
+    # Initialize the list of merged spans
+    merged = []
+    for span in spans:
+        # Unpack span coordinates
+        x1, y1, x2, y2 = span
+        # If the merged list is empty or there's no horizontal overlap, add the span directly
+        if not merged or merged[-1][2] < x1:
+            merged.append(span)
+        else:
+            # If there is horizontal overlap, merge the current span with the previous one
+            last_span = merged.pop()
+            # Update the merged span's top-left corner to the smaller (x1, y1) and bottom-right to the larger (x2, y2)
+            x1 = min(last_span[0], x1)
+            y1 = min(last_span[1], y1)
+            x2 = max(last_span[2], x2)
+            y2 = max(last_span[3], y2)
+            # Add the merged span back to the list
+            merged.append((x1, y1, x2, y2))
+    # Return the list of merged spans
+    return merged
+def merge_det_boxes(dt_boxes):
+    """
+    Merge detection boxes.
+    This function takes a list of detected bounding boxes, each represented by four corner points.
+    The goal is to merge these bounding boxes into larger text regions.
+    Parameters:
+    dt_boxes (list): A list containing multiple text detection boxes, where each box is defined by four corner points.
+    Returns:
+    list: A list containing the merged text regions, where each region is represented by four corner points.
+    """
+    # Convert the detection boxes into a dictionary format with bounding boxes and type
+    dt_boxes_dict_list = []
+    for text_box in dt_boxes:
+        text_bbox = points_to_bbox(text_box)
+        text_box_dict = {
+            'bbox': text_bbox,
+            'type': 'text',
+        }
+        dt_boxes_dict_list.append(text_box_dict)
+    # Merge adjacent text regions into lines
+    lines = merge_spans_to_line(dt_boxes_dict_list)
+    # Initialize a new list for storing the merged text regions
+    new_dt_boxes = []
+    for line in lines:
+        line_bbox_list = []
+        for span in line:
+            line_bbox_list.append(span['bbox'])
+        # Merge overlapping text regions within the same line
+        merged_spans = merge_overlapping_spans(line_bbox_list)
+        # Convert the merged text regions back to point format and add them to the new detection box list
+        for span in merged_spans:
+            new_dt_boxes.append(bbox_to_points(span))
+    return new_dt_boxes
 class ModifiedPaddleOCR(PaddleOCR):
     def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
         """
@@ -265,6 +346,9 @@ class ModifiedPaddleOCR(PaddleOCR):
         img_crop_list = []
         dt_boxes = sorted_boxes(dt_boxes)
+        dt_boxes = merge_det_boxes(dt_boxes)
         if mfd_res:
             bef = time.time()
             dt_boxes = update_det_boxes(dt_boxes, mfd_res)

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py CHANGED Viewed

@@ -12,7 +12,6 @@ class StructTableModel:
             self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
     def image2latex(self, image) -> str:
-        #
         table_latex = self.model.forward(image)
         return table_latex

magic_pdf/model/ppTableModel.py ADDED Viewed

@@ -0,0 +1,67 @@
+from paddleocr.ppstructure.table.predict_table import TableSystem
+from paddleocr.ppstructure.utility import init_args
+from magic_pdf.libs.Constants import *
+import os
+from PIL import Image
+import numpy as np
+class ppTableModel(object):
+    """
+        This class is responsible for converting image of table into HTML format using a pre-trained model.
+        Attributes:
+        - table_sys: An instance of TableSystem initialized with parsed arguments.
+        Methods:
+        - __init__(config): Initializes the model with configuration parameters.
+        - img2html(image): Converts a PIL Image or NumPy array to HTML string.
+        - parse_args(**kwargs): Parses configuration arguments.
+    """
+    def __init__(self, config):
+        """
+        Parameters:
+        - config (dict): Configuration dictionary containing model_dir and device.
+        """
+        args = self.parse_args(**config)
+        self.table_sys = TableSystem(args)
+    def img2html(self, image):
+        """
+        Parameters:
+        - image (PIL.Image or np.ndarray): The image of the table to be converted.
+        Return:
+        - HTML (str): A string representing the HTML structure with content of the table.
+        """
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+        pred_res, _ = self.table_sys(image)
+        pred_html = pred_res["html"]
+        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
+                                                                                               "") + "</table></td>\n"
+        return res
+    def parse_args(self, **kwargs):
+        parser = init_args()
+        model_dir = kwargs.get("model_dir")
+        table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
+        table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
+        det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
+        rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
+        rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
+        device = kwargs.get("device", "cpu")
+        use_gpu = True if device == "cuda" else False
+        config = {
+            "use_gpu": use_gpu,
+            "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
+            "table_algorithm": TABLE_MASTER,
+            "table_model_dir": table_model_dir,
+            "table_char_dict_path": table_char_dict_path,
+            "det_model_dir": det_model_dir,
+            "rec_model_dir": rec_model_dir,
+            "rec_char_dict_path": rec_char_dict_path,
+        }
+        parser.set_defaults(**config)
+        return parser.parse_args([])

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl