PyPI - magic-pdf - Versions diffs - 0.7.0a1__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

magic-pdf 0.7.0a1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

magic_pdf/dict2md/ocr_mkcontent.py +4 -0
magic_pdf/libs/Constants.py +30 -1
magic_pdf/libs/draw_bbox.py +66 -1
magic_pdf/libs/ocr_content_type.py +14 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +3 -0
magic_pdf/model/pdf_extract_kit.py +94 -70
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
magic_pdf/model/ppTableModel.py +67 -0
magic_pdf/para/para_split_v2.py +50 -47
magic_pdf/resources/model_config/model_configs.yaml +3 -1
magic_pdf/tools/cli_dev.py +8 -9
magic_pdf/tools/common.py +4 -1
magic_pdf-0.7.1.dist-info/METADATA +417 -0
{magic_pdf-0.7.0a1.dist-info → magic_pdf-0.7.1.dist-info}/RECORD +20 -19
magic_pdf-0.7.0a1.dist-info/METADATA +0 -362
{magic_pdf-0.7.0a1.dist-info → magic_pdf-0.7.1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.7.0a1.dist-info → magic_pdf-0.7.1.dist-info}/WHEEL +0 -0
{magic_pdf-0.7.0a1.dist-info → magic_pdf-0.7.1.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.7.0a1.dist-info → magic_pdf-0.7.1.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -132,6 +132,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                                     # if processed by table model
                                     if span.get('latex', ''):
                                         para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get('html', ''):
+                                        para_text += f"\n\n{span['html']}\n\n"
                                     else:
                                         para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
@@ -256,6 +258,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
             if block['type'] == BlockType.TableBody:
                 if block["lines"][0]["spans"][0].get('latex', ''):
                     para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
+                elif block["lines"][0]["spans"][0].get('html', ''):
+                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)

magic_pdf/libs/Constants.py CHANGED Viewed

@@ -8,4 +8,33 @@ CROSS_PAGE = "cross_page"
 block维度自定义字段
 """
 # block中lines是否被删除
-LINES_DELETED = "lines_deleted"
+LINES_DELETED = "lines_deleted"
+# struct eqtable
+STRUCT_EQTABLE = "struct_eqtable"
+# table recognition max time default value
+TABLE_MAX_TIME_VALUE = 400
+# pp_table_result_max_length
+TABLE_MAX_LEN = 480
+# pp table structure algorithm
+TABLE_MASTER = "TableMaster"
+# table master structure dict
+TABLE_MASTER_DICT = "table_master_structure_dict.txt"
+# table master dir
+TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
+# pp detect model dir
+DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
+# pp rec model dir
+REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
+# pp rec char dict path
+REC_CHAR_DICT = "ppocr_keys_v1.txt"

magic_pdf/libs/draw_bbox.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from magic_pdf.libs.Constants import CROSS_PAGE
 from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId
+from magic_pdf.model.magic_model import MagicModel
 def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
@@ -225,3 +226,67 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
     # Save the PDF
     pdf_docs.save(f"{out_path}/spans.pdf")
+def drow_model_bbox(model_list: list, pdf_bytes, out_path):
+    dropped_bbox_list = []
+    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
+    imgs_body_list, imgs_caption_list = [], []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    magic_model = MagicModel(model_list, pdf_docs)
+    for i in range(len(model_list)):
+        page_dropped_list = []
+        tables_body, tables_caption, tables_footnote = [], [], []
+        imgs_body, imgs_caption = [], []
+        titles = []
+        texts = []
+        interequations = []
+        page_info = magic_model.get_model_list(i)
+        layout_dets = page_info["layout_dets"]
+        for layout_det in layout_dets:
+            bbox = layout_det["bbox"]
+            if layout_det["category_id"] == CategoryId.Text:
+                texts.append(bbox)
+            elif layout_det["category_id"] == CategoryId.Title:
+                titles.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableBody:
+                tables_body.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableCaption:
+                tables_caption.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableFootnote:
+                tables_footnote.append(bbox)
+            elif layout_det["category_id"] == CategoryId.ImageBody:
+                imgs_body.append(bbox)
+            elif layout_det["category_id"] == CategoryId.ImageCaption:
+                imgs_caption.append(bbox)
+            elif layout_det["category_id"] == CategoryId.InterlineEquation_YOLO:
+                interequations.append(bbox)
+            elif layout_det["category_id"] == CategoryId.Abandon:
+                page_dropped_list.append(bbox)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+        dropped_bbox_list.append(page_dropped_list)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True) # color !
+        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
+        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
+    # Save the PDF
+    pdf_docs.save(f"{out_path}/model.pdf")

magic_pdf/libs/ocr_content_type.py CHANGED Viewed

@@ -19,3 +19,17 @@ class BlockType:
     Footnote = "footnote"
     Discarded = "discarded"
+class CategoryId:
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.7.0a1"
1	+ __version__ = "0.7.1"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -37,8 +37,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
             mat = fitz.Matrix(dpi / 72, dpi / 72)
             pm = page.get_pixmap(matrix=mat, alpha=False)
-            # if width or height > 3000 pixels, don't enlarge the image
-            if pm.width > 3000 or pm.height > 3000:
+            # If the width or height exceeds 9000 after scaling, do not scale further.
+            if pm.width > 9000 or pm.height > 9000:
                 pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
             img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -562,8 +562,11 @@ class MagicModel:
                 elif category_id == 5:
                     # 获取table模型结果
                     latex = layout_det.get("latex", None)
+                    html = layout_det.get("html", None)
                     if latex:
                         span["latex"] = latex
+                    elif html:
+                        span["html"] = html
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -2,6 +2,7 @@ from loguru import logger
 import os
 import time
+from magic_pdf.libs.Constants import *
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
@@ -26,17 +27,25 @@ except ImportError as e:
     logger.exception(e)
     logger.error(
         'Required dependency not installed, please install by \n'
-        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+        '"pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/"')
     exit(1)
 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
-def table_model_init(model_path, max_time=400, _device_='cpu'):
-    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+from magic_pdf.model.ppTableModel import ppTableModel
+def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
+    if table_model_type == STRUCT_EQTABLE:
+        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    else:
+        config = {
+            "model_dir": model_path,
+            "device": _device_
+        }
+        table_model = ppTableModel(config)
     return table_model
@@ -103,8 +112,11 @@ class CustomPEKModel:
         # 初始化解析配置
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        # table config
         self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
         self.apply_table = self.table_config.get("is_table_recog_enable", False)
+        self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
+        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
         self.apply_ocr = ocr
         logger.info(
             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
@@ -139,11 +151,11 @@ class CustomPEKModel:
         if self.apply_ocr:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
-        # init structeqtable
+        # init table model
         if self.apply_table:
-            max_time = self.table_config.get("max_time", 400)
-            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
-                                                max_time=max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_type]
+            self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
+                                                max_time=self.table_max_time, _device_=self.device)
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
@@ -187,50 +199,56 @@ class CustomPEKModel:
             mfr_cost = round(time.time() - mfr_start, 2)
             logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+        # Select regions for OCR / formula regions / table regions
+        ocr_res_list = []
+        table_res_list = []
+        single_page_mfdetrec_res = []
+        for res in layout_res:
+            if int(res['category_id']) in [13, 14]:
+                single_page_mfdetrec_res.append({
+                    "bbox": [int(res['poly'][0]), int(res['poly'][1]),
+                             int(res['poly'][4]), int(res['poly'][5])],
+                })
+            elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
+                ocr_res_list.append(res)
+            elif int(res['category_id']) in [5]:
+                table_res_list.append(res)
+        #  Unified crop img logic
+        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
+            crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
+            # Create a white background with an additional width and height of 50
+            crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
+            crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
+            return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
+            # Crop image
+            crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
+            cropped_img = input_pil_img.crop(crop_box)
+            return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
+            return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+            return return_image, return_list
+        pil_img = Image.fromarray(image)
         # ocr识别
         if self.apply_ocr:
             ocr_start = time.time()
-            pil_img = Image.fromarray(image)
-            # 筛选出需要OCR的区域和公式区域
-            ocr_res_list = []
-            single_page_mfdetrec_res = []
-            for res in layout_res:
-                if int(res['category_id']) in [13, 14]:
-                    single_page_mfdetrec_res.append({
-                        "bbox": [int(res['poly'][0]), int(res['poly'][1]),
-                                 int(res['poly'][4]), int(res['poly'][5])],
-                    })
-                elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
-                    ocr_res_list.append(res)
-            # 对每一个需OCR处理的区域进行处理
+            # Process each area that requires OCR processing
             for res in ocr_res_list:
-                xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
-                xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
-                paste_x = 50
-                paste_y = 50
-                # 创建一个宽高各多50的白色背景
-                new_width = xmax - xmin + paste_x * 2
-                new_height = ymax - ymin + paste_y * 2
-                new_image = Image.new('RGB', (new_width, new_height), 'white')
-                # 裁剪图像
-                crop_box = (xmin, ymin, xmax, ymax)
-                cropped_img = pil_img.crop(crop_box)
-                new_image.paste(cropped_img, (paste_x, paste_y))
-                # 调整公式区域坐标
+                new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
+                paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
+                # Adjust the coordinates of the formula area
                 adjusted_mfdetrec_res = []
                 for mf_res in single_page_mfdetrec_res:
                     mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]
-                    # 将公式区域坐标调整为相对于裁剪区域的坐标
+                    # Adjust the coordinates of the formula area to the coordinates relative to the cropping area
                     x0 = mf_xmin - xmin + paste_x
                     y0 = mf_ymin - ymin + paste_y
                     x1 = mf_xmax - xmin + paste_x
                     y1 = mf_ymax - ymin + paste_y
-                    # 过滤在图外的公式块
+                    # Filter formula blocks outside the graph
                     if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]):
                         continue
                     else:
@@ -238,17 +256,17 @@ class CustomPEKModel:
                             "bbox": [x0, y0, x1, y1],
                         })
-                # OCR识别
+                # OCR recognition
                 new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
                 ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
-                # 整合结果
+                # Integration results
                 if ocr_res:
                     for box_ocr_res in ocr_res:
                         p1, p2, p3, p4 = box_ocr_res[0]
                         text, score = box_ocr_res[1]
-                        # 将坐标转换回原图坐标系
+                        # Convert the coordinates back to the original coordinate system
                         p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin]
                         p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin]
                         p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin]
@@ -266,30 +284,36 @@ class CustomPEKModel:
         # 表格识别 table recognition
         if self.apply_table:
-            pil_img = Image.fromarray(image)
-            for layout in layout_res:
-                if layout.get("category_id", -1) == 5:
-                    poly = layout["poly"]
-                    xmin, ymin = int(poly[0]), int(poly[1])
-                    xmax, ymax = int(poly[4]), int(poly[5])
-                    paste_x = 50
-                    paste_y = 50
-                    # 创建一个宽高各多50的白色背景 create a whiteboard with 50 larger width and length
-                    new_width = xmax - xmin + paste_x * 2
-                    new_height = ymax - ymin + paste_y * 2
-                    new_image = Image.new('RGB', (new_width, new_height), 'white')
-                    # 裁剪图像 crop image
-                    crop_box = (xmin, ymin, xmax, ymax)
-                    cropped_img = pil_img.crop(crop_box)
-                    new_image.paste(cropped_img, (paste_x, paste_y))
-                    start_time = time.time()
-                    logger.info("------------------table recognition processing begins-----------------")
-                    latex_code = self.table_model.image2latex(new_image)[0]
-                    end_time = time.time()
-                    run_time = end_time - start_time
-                    logger.info(f"------------table recognition processing ends within {run_time}s-----")
-                    layout["latex"] = latex_code
+            table_start = time.time()
+            for res in table_res_list:
+                new_image, _ = crop_img(res, pil_img)
+                single_table_start_time = time.time()
+                logger.info("------------------table recognition processing begins-----------------")
+                latex_code = None
+                html_code = None
+                with torch.no_grad():
+                    if self.table_model_type == STRUCT_EQTABLE:
+                        latex_code = self.table_model.image2latex(new_image)[0]
+                    else:
+                        html_code = self.table_model.img2html(new_image)
+                run_time = time.time() - single_table_start_time
+                logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                if run_time > self.table_max_time:
+                    logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
+                # 判断是否返回正常
+                if latex_code:
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
+                        'end{table}')
+                    if expected_ending:
+                        res["latex"] = latex_code
+                    else:
+                        logger.warning(f"------------table recognition processing fails----------")
+                elif html_code:
+                    res["html"] = html_code
+                else:
+                    logger.warning(f"------------table recognition processing fails----------")
+            table_cost = round(time.time() - table_start, 2)
+            logger.info(f"table cost: {table_cost}")
         return layout_res

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py CHANGED Viewed

@@ -12,7 +12,6 @@ class StructTableModel:
             self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
     def image2latex(self, image) -> str:
-        #
         table_latex = self.model.forward(image)
         return table_latex

magic_pdf/model/ppTableModel.py ADDED Viewed

@@ -0,0 +1,67 @@
+from paddleocr.ppstructure.table.predict_table import TableSystem
+from paddleocr.ppstructure.utility import init_args
+from magic_pdf.libs.Constants import *
+import os
+from PIL import Image
+import numpy as np
+class ppTableModel(object):
+    """
+        This class is responsible for converting image of table into HTML format using a pre-trained model.
+        Attributes:
+        - table_sys: An instance of TableSystem initialized with parsed arguments.
+        Methods:
+        - __init__(config): Initializes the model with configuration parameters.
+        - img2html(image): Converts a PIL Image or NumPy array to HTML string.
+        - parse_args(**kwargs): Parses configuration arguments.
+    """
+    def __init__(self, config):
+        """
+        Parameters:
+        - config (dict): Configuration dictionary containing model_dir and device.
+        """
+        args = self.parse_args(**config)
+        self.table_sys = TableSystem(args)
+    def img2html(self, image):
+        """
+        Parameters:
+        - image (PIL.Image or np.ndarray): The image of the table to be converted.
+        Return:
+        - HTML (str): A string representing the HTML structure with content of the table.
+        """
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+        pred_res, _ = self.table_sys(image)
+        pred_html = pred_res["html"]
+        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
+                                                                                               "") + "</table></td>\n"
+        return res
+    def parse_args(self, **kwargs):
+        parser = init_args()
+        model_dir = kwargs.get("model_dir")
+        table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
+        table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
+        det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
+        rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
+        rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
+        device = kwargs.get("device", "cpu")
+        use_gpu = True if device == "cuda" else False
+        config = {
+            "use_gpu": use_gpu,
+            "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
+            "table_algorithm": TABLE_MASTER,
+            "table_model_dir": table_model_dir,
+            "table_char_dict_path": table_char_dict_path,
+            "det_model_dir": det_model_dir,
+            "rec_model_dir": rec_model_dir,
+            "rec_char_dict_path": rec_char_dict_path,
+        }
+        parser.set_defaults(**config)
+        return parser.parse_args([])

magic_pdf/para/para_split_v2.py CHANGED Viewed

@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
     if lang != 'en':
         return lines, None
-    else:
-        total_lines = len(lines)
-        line_fea_encode = []
-        """
-        对每一行进行特征编码，编码规则如下：
-        1. 如果行顶格，且大写字母开头或者数字开头，编码为1
-        2. 如果顶格，其他非大写开头编码为4
-        3. 如果非顶格，首字符大写，编码为2
-        4. 如果非顶格，首字符非大写编码为3
-        """
-        if len(lines) > 0:
-            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
-        for l in lines:
-            span_text = __get_span_text(l['spans'][0])
-            first_char = span_text[0]
-            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
-            if not layout:
-                line_fea_encode.append(0)
+    total_lines = len(lines)
+    line_fea_encode = []
+    """
+    对每一行进行特征编码，编码规则如下：
+    1. 如果行顶格，且大写字母开头或者数字开头，编码为1
+    2. 如果顶格，其他非大写开头编码为4
+    3. 如果非顶格，首字符大写，编码为2
+    4. 如果非顶格，首字符非大写编码为3
+    """
+    if len(lines) > 0:
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+    for l in lines:
+        span_text = __get_span_text(l['spans'][0])
+        if not span_text:
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        else:
+            #
+            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
+                if not first_char.isalnum() or if_match_reference_list(span_text):
+                    line_fea_encode.append(1)
+                else:
+                    line_fea_encode.append(4)
             else:
-                #
-                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
-                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum() or if_match_reference_list(span_text):
-                        line_fea_encode.append(1)
-                    else:
-                        line_fea_encode.append(4)
+                if first_char.isupper():
+                    line_fea_encode.append(2)
                 else:
-                    if first_char.isupper():
-                        line_fea_encode.append(2)
-                    else:
-                        line_fea_encode.append(3)
+                    line_fea_encode.append(3)
-        # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
+    # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
-        list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
-        if len(list_indice) > 0:
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+    # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
+    segments = []
+    for start, end in list_indice:
+        for i in range(start, end + 1):
+            if i > 0:
+                if line_fea_encode[i] == 4:
+                    if debug_able:
+                        logger.info(f"列表行的第{i}行不是顶格的")
+                    break
+        else:
             if debug_able:
-                logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
-        # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
-        segments = []
-        for start, end in list_indice:
-            for i in range(start, end + 1):
-                if i > 0:
-                    if line_fea_encode[i] == 4:
-                        if debug_able:
-                            logger.info(f"列表行的第{i}行不是顶格的")
-                        break
-            else:
-                if debug_able:
-                    logger.info(f"列表行的第{start}到第{end}行是列表")
+                logger.info(f"列表行的第{start}到第{end}行是列表")
-        return split_indices(total_lines, list_indice), list_start_idx
+    return split_indices(total_lines, list_indice), list_start_idx
 def cluster_line_x(lines: list) -> dict:

magic_pdf/resources/model_config/model_configs.yaml CHANGED Viewed

@@ -3,6 +3,7 @@ config:
   layout: True
   formula: True
   table_config:
+    model: TableMaster
     is_table_recog_enable: False
     max_time: 400
@@ -10,4 +11,5 @@ weights:
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
   mfr: MFR/UniMERNet
-  table: TabRec/StructEqTable
+  struct_eqtable: TabRec/StructEqTable
+  TableMaster: TabRec/TableMaster

magic-pdf 0.7.0a1__py3-none-any.whl → 0.7.1__py3-none-any.whl

magic-pdf 0.7.0a1__py3-none-any.whl → 0.7.1__py3-none-any.whl