magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. magic_pdf/dict2md/ocr_mkcontent.py +20 -7
  2. magic_pdf/libs/config_reader.py +28 -10
  3. magic_pdf/libs/language.py +12 -0
  4. magic_pdf/libs/version.py +1 -1
  5. magic_pdf/model/__init__.py +1 -1
  6. magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
  7. magic_pdf/model/magic_model.py +49 -41
  8. magic_pdf/model/pdf_extract_kit.py +155 -60
  9. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
  10. magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
  11. magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
  12. magic_pdf/model/pp_structure_v2.py +1 -1
  13. magic_pdf/pdf_parse_union_core.py +4 -2
  14. magic_pdf/pre_proc/citationmarker_remove.py +5 -1
  15. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
  16. magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
  17. magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
  18. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
  19. magic_pdf/resources/model_config/model_configs.yaml +4 -0
  20. magic_pdf/rw/AbsReaderWriter.py +1 -18
  21. magic_pdf/rw/DiskReaderWriter.py +32 -24
  22. magic_pdf/rw/S3ReaderWriter.py +83 -48
  23. magic_pdf/tools/cli.py +79 -0
  24. magic_pdf/tools/cli_dev.py +156 -0
  25. magic_pdf/tools/common.py +119 -0
  26. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
  27. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
  28. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
  29. magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
  30. magic_pdf/cli/magicpdf.py +0 -337
  31. magic_pdf/pdf_parse_for_train.py +0 -685
  32. magic_pdf/train_utils/convert_to_train_format.py +0 -65
  33. magic_pdf/train_utils/extract_caption.py +0 -59
  34. magic_pdf/train_utils/remove_footer_header.py +0 -159
  35. magic_pdf/train_utils/vis_utils.py +0 -327
  36. magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
  37. /magic_pdf/libs/{math.py → local_math.py} +0 -0
  38. /magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
  39. /magic_pdf/{train_utils → tools}/__init__.py +0 -0
  40. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
  41. {magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0
magic_pdf/model/pdf_extract_kit.py
@@ -1,14 +1,19 @@
 from loguru import logger
 import os
+import time
+
+
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable the albumentations update check
 try:
     import cv2
     import yaml
-    import time
     import argparse
     import numpy as np
     import torch
+    import torchtext
 
-    from paddleocr import draw_ocr
+    if torchtext.__version__ >= "0.18.0":
+        torchtext.disable_torchtext_deprecation_warning()
     from PIL import Image
     from torchvision import transforms
     from torch.utils.data import Dataset, DataLoader
@@ -17,13 +22,23 @@ try:
     import unimernet.tasks as tasks
     from unimernet.processors import load_processor
 
-    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
-    from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
-    from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
-except ImportError:
-    logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+except ImportError as e:
+    logger.exception(e)
+    logger.error(
+        'Required dependency not installed, please install by \n'
+        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
     exit(1)
 
+from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
+from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
+from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
+
+
+def table_model_init(model_path, max_time=400, _device_='cpu'):
+    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    return table_model
+
 
 def mfd_model_init(weight):
     mfd_model = YOLO(weight)
@@ -83,15 +98,17 @@ class CustomPEKModel:
         model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
         # build the full path to model_configs.yaml
         config_path = os.path.join(model_config_dir, 'model_configs.yaml')
-        with open(config_path, "r") as f:
+        with open(config_path, "r", encoding='utf-8') as f:
             self.configs = yaml.load(f, Loader=yaml.FullLoader)
         # initialize the parsing configuration
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
+        self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.apply_ocr = ocr
         logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr
+            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
+                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
             )
         )
         assert self.apply_layout, "DocAnalysis must contain layout model."
@@ -99,6 +116,7 @@ class CustomPEKModel:
         self.device = kwargs.get("device", self.configs["config"]["device"])
         logger.info("using device: {}".format(self.device))
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
+        logger.info("using models_dir: {}".format(models_dir))
 
         # initialize formula recognition
         if self.apply_formula:
@@ -121,6 +139,11 @@ class CustomPEKModel:
         if self.apply_ocr:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
 
+        # init structeqtable
+        if self.apply_table:
+            max_time = self.table_config.get("max_time", 400)
+            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
+                                                max_time=max_time, _device_=self.device)
         logger.info('DocAnalysis init done!')
 
     def __call__(self, image):
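With these init changes, table recognition stays off unless is_table_recog_enable is switched on in the table_config section of model_configs.yaml or passed in as a kwarg. A minimal sketch of enabling it from the caller side, assuming CustomPEKModel otherwise keeps the constructor arguments implied by the diff (ocr, show_log, and the kwargs read above); anything beyond those names is an assumption:

    # Sketch only: table_config keys and kwargs come from the diff above; the rest is assumed.
    model = CustomPEKModel(
        ocr=True,
        show_log=True,
        device='cpu',
        table_config={'is_table_recog_enable': True, 'max_time': 400},
    )
    layout_res = model(page_image)  # page_image: an np.ndarray of the rendered page
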
@@ -134,67 +157,139 @@ class CustomPEKModel:
         layout_cost = round(time.time() - layout_start, 2)
         logger.info(f"layout detection cost: {layout_cost}")
 
-        # formula detection
-        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
-        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
-            new_item = {
-                'category_id': 13 + int(cla.item()),
-                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
-                'score': round(float(conf.item()), 2),
-                'latex': '',
-            }
-            layout_res.append(new_item)
-            latex_filling_list.append(new_item)
-            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
-            mf_image_list.append(bbox_img)
-
-        # formula recognition
-        mfr_start = time.time()
-        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
-        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
-        mfr_res = []
-        for mf_img in dataloader:
-            mf_img = mf_img.to(self.device)
-            output = self.mfr_model.generate({'image': mf_img})
-            mfr_res.extend(output['pred_str'])
-        for res, latex in zip(latex_filling_list, mfr_res):
-            res['latex'] = latex_rm_whitespace(latex)
-        mfr_cost = round(time.time() - mfr_start, 2)
-        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+        if self.apply_formula:
+            # formula detection
+            mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+            for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
+                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+                new_item = {
+                    'category_id': 13 + int(cla.item()),
+                    'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                    'score': round(float(conf.item()), 2),
+                    'latex': '',
+                }
+                layout_res.append(new_item)
+                latex_filling_list.append(new_item)
+                bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+                mf_image_list.append(bbox_img)
+
+            # formula recognition
+            mfr_start = time.time()
+            dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
+            dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
+            mfr_res = []
+            for mf_img in dataloader:
+                mf_img = mf_img.to(self.device)
+                output = self.mfr_model.generate({'image': mf_img})
+                mfr_res.extend(output['pred_str'])
+            for res, latex in zip(latex_filling_list, mfr_res):
+                res['latex'] = latex_rm_whitespace(latex)
+            mfr_cost = round(time.time() - mfr_start, 2)
+            logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
 
         # ocr recognition
         if self.apply_ocr:
             ocr_start = time.time()
             pil_img = Image.fromarray(image)
+
+            # separate the regions that need OCR from the formula regions
+            ocr_res_list = []
             single_page_mfdetrec_res = []
             for res in layout_res:
                 if int(res['category_id']) in [13, 14]:
-                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
-                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                     single_page_mfdetrec_res.append({
-                        "bbox": [xmin, ymin, xmax, ymax],
+                        "bbox": [int(res['poly'][0]), int(res['poly'][1]),
+                                 int(res['poly'][4]), int(res['poly'][5])],
                     })
-            for res in layout_res:
-                if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # categories that need OCR
-                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
-                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
-                    crop_box = (xmin, ymin, xmax, ymax)
-                    cropped_img = Image.new('RGB', pil_img.size, 'white')
-                    cropped_img.paste(pil_img.crop(crop_box), crop_box)
-                    cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
-                    ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
-                    if ocr_res:
-                        for box_ocr_res in ocr_res:
-                            p1, p2, p3, p4 = box_ocr_res[0]
-                            text, score = box_ocr_res[1]
-                            layout_res.append({
-                                'category_id': 15,
-                                'poly': p1 + p2 + p3 + p4,
-                                'score': round(score, 2),
-                                'text': text,
-                            })
+                elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
+                    ocr_res_list.append(res)
+
+            # process every region that needs OCR
+            for res in ocr_res_list:
+                xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
+                xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
+
+                paste_x = 50
+                paste_y = 50
+                # create a white background 50 px larger on each side
+                new_width = xmax - xmin + paste_x * 2
+                new_height = ymax - ymin + paste_y * 2
+                new_image = Image.new('RGB', (new_width, new_height), 'white')
+
+                # crop the image
+                crop_box = (xmin, ymin, xmax, ymax)
+                cropped_img = pil_img.crop(crop_box)
+                new_image.paste(cropped_img, (paste_x, paste_y))
+
+                # adjust the formula region coordinates
+                adjusted_mfdetrec_res = []
+                for mf_res in single_page_mfdetrec_res:
+                    mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]
+                    # convert the formula coordinates to be relative to the cropped region
+                    x0 = mf_xmin - xmin + paste_x
+                    y0 = mf_ymin - ymin + paste_y
+                    x1 = mf_xmax - xmin + paste_x
+                    y1 = mf_ymax - ymin + paste_y
+                    # skip formula blocks that fall outside the cropped image
+                    if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]):
+                        continue
+                    else:
+                        adjusted_mfdetrec_res.append({
+                            "bbox": [x0, y0, x1, y1],
+                        })
+
+                # OCR recognition
+                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
+                ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
+
+                # merge the results
+                if ocr_res:
+                    for box_ocr_res in ocr_res:
+                        p1, p2, p3, p4 = box_ocr_res[0]
+                        text, score = box_ocr_res[1]
+
+                        # map the coordinates back to the original image
+                        p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin]
+                        p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin]
+                        p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin]
+                        p4 = [p4[0] - paste_x + xmin, p4[1] - paste_y + ymin]
+
+                        layout_res.append({
+                            'category_id': 15,
+                            'poly': p1 + p2 + p3 + p4,
+                            'score': round(score, 2),
+                            'text': text,
+                        })
+
             ocr_cost = round(time.time() - ocr_start, 2)
             logger.info(f"ocr cost: {ocr_cost}")
 
+        # table recognition
+        if self.apply_table:
+            pil_img = Image.fromarray(image)
+            for layout in layout_res:
+                if layout.get("category_id", -1) == 5:
+                    poly = layout["poly"]
+                    xmin, ymin = int(poly[0]), int(poly[1])
+                    xmax, ymax = int(poly[4]), int(poly[5])
+
+                    paste_x = 50
+                    paste_y = 50
+                    # create a white background 50 px larger on each side
+                    new_width = xmax - xmin + paste_x * 2
+                    new_height = ymax - ymin + paste_y * 2
+                    new_image = Image.new('RGB', (new_width, new_height), 'white')
+
+                    # crop the image
+                    crop_box = (xmin, ymin, xmax, ymax)
+                    cropped_img = pil_img.crop(crop_box)
+                    new_image.paste(cropped_img, (paste_x, paste_y))
+                    start_time = time.time()
+                    logger.info("------------------table recognition processing begins-----------------")
+                    latex_code = self.table_model.image2latex(new_image)[0]
+                    end_time = time.time()
+                    run_time = end_time - start_time
+                    logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                    layout["latex"] = latex_code
+
         return layout_res
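Both the new OCR path and the table path above paste each crop onto a white canvas with a 50 px margin on every side, so detected boxes come back in the padded crop's coordinate system and have to be shifted back onto the page. A quick sketch of that round trip with made-up numbers:

    # Hypothetical layout box on the original page
    xmin, ymin = 120, 300
    paste_x = paste_y = 50            # white margin added around the crop
    # a point detected at (60, 75) inside the padded crop maps back to the page as
    page_x = 60 - paste_x + xmin      # 130
    page_y = 75 - paste_y + ymin      # 325
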
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py
@@ -79,12 +79,13 @@ def setup(args, device):
     cfg.freeze()
     default_setup(cfg, args)
 
-    register_coco_instances(
-        "scihub_train",
-        {},
-        cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
-        cfg.SCIHUB_DATA_DIR_TRAIN
-    )
+    # @todo can this block be removed?
+    # register_coco_instances(
+    #     "scihub_train",
+    #     {},
+    #     cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
+    #     cfg.SCIHUB_DATA_DIR_TRAIN
+    # )
 
     return cfg
 
magic_pdf/model/pek_sub_modules/self_modify.py
@@ -10,12 +10,17 @@ from paddleocr import PaddleOCR
 from paddleocr.ppocr.utils.logging import get_logger
 from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img
 from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
+
+from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
+
 logger = get_logger()
 
+
 def img_decode(content: bytes):
     np_arr = np.frombuffer(content, dtype=np.uint8)
     return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
 
+
 def check_img(img):
     if isinstance(img, bytes):
         img = img_decode(img)
@@ -51,6 +56,7 @@ def check_img(img):
 
     return img
 
+
 def sorted_boxes(dt_boxes):
     """
     Sort text boxes in order from top to bottom, left to right
@@ -75,49 +81,87 @@ def sorted_boxes(dt_boxes):
     return _boxes
 
 
-def formula_in_text(mf_bbox, text_bbox):
-    x1, y1, x2, y2 = mf_bbox
-    x3, y3 = text_bbox[0]
-    x4, y4 = text_bbox[2]
-    left_box, right_box = None, None
-    same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2
-    if not same_line:
-        return False, left_box, right_box
-    else:
-        drop_origin = False
-        left_x = x1 - 1
-        right_x = x2 + 1
-        if x3 < x1 and x2 < x4:
-            drop_origin = True
-            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
-            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
-        if x3 < x1 and x1 <= x4 <= x2:
-            drop_origin = True
-            left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32')
-        if x1 <= x3 <= x2 and x2 < x4:
-            drop_origin = True
-            right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32')
-        if x1 <= x3 < x4 <= x2:
-            drop_origin = True
-        return drop_origin, left_box, right_box
-
-
-def update_det_boxes(dt_boxes, mfdetrec_res):
-    new_dt_boxes = dt_boxes
-    for mf_box in mfdetrec_res:
-        flag, left_box, right_box = False, None, None
-        for idx, text_box in enumerate(new_dt_boxes):
-            ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box)
-            if ret:
-                new_dt_boxes.pop(idx)
-                if left_box is not None:
-                    new_dt_boxes.append(left_box)
-                if right_box is not None:
-                    new_dt_boxes.append(right_box)
-                break
-
+def bbox_to_points(bbox):
+    """ convert a bbox into an array of its four corner points """
+    x0, y0, x1, y1 = bbox
+    return np.array([[x0, y0], [x1, y0], [x1, y1], [x0, y1]]).astype('float32')
+
+
+def points_to_bbox(points):
+    """ convert an array of four corner points into a bbox """
+    x0, y0 = points[0]
+    x1, _ = points[1]
+    _, y1 = points[2]
+    return [x0, y0, x1, y1]
+
+
+def merge_intervals(intervals):
+    # Sort the intervals based on the start value
+    intervals.sort(key=lambda x: x[0])
+
+    merged = []
+    for interval in intervals:
+        # If the list of merged intervals is empty or if the current
+        # interval does not overlap with the previous, simply append it.
+        if not merged or merged[-1][1] < interval[0]:
+            merged.append(interval)
+        else:
+            # Otherwise, there is overlap, so we merge the current and previous intervals.
+            merged[-1][1] = max(merged[-1][1], interval[1])
+
+    return merged
+
+
+def remove_intervals(original, masks):
+    # Merge all mask intervals
+    merged_masks = merge_intervals(masks)
+
+    result = []
+    original_start, original_end = original
+
+    for mask in merged_masks:
+        mask_start, mask_end = mask
+
+        # If the mask starts after the original range, ignore it
+        if mask_start > original_end:
+            continue
+
+        # If the mask ends before the original range starts, ignore it
+        if mask_end < original_start:
+            continue
+
+        # Remove the masked part from the original range
+        if original_start < mask_start:
+            result.append([original_start, mask_start - 1])
+
+        original_start = max(mask_end + 1, original_start)
+
+    # Add the remaining part of the original range, if any
+    if original_start <= original_end:
+        result.append([original_start, original_end])
+
+    return result
+
+
+def update_det_boxes(dt_boxes, mfd_res):
+    new_dt_boxes = []
+    for text_box in dt_boxes:
+        text_bbox = points_to_bbox(text_box)
+        masks_list = []
+        for mf_box in mfd_res:
+            mf_bbox = mf_box['bbox']
+            if __is_overlaps_y_exceeds_threshold(text_bbox, mf_bbox):
+                masks_list.append([mf_bbox[0], mf_bbox[2]])
+        text_x_range = [text_bbox[0], text_bbox[2]]
+        text_remove_mask_range = remove_intervals(text_x_range, masks_list)
+        temp_dt_box = []
+        for text_remove_mask in text_remove_mask_range:
+            temp_dt_box.append(bbox_to_points([text_remove_mask[0], text_bbox[1], text_remove_mask[1], text_bbox[3]]))
+        if len(temp_dt_box) > 0:
+            new_dt_boxes.extend(temp_dt_box)
     return new_dt_boxes
 
+
 class ModifiedPaddleOCR(PaddleOCR):
     def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
         """
@@ -197,7 +241,7 @@ class ModifiedPaddleOCR(PaddleOCR):
             if not rec:
                 return cls_res
             return ocr_res
-    
+
     def __call__(self, img, cls=True, mfd_res=None):
         time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
 
@@ -226,7 +270,7 @@ class ModifiedPaddleOCR(PaddleOCR):
             dt_boxes = update_det_boxes(dt_boxes, mfd_res)
             aft = time.time()
             logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
-                len(dt_boxes), aft-bef))
+                len(dt_boxes), aft - bef))
 
         for bno in range(len(dt_boxes)):
             tmp_box = copy.deepcopy(dt_boxes[bno])
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py
@@ -0,0 +1,22 @@
+from struct_eqtable.model import StructTable
+from pypandoc import convert_text
+class StructTableModel:
+    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
+        # init
+        self.model_path = model_path
+        self.max_new_tokens = max_new_tokens  # maximum output tokens length
+        self.max_time = max_time  # timeout for processing in seconds
+        if device == 'cuda':
+            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
+        else:
+            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
+
+    def image2latex(self, image) -> str:
+        table_latex = self.model.forward(image)
+        return table_latex
+
+    def image2html(self, image) -> str:
+        table_latex = self.image2latex(image)
+        table_html = convert_text(table_latex, 'html', format='latex')
+        return table_html
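This wrapper is what pdf_extract_kit.py calls for table regions; image2latex() returns a list (hence the [0] indexing in the table-recognition loop), and image2html() additionally needs a pandoc binary available for pypandoc. A hedged usage sketch with a hypothetical weights path and image:

    from PIL import Image
    from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel

    table_img = Image.open('table_crop.png')                 # hypothetical crop of a table region
    model = StructTableModel('path/to/StructEqTable-weights', device='cpu')  # hypothetical path
    latex = model.image2latex(table_img)[0]                  # first candidate, as used in pdf_extract_kit.py
    html = model.image2html(table_img)                       # LaTeX -> HTML via pypandoc
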
magic_pdf/model/pp_structure_v2.py
@@ -5,7 +5,7 @@ from loguru import logger
 try:
     from paddleocr import PPStructure
 except ImportError:
-    logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"')
+    logger.error('paddleocr not installed, please install by "pip install magic-pdf[lite]"')
     exit(1)
 
 
magic_pdf/pdf_parse_union_core.py
@@ -7,7 +7,7 @@ from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_c
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.hash_utils import compute_md5
-from magic_pdf.libs.math import float_equal
+from magic_pdf.libs.local_math import float_equal
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.para.para_split_v2 import para_split
@@ -111,7 +111,8 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter)
 
     '''gather the bboxes of all blocks together'''
-    # @todo the interline_equation_blocks parameter is not accurate enough, switch to interline_equations later
+    # the interline_equation_blocks parameter is not accurate enough, switch to interline_equations later
+    interline_equation_blocks = []
     if len(interline_equation_blocks) > 0:
         all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
@@ -120,6 +121,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
         all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split(
             img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks,
             interline_equations, page_w, page_h)
+
     if len(drop_reasons) > 0:
         need_drop = True
         drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION)
magic_pdf/pre_proc/citationmarker_remove.py
@@ -135,7 +135,11 @@ def remove_citation_marker(with_char_text_blcoks):
 
             if max_font_sz-span_font_sz<1:  # filter out body text by font size first; if it is body text, stop checking
                 continue
-
+
+            # guard against division by zero
+            if span_hi==0 or min_font_sz==0:
+                continue
+
             if (base_span_mid_y-span_mid_y)/span_hi>0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1):
                 """
                 1. if the preceding char is a period or comma, it is definitely a citation marker rather than a formula
magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -36,9 +36,12 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
     '''when any box overlaps a discarded box, trust the discarded box first'''
     all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
-    # @todo when interline_equation conflicts with a title or text box, handle two cases separately
+
+    # when interline_equation conflicts with a title or text box, handle two cases separately
     '''when the IoU of an interline_equation box and a text-type box is close to 1, trust the interline equation box'''
+    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
     '''when an interline_equation box is contained in a text-type box and is much smaller than it, trust the text box and drop the equation box'''
+    # removed later by the big-box-contains-small-box logic
 
     '''keep only discarded_blocks wider than 1/3 of the page width, taller than 10, and located in the bottom 50% of the page (footnotes only)'''
     for discarded in discarded_blocks:
@@ -57,6 +60,34 @@ def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_bloc
     return all_bboxes, all_discarded_blocks, drop_reasons
 
 
+def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes):
+    # first collect all text and interline-equation blocks
+    text_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.Text:
+            text_blocks.append(block)
+    interline_equation_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.InterlineEquation:
+            interline_equation_blocks.append(block)
+
+    need_remove = []
+
+    for interline_equation_block in interline_equation_blocks:
+        for text_block in text_blocks:
+            interline_equation_block_bbox = interline_equation_block[:4]
+            text_block_bbox = text_block[:4]
+            if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8:
+                if text_block not in need_remove:
+                    need_remove.append(text_block)
+
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)
+
+    return all_bboxes
+
+
 def fix_text_overlap_title_blocks(all_bboxes):
     # first collect all text and title blocks
    text_blocks = []
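The new helper trusts the interline-equation box whenever its IoU with a text block exceeds 0.8, and it collects the blocks to drop in need_remove first so the same text block is never removed twice while the nested loops are still running. A toy check of the threshold with made-up boxes:

    eq_box   = [100, 500, 400, 560]   # interline equation bbox (x0, y0, x1, y1)
    text_box = [100, 498, 400, 562]   # text block wrapping roughly the same area
    inter = (400 - 100) * (560 - 500)        # 18000 (eq_box lies inside text_box here)
    union = 300 * 60 + 300 * 64 - inter      # 19200
    print(inter / union)                     # 0.9375 > 0.8 -> the text block is dropped
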
@@ -68,12 +99,19 @@ def fix_text_overlap_title_blocks(all_bboxes):
68
99
  if block[7] == BlockType.Title:
69
100
  title_blocks.append(block)
70
101
 
102
+ need_remove = []
103
+
71
104
  for text_block in text_blocks:
72
105
  for title_block in title_blocks:
73
106
  text_block_bbox = text_block[:4]
74
107
  title_block_bbox = title_block[:4]
75
108
  if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
76
- all_bboxes.remove(title_block)
109
+ if title_block not in need_remove:
110
+ need_remove.append(title_block)
111
+
112
+ if len(need_remove) > 0:
113
+ for block in need_remove:
114
+ all_bboxes.remove(block)
77
115
 
78
116
  return all_bboxes
79
117
 
magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -5,19 +5,24 @@ from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, g
 from magic_pdf.libs.drop_tag import DropTag
 from magic_pdf.libs.ocr_content_type import ContentType, BlockType
 
+
 def remove_overlaps_low_confidence_spans(spans):
     dropped_spans = []
     # among overlapping spans, drop the ones with lower confidence
     for span1 in spans:
         for span2 in spans:
             if span1 != span2:
-                if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
-                    if span1['score'] < span2['score']:
-                        span_need_remove = span1
-                    else:
-                        span_need_remove = span2
-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
-                        dropped_spans.append(span_need_remove)
+                # neither span1 nor span2 should already be in dropped_spans
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    if calculate_iou(span1['bbox'], span2['bbox']) > 0.9:
+                        if span1['score'] < span2['score']:
+                            span_need_remove = span1
+                        else:
+                            span_need_remove = span2
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)
 
     if len(dropped_spans) > 0:
         for span_need_remove in dropped_spans: