magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. magic_pdf/dict2md/ocr_mkcontent.py +130 -76
  2. magic_pdf/integrations/__init__.py +0 -0
  3. magic_pdf/integrations/rag/__init__.py +0 -0
  4. magic_pdf/integrations/rag/api.py +82 -0
  5. magic_pdf/integrations/rag/type.py +82 -0
  6. magic_pdf/integrations/rag/utils.py +285 -0
  7. magic_pdf/layout/layout_sort.py +472 -283
  8. magic_pdf/libs/boxbase.py +188 -149
  9. magic_pdf/libs/draw_bbox.py +113 -87
  10. magic_pdf/libs/ocr_content_type.py +21 -18
  11. magic_pdf/libs/version.py +1 -1
  12. magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
  13. magic_pdf/model/magic_model.py +283 -166
  14. magic_pdf/model/model_list.py +8 -0
  15. magic_pdf/model/pdf_extract_kit.py +105 -15
  16. magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
  17. magic_pdf/para/para_split_v2.py +26 -27
  18. magic_pdf/pdf_parse_union_core.py +34 -6
  19. magic_pdf/pipe/AbsPipe.py +4 -1
  20. magic_pdf/pipe/OCRPipe.py +7 -4
  21. magic_pdf/pipe/TXTPipe.py +7 -4
  22. magic_pdf/pipe/UNIPipe.py +11 -6
  23. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
  24. magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
  25. magic_pdf/tools/cli.py +56 -29
  26. magic_pdf/tools/cli_dev.py +61 -64
  27. magic_pdf/tools/common.py +57 -37
  28. magic_pdf/user_api.py +17 -9
  29. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
  30. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
  31. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
  32. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
  33. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
  34. {magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0
magic_pdf/model/model_list.py CHANGED
@@ -1,3 +1,11 @@
 class MODEL:
     Paddle = "pp_structure_v2"
     PEK = "pdf_extract_kit"
+
+
+class AtomicModel:
+    Layout = "layout"
+    MFD = "mfd"
+    MFR = "mfr"
+    OCR = "ocr"
+    Table = "table"
magic_pdf/model/pdf_extract_kit.py CHANGED
@@ -3,6 +3,7 @@ import os
 import time
 
 from magic_pdf.libs.Constants import *
+from magic_pdf.model.model_list import AtomicModel
 
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # disable albumentations update check
 try:
@@ -64,7 +65,8 @@ def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
     model = task.build_model(cfg)
     model = model.to(_device_)
     vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
-    return model, vis_processor
+    mfr_transform = transforms.Compose([vis_processor, ])
+    return [model, mfr_transform]
 
 
 def layout_model_init(weight, config_file, device):
@@ -72,6 +74,11 @@ def layout_model_init(weight, config_file, device):
     return model
 
 
+def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3):
+    model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=det_db_box_thresh)
+    return model
+
+
 class MathDataset(Dataset):
     def __init__(self, image_paths, transform=None):
         self.image_paths = image_paths
@@ -91,6 +98,58 @@ class MathDataset(Dataset):
         return image
 
 
+class AtomModelSingleton:
+    _instance = None
+    _models = {}
+
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def get_atom_model(self, atom_model_name: str, **kwargs):
+        if atom_model_name not in self._models:
+            self._models[atom_model_name] = atom_model_init(model_name=atom_model_name, **kwargs)
+        return self._models[atom_model_name]
+
+
+def atom_model_init(model_name: str, **kwargs):
+
+    if model_name == AtomicModel.Layout:
+        atom_model = layout_model_init(
+            kwargs.get("layout_weights"),
+            kwargs.get("layout_config_file"),
+            kwargs.get("device")
+        )
+    elif model_name == AtomicModel.MFD:
+        atom_model = mfd_model_init(
+            kwargs.get("mfd_weights")
+        )
+    elif model_name == AtomicModel.MFR:
+        atom_model = mfr_model_init(
+            kwargs.get("mfr_weight_dir"),
+            kwargs.get("mfr_cfg_path"),
+            kwargs.get("device")
+        )
+    elif model_name == AtomicModel.OCR:
+        atom_model = ocr_model_init(
+            kwargs.get("ocr_show_log"),
+            kwargs.get("det_db_box_thresh")
+        )
+    elif model_name == AtomicModel.Table:
+        atom_model = table_model_init(
+            kwargs.get("table_model_type"),
+            kwargs.get("table_model_path"),
+            kwargs.get("table_max_time"),
+            kwargs.get("device")
+        )
+    else:
+        logger.error("model name not allow")
+        exit(1)
+
+    return atom_model
+
+
 class CustomPEKModel:
 
     def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
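Aside: AtomModelSingleton above gives each atom model name exactly one cached instance, so any later get_atom_model call with the same name returns the already-loaded model instead of re-initializing it. A minimal standalone sketch of the same pattern (all names below are illustrative, not part of the package):

class ModelCacheSingleton:
    # One shared manager instance, plus one cached object per model name.
    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, name: str, **kwargs):
        if name not in self._models:
            # First request for this name: build and cache.
            self._models[name] = load_model(name, **kwargs)
        return self._models[name]

def load_model(name: str, **kwargs):
    # Stand-in for an expensive model load such as atom_model_init.
    return {"name": name, **kwargs}

a = ModelCacheSingleton().get_model("ocr", det_db_box_thresh=0.3)
b = ModelCacheSingleton().get_model("ocr")
assert a is b  # loaded once, shared everywhere

Note the cache key is only the model name, so kwargs passed on a later call for an already-cached name are ignored; get_atom_model in the diff behaves the same way.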
@@ -130,32 +189,62 @@ class CustomPEKModel:
         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
         logger.info("using models_dir: {}".format(models_dir))
 
+        atom_model_manager = AtomModelSingleton()
+
         # init formula recognition
         if self.apply_formula:
             # init formula detection model
-            self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
-
+            # self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
+            self.mfd_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.MFD,
+                mfd_weights=str(os.path.join(models_dir, self.configs["weights"]["mfd"]))
+            )
             # init formula parsing model
             mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"]["mfr"]))
             mfr_cfg_path = str(os.path.join(model_config_dir, "UniMERNet", "demo.yaml"))
-            self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
-            self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
+            # self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
+            # self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
+            self.mfr_model, self.mfr_transform = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.MFR,
+                mfr_weight_dir=mfr_weight_dir,
+                mfr_cfg_path=mfr_cfg_path,
+                device=self.device
+            )
 
         # init layout model
-        self.layout_model = Layoutlmv3_Predictor(
-            str(os.path.join(models_dir, self.configs['weights']['layout'])),
-            str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+        # self.layout_model = Layoutlmv3_Predictor(
+        #     str(os.path.join(models_dir, self.configs['weights']['layout'])),
+        #     str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+        #     device=self.device
+        # )
+        self.layout_model = atom_model_manager.get_atom_model(
+            atom_model_name=AtomicModel.Layout,
+            layout_weights=str(os.path.join(models_dir, self.configs['weights']['layout'])),
+            layout_config_file=str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
             device=self.device
         )
         # init ocr
         if self.apply_ocr:
-            self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
 
+            # self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
+            self.ocr_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.OCR,
+                ocr_show_log=show_log,
+                det_db_box_thresh=0.3
+            )
         # init table model
         if self.apply_table:
             table_model_dir = self.configs["weights"][self.table_model_type]
-            self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
-                                                max_time=self.table_max_time, _device_=self.device)
+            # self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
+            #                                     max_time=self.table_max_time, _device_=self.device)
+            self.table_model = atom_model_manager.get_atom_model(
+                atom_model_name=AtomicModel.Table,
+                table_model_type=self.table_model_type,
+                table_model_path=str(os.path.join(models_dir, table_model_dir)),
+                table_max_time=self.table_max_time,
+                device=self.device
+            )
+
         logger.info('DocAnalysis init done!')
 
     def __call__(self, image):
@@ -291,11 +380,12 @@ class CustomPEKModel:
                 logger.info("------------------table recognition processing begins-----------------")
                 latex_code = None
                 html_code = None
-                with torch.no_grad():
-                    if self.table_model_type == STRUCT_EQTABLE:
+                if self.table_model_type == STRUCT_EQTABLE:
+                    with torch.no_grad():
                         latex_code = self.table_model.image2latex(new_image)[0]
-                    else:
-                        html_code = self.table_model.img2html(new_image)
+                else:
+                    html_code = self.table_model.img2html(new_image)
+
                 run_time = time.time() - single_table_start_time
                 logger.info(f"------------table recognition processing ends within {run_time}s-----")
                 if run_time > self.table_max_time:
magic_pdf/model/pek_sub_modules/self_modify.py CHANGED
@@ -12,6 +12,7 @@ from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binari
 from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop
 
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
+from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
 
 logger = get_logger()
 
@@ -162,6 +163,86 @@ def update_det_boxes(dt_boxes, mfd_res):
     return new_dt_boxes
 
 
+def merge_overlapping_spans(spans):
+    """
+    Merges overlapping spans on the same line.
+
+    :param spans: A list of span coordinates [(x1, y1, x2, y2), ...]
+    :return: A list of merged spans
+    """
+    # Return an empty list if the input spans list is empty
+    if not spans:
+        return []
+
+    # Sort spans by their starting x-coordinate
+    spans.sort(key=lambda x: x[0])
+
+    # Initialize the list of merged spans
+    merged = []
+    for span in spans:
+        # Unpack span coordinates
+        x1, y1, x2, y2 = span
+        # If the merged list is empty or there's no horizontal overlap, add the span directly
+        if not merged or merged[-1][2] < x1:
+            merged.append(span)
+        else:
+            # If there is horizontal overlap, merge the current span with the previous one
+            last_span = merged.pop()
+            # Update the merged span's top-left corner to the smaller (x1, y1) and bottom-right to the larger (x2, y2)
+            x1 = min(last_span[0], x1)
+            y1 = min(last_span[1], y1)
+            x2 = max(last_span[2], x2)
+            y2 = max(last_span[3], y2)
+            # Add the merged span back to the list
+            merged.append((x1, y1, x2, y2))
+
+    # Return the list of merged spans
+    return merged
+
+
+def merge_det_boxes(dt_boxes):
+    """
+    Merge detection boxes.
+
+    This function takes a list of detected bounding boxes, each represented by four corner points.
+    The goal is to merge these bounding boxes into larger text regions.
+
+    Parameters:
+    dt_boxes (list): A list containing multiple text detection boxes, where each box is defined by four corner points.
+
+    Returns:
+    list: A list containing the merged text regions, where each region is represented by four corner points.
+    """
+    # Convert the detection boxes into a dictionary format with bounding boxes and type
+    dt_boxes_dict_list = []
+    for text_box in dt_boxes:
+        text_bbox = points_to_bbox(text_box)
+        text_box_dict = {
+            'bbox': text_bbox,
+            'type': 'text',
+        }
+        dt_boxes_dict_list.append(text_box_dict)
+
+    # Merge adjacent text regions into lines
+    lines = merge_spans_to_line(dt_boxes_dict_list)
+
+    # Initialize a new list for storing the merged text regions
+    new_dt_boxes = []
+    for line in lines:
+        line_bbox_list = []
+        for span in line:
+            line_bbox_list.append(span['bbox'])
+
+        # Merge overlapping text regions within the same line
+        merged_spans = merge_overlapping_spans(line_bbox_list)
+
+        # Convert the merged text regions back to point format and add them to the new detection box list
+        for span in merged_spans:
+            new_dt_boxes.append(bbox_to_points(span))
+
+    return new_dt_boxes
+
+
 class ModifiedPaddleOCR(PaddleOCR):
     def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)):
         """
@@ -265,6 +346,9 @@ class ModifiedPaddleOCR(PaddleOCR):
         img_crop_list = []
 
         dt_boxes = sorted_boxes(dt_boxes)
+
+        dt_boxes = merge_det_boxes(dt_boxes)
+
         if mfd_res:
             bef = time.time()
             dt_boxes = update_det_boxes(dt_boxes, mfd_res)
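Because merge_overlapping_spans operates on plain (x1, y1, x2, y2) tuples, its behavior is easy to check in isolation. A small sketch with made-up coordinates:

# Hypothetical input: two horizontally overlapping spans and one separate span.
spans = [(10, 0, 50, 12), (45, 1, 90, 13), (120, 0, 150, 12)]
# The first two overlap (45 <= 50) and are merged into one box covering both;
# the third starts past x = 90, so it is kept as-is.
print(merge_overlapping_spans(spans))
# [(10, 0, 90, 13), (120, 0, 150, 12)]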
magic_pdf/para/para_split_v2.py CHANGED
@@ -1,3 +1,5 @@
+import copy
+
 from sklearn.cluster import DBSCAN
 import numpy as np
 from loguru import logger
@@ -167,7 +169,7 @@ def cluster_line_x(lines: list) -> dict:
     x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines])
     x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst)
     x0_uniq_label = np.unique(x0_clusters.labels_)
-    #x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
+    # x1_lst = np.array([[line['bbox'][2], 0] for line in lines])
     x0_2_new_val = {}  # maps each old x0 value to its new value
     min_x0 = round(lines[0]["bbox"][0])
     for label in x0_uniq_label:
@@ -200,7 +202,9 @@ def __valign_lines(blocks, layout_bboxes):
     min_distance = 3
     min_sample = 2
     new_layout_bboxes = []
-
+    # add bbox_fs for para split calculation
+    for block in blocks:
+        block["bbox_fs"] = copy.deepcopy(block["bbox"])
     for layout_box in layout_bboxes:
         blocks_in_layoutbox = [b for b in blocks if
                                b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
@@ -245,16 +249,15 @@
         # line lengths inside the block changed, so recompute the block bbox
         for block in blocks_in_layoutbox:
             if len(block["lines"]) > 0:
-                block['bbox'] = [min([line['bbox'][0] for line in block['lines']]),
-                                 min([line['bbox'][1] for line in block['lines']]),
-                                 max([line['bbox'][2] for line in block['lines']]),
-                                 max([line['bbox'][3] for line in block['lines']])]
-
+                block['bbox_fs'] = [min([line['bbox'][0] for line in block['lines']]),
+                                    min([line['bbox'][1] for line in block['lines']]),
+                                    max([line['bbox'][2] for line in block['lines']]),
+                                    max([line['bbox'][3] for line in block['lines']])]
         """recompute the layout bbox, since the block bboxes changed"""
-        layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox])
-        layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox])
-        layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox])
-        layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox])
+        layout_x0 = min([block['bbox_fs'][0] for block in blocks_in_layoutbox])
+        layout_y0 = min([block['bbox_fs'][1] for block in blocks_in_layoutbox])
+        layout_x1 = max([block['bbox_fs'][2] for block in blocks_in_layoutbox])
+        layout_y1 = max([block['bbox_fs'][3] for block in blocks_in_layoutbox])
         new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1])
 
     return new_layout_bboxes
@@ -312,7 +315,7 @@ def __group_line_by_layout(blocks, layout_bboxes):
     # currently each block is a single line, so one block is one paragraph
     blocks_group = []
    for lyout in layout_bboxes:
-        blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])]
+        blocks_in_layout = [block for block in blocks if is_in_layout(block.get('bbox_fs', None), lyout['layout_bbox'])]
         blocks_group.append(blocks_in_layout)
     return blocks_group
 
@@ -365,7 +368,8 @@ def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"):
         for i in range(0, len(list_start)):
             index = list_start[i] - 1
             if index >= 0:
-                if "content" in lines[index]["spans"][-1]:
+                if "content" in lines[index]["spans"][-1] and lines[index]["spans"][-1].get('type', '') not in [
+                        ContentType.InlineEquation, ContentType.InterlineEquation]:
                     lines[index]["spans"][-1]["content"] += '\n\n'
         layout_list_info = [False, False]  # whether this layout starts and/or ends with a list
         for content_type, start, end in text_segments:
  for content_type, start, end in text_segments:
@@ -477,7 +481,7 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
477
481
  break
478
482
  # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。
479
483
  if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1:
480
- #pre_page_paras[-1].append(may_list_lines)
484
+ # pre_page_paras[-1].append(may_list_lines)
481
485
  # 下一页合并到上一页最后一段,打一个cross_page的标签
482
486
  for line in may_list_lines:
483
487
  for span in line["spans"]:
@@ -537,7 +541,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
             next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']])
             next_first_line_type = next_first_line['spans'][0]['type']
             if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]:
-                #connected_layout_paras.append(layout_paras[i])
                 connected_layout_blocks.append(blocks_group[i])
                 continue
             pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)
@@ -552,10 +555,8 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
                     -1] not in LINE_STOP_FLAG and \
                     next_first_line['bbox'][0] == next_x0_min:  # the previous line fills the whole row without a stop mark, and the next line has no leading indent
                 """the paragraph-connection condition holds: join the previous layout's last paragraph with the next layout's first."""
-                #connected_layout_paras[-1][-1].extend(layout_paras[i][0])
                 connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"])
-                #layout_paras[i].pop(0) # drop the next layout's first paragraph; it was merged into the previous layout's last paragraph
-                blocks_group[i][0]["lines"] = []  #clear the lines of the next layout's first paragraph; they were merged into the previous layout's last paragraph
+                blocks_group[i][0]["lines"] = []  # clear the lines of the next layout's first paragraph; they were merged into the previous layout's last paragraph
                 blocks_group[i][0][LINES_DELETED] = True
                 # if len(layout_paras[i]) == 0:
                 #     layout_paras.pop(i)
@@ -564,7 +565,6 @@ def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox):
                 connected_layout_blocks.append(blocks_group[i])
             else:
                 """the connection condition fails: append the previous layout's paragraphs to the result as-is."""
-                #connected_layout_paras.append(layout_paras[i])
                 connected_layout_blocks.append(blocks_group[i])
     return connected_layout_blocks
 
@@ -622,7 +622,7 @@ def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
                 span[CROSS_PAGE] = True
         pre_last_para.extend(next_first_para)
 
-        #next_page_paras[0].pop(0) # drop the next page's first paragraph; it was merged into the previous page's last paragraph
+        # next_page_paras[0].pop(0)  # drop the next page's first paragraph; it was merged into the previous page's last paragraph
         next_page_paras[0][0]["lines"] = []
         next_page_paras[0][0][LINES_DELETED] = True
         return True
@@ -666,16 +666,15 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
         layout_box = new_layout_bbox[layout_i]
         single_line_paras_tag = []
         for i in range(len(layout_para)):
-            #single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
+            # single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT)
             single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1)
         """find runs of consecutive single-line texts; if their line heights match, merge them into one paragraph."""
         consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag)
         if len(consecutive_single_line_indices) > 0:
-            #index_offset = 0
             """check whether these lines have the same height and are centered"""
             for start, end in consecutive_single_line_indices:
-                #start += index_offset
-                #end += index_offset
+                # start += index_offset
+                # end += index_offset
                 line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in
                                     layout_para[start:end + 1]])
                 first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']])
@@ -700,9 +699,9 @@ def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang):
                 for i_para in range(start + 1, end + 1):
                     layout_para[i_para]["lines"] = []
                     layout_para[i_para][LINES_DELETED] = True
-                #layout_para[start:end + 1] = [merge_para]
+                # layout_para[start:end + 1] = [merge_para]
 
-                #index_offset -= end - start
+                # index_offset -= end - start
 
     return
 
@@ -742,7 +741,7 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
     new_layout_of_pages = []  # list of lists: one layout list per page
     all_page_list_info = []  # whether each page starts and/or ends with a list
     for page_num, page in pdf_info_dict.items():
-        blocks = page['preproc_blocks']
+        blocks = copy.deepcopy(page['preproc_blocks'])
         layout_bboxes = page['layout_bboxes']
         new_layout_bbox = __common_pre_proc(blocks, layout_bboxes)
         new_layout_of_pages.append(new_layout_bbox)
magic_pdf/pdf_parse_union_core.py CHANGED
@@ -41,6 +41,23 @@ def remove_horizontal_overlap_block_which_smaller(all_bboxes):
     return is_useful_block_horz_overlap, all_bboxes
 
 
+def __replace_STX_ETX(text_str: str):
+    """ Replace \u0002 and \u0003, as these characters become garbled when extracted using pymupdf. In fact, they were originally quotation marks.
+    Drawback: This issue is only observed in English text; it has not been found in Chinese text so far.
+
+    Args:
+        text_str (str): raw text
+
+    Returns:
+        _type_: replaced text
+    """
+    if text_str:
+        s = text_str.replace('\u0002', "'")
+        s = s.replace("\u0003", "'")
+        return s
+    return text_str
+
+
 def txt_spans_extract(pdf_page, inline_equations, interline_equations):
     text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
     char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[
@@ -63,7 +80,7 @@ def txt_spans_extract(pdf_page, inline_equations, interline_equations):
             spans.append(
                 {
                     "bbox": list(span["bbox"]),
-                    "content": span["text"],
+                    "content": __replace_STX_ETX(span["text"]),
                     "type": ContentType.Text,
                     "score": 1.0,
                 }
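The replacement itself is easy to verify; an illustrative check:

# STX (\u0002) and ETX (\u0003) control characters become apostrophes.
assert __replace_STX_ETX('don\u0002t') == "don't"
assert __replace_STX_ETX('') == ''  # falsy input is returned unchanged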
@@ -175,7 +192,7 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
     sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
 
     '''fill the spans into the sorted blocks'''
-    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6)
+    block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.3)
 
     '''fix the blocks'''
     fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)
@@ -208,13 +225,17 @@ def pdf_parse_union(pdf_bytes,
     magic_model = MagicModel(model_list, pdf_docs)
 
     '''parse the pdf according to the requested page range'''
-    end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    # end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
+    end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(pdf_docs) - 1
+
+    if end_page_id > len(pdf_docs) - 1:
+        logger.warning("end_page_id is out of range, use pdf_docs length")
+        end_page_id = len(pdf_docs) - 1
 
     '''init the start time'''
     start_time = time.time()
 
-    for page_id in range(start_page_id, end_page_id + 1):
-
+    for page_id, page in enumerate(pdf_docs):
         '''in debug mode, log the parse time of each page'''
         if debug_mode:
             time_now = time.time()
@@ -224,7 +245,14 @@ def pdf_parse_union(pdf_bytes,
             start_time = time_now
 
         '''parse each page of the pdf'''
-        page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+        if start_page_id <= page_id <= end_page_id:
+            page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode)
+        else:
+            page_w = page.rect.width
+            page_h = page.rect.height
+            page_info = ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [],
+                                                        [], [], [], [],
+                                                        True, "skip page")
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     """split paragraphs"""
magic_pdf/pipe/AbsPipe.py CHANGED
@@ -16,12 +16,15 @@ class AbsPipe(ABC):
     PIP_OCR = "ocr"
     PIP_TXT = "txt"
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_bytes = pdf_bytes
         self.model_list = model_list
         self.image_writer = image_writer
         self.pdf_mid_data = None  # uncompressed
         self.is_debug = is_debug
+        self.start_page_id = start_page_id
+        self.end_page_id = end_page_id
 
     def get_compress_pdf_mid_data(self):
         return JsonCompressor.compress_json(self.pdf_mid_data)
magic_pdf/pipe/OCRPipe.py CHANGED
@@ -9,17 +9,20 @@ from magic_pdf.user_api import parse_ocr_pdf
 
 class OCRPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/TXTPipe.py CHANGED
@@ -10,17 +10,20 @@ from magic_pdf.user_api import parse_txt_pdf
 
 class TXTPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False):
-        super().__init__(pdf_bytes, model_list, image_writer, is_debug)
+    def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
+        super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id)
 
     def pipe_classify(self):
         pass
 
     def pipe_analyze(self):
-        self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+        self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                      start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
-        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug)
+        self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
magic_pdf/pipe/UNIPipe.py CHANGED
@@ -13,9 +13,10 @@ from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf
 
 class UNIPipe(AbsPipe):
 
-    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False):
+    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False,
+                 start_page_id=0, end_page_id=None):
         self.pdf_type = jso_useful_key["_pdf_type"]
-        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug)
+        super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug, start_page_id, end_page_id)
         if len(self.model_list) == 0:
             self.input_model_is_empty = True
         else:
@@ -26,17 +27,21 @@ class UNIPipe(AbsPipe):
 
     def pipe_analyze(self):
         if self.pdf_type == self.PIP_TXT:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=False)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
-            self.model_list = doc_analyze(self.pdf_bytes, ocr=True)
+            self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
+                                          start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_parse(self):
         if self.pdf_type == self.PIP_TXT:
             self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty)
+                                                is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
+                                                start_page_id=self.start_page_id, end_page_id=self.end_page_id)
         elif self.pdf_type == self.PIP_OCR:
             self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
-                                              is_debug=self.is_debug)
+                                              is_debug=self.is_debug,
+                                              start_page_id=self.start_page_id, end_page_id=self.end_page_id)
 
     def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
         result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
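With these changes the page range threads from the pipe constructors through doc_analyze and the parse_* entry points. A usage sketch; the input path, output directory, and the DiskReaderWriter import path are assumptions based on the project README rather than part of this diff:

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter  # assumed import path

pdf_bytes = open("demo.pdf", "rb").read()                 # illustrative input file
jso_useful_key = {"_pdf_type": "", "model_list": []}      # empty list: run analysis in-process
image_writer = DiskReaderWriter("/tmp/magic_pdf/images")  # illustrative output dir

pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer,
               start_page_id=0, end_page_id=4)  # only pages 0-4 are parsed
pipe.pipe_classify()
pipe.pipe_analyze()  # forwards start_page_id/end_page_id to doc_analyze
pipe.pipe_parse()    # forwards them to parse_union_pdf / parse_ocr_pdf
content_list = pipe.pipe_mk_uni_format("/tmp/magic_pdf/images")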