PyPI - magic-pdf - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/filter/pdf_meta_scan.py +3 -17
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
magic_pdf/para/para_split_v3.py +7 -2
magic_pdf/pdf_parse_union_core_v2.py +97 -124
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0

magic_pdf/libs/nlp_utils.py DELETED Viewed

@@ -1,203 +0,0 @@
-import re
-from os import path
-from collections import Counter
-from loguru import logger
-# from langdetect import detect
-import spacy
-import en_core_web_sm
-import zh_core_web_sm
-from magic_pdf.libs.language import detect_lang
-class NLPModels:
-    """
-    How to upload local models to s3:
-        - config aws cli:
-            doc\SETUP-CLI.md
-            doc\setup_cli.sh
-            app\config\__init__.py
-        - $ cd {local_dir_storing_models}
-        - $ ls models
-            en_core_web_sm-3.7.1/
-            zh_core_web_sm-3.7.0/
-        - $ aws s3 sync models/ s3://llm-infra/models --profile=p_project_norm
-        - $ aws s3 --profile=p_project_norm ls  s3://llm-infra/models/
-            PRE en_core_web_sm-3.7.1/
-            PRE zh_core_web_sm-3.7.0/
-    """
-    def __init__(self):
-        # if OS is windows, set "TMP_DIR" to "D:/tmp"
-        home_dir = path.expanduser("~")
-        self.default_local_path = path.join(home_dir, ".nlp_models")
-        self.default_shared_path = "/share/pdf_processor/nlp_models"
-        self.default_hdfs_path = "hdfs://pdf_processor/nlp_models"
-        self.default_s3_path = "s3://llm-infra/models"
-        self.nlp_models = self.nlp_models = {
-            "en_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "en_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.1",
-            },
-            "zh_core_web_sm": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_md": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-            "zh_core_web_lg": {
-                "type": "spacy",
-                "version": "3.7.0",
-            },
-        }
-        self.en_core_web_sm_model = en_core_web_sm.load()
-        self.zh_core_web_sm_model = zh_core_web_sm.load()
-    def load_model(self, model_name, model_type, model_version):
-        if (
-            model_name in self.nlp_models
-            and self.nlp_models[model_name]["type"] == model_type
-            and self.nlp_models[model_name]["version"] == model_version
-        ):
-            return spacy.load(model_name) if spacy.util.is_package(model_name) else None
-        else:
-            logger.error(f"Unsupported model name or version: {model_name} {model_version}")
-            return None
-    def detect_language(self, text, use_langdetect=False):
-        if len(text) == 0:
-            return None
-        if use_langdetect:
-            # print("use_langdetect")
-            # print(detect_lang(text))
-            # return detect_lang(text)
-            if detect_lang(text) == "zh":
-                return "zh"
-            else:
-                return "en"
-        if not use_langdetect:
-            en_count = len(re.findall(r"[a-zA-Z]", text))
-            cn_count = len(re.findall(r"[\u4e00-\u9fff]", text))
-            if en_count > cn_count:
-                return "en"
-            if cn_count > en_count:
-                return "zh"
-    def detect_entity_catgr_using_nlp(self, text, threshold=0.5):
-        """
-        Detect entity categories using NLP models and return the most frequent entity types.
-        Parameters
-        ----------
-        text : str
-            Text to be processed.
-        Returns
-        -------
-        str
-            The most frequent entity type.
-        """
-        lang = self.detect_language(text, use_langdetect=True)
-        if lang == "en":
-            nlp_model = self.en_core_web_sm_model
-        elif lang == "zh":
-            nlp_model = self.zh_core_web_sm_model
-        else:
-            # logger.error(f"Unsupported language: {lang}")
-            return {}
-        # Splitting text into smaller parts
-        text_parts = re.split(r"[,;，；、\s & |]+", text)
-        text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)]  # Remove non-words
-        text_combined = " ".join(text_parts)
-        try:
-            doc = nlp_model(text_combined)
-            entity_counts = Counter([ent.label_ for ent in doc.ents])
-            word_counts_in_entities = Counter()
-            for ent in doc.ents:
-                word_counts_in_entities[ent.label_] += len(ent.text.split())
-            total_words_in_entities = sum(word_counts_in_entities.values())
-            total_words = len([token for token in doc if not token.is_punct])
-            if total_words_in_entities == 0 or total_words == 0:
-                return None
-            entity_percentage = total_words_in_entities / total_words
-            if entity_percentage < 0.5:
-                return None
-            most_common_entity, word_count = word_counts_in_entities.most_common(1)[0]
-            entity_percentage = word_count / total_words_in_entities
-            if entity_percentage >= threshold:
-                return most_common_entity
-            else:
-                return None
-        except Exception as e:
-            logger.error(f"Error in entity detection: {e}")
-            return None
-def __main__():
-    nlpModel = NLPModels()
-    test_strings = [
-        "张三",
-        "张三, 李四，王五; 赵六",
-        "John Doe",
-        "Jane Smith",
-        "Lee, John",
-        "John Doe, Jane Smith; Alice Johnson，Bob Lee",
-        "孙七, Michael Jordan；赵八",
-        "David Smith  Michael O'Connor; Kevin ßáçøñ",
-        "李雷·韩梅梅, 张三·李四",
-        "Charles Robert Darwin, Isaac Newton",
-        "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔",
-        "John Doe, Jane Smith; Alice Johnson",
-        "张三, 李四，王五; 赵六",
-        "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG",
-        "Rachel Mills  &  William Barry  &  Susanne B. Haga",
-        "Claire Chabut* and Jean-François Bussières",
-        "1 Department of Chemistry, Northeastern University, Shenyang 110004, China 2 State Key Laboratory of Polymer Physics and Chemistry, Changchun Institute of Applied Chemistry, Chinese Academy of Sciences, Changchun 130022, China",
-        "Changchun",
-        "china",
-        "Rongjun Song, 1,2 Baoyan Zhang, 1 Baotong Huang, 2 Tao Tang 2",
-        "Synergistic Effect of Supported Nickel Catalyst with Intumescent Flame-Retardants on Flame Retardancy and Thermal Stability of Polypropylene",
-        "Synergistic Effect of Supported Nickel Catalyst with",
-        "Intumescent Flame-Retardants on Flame Retardancy",
-        "and Thermal Stability of Polypropylene",
-    ]
-    for test in test_strings:
-        print()
-        print(f"Original String: {test}")
-        result = nlpModel.detect_entity_catgr_using_nlp(test)
-        print(f"Detected entities: {result}")
-if __name__ == "__main__":
-    __main__()

magic_pdf/libs/textbase.py DELETED Viewed

@@ -1,33 +0,0 @@
-import math
-def __inc_dict_val(mp, key, val_inc:int):
-    if mp.get(key):
-        mp[key] = mp[key] + val_inc
-    else:
-        mp[key] = val_inc
-def get_text_block_base_info(block):
-    """
-    获取这个文本块里的字体的颜色、字号、字体
-    按照正文字数最多的返回
-    """
-    counter = {}
-    for line in block['lines']:
-        for span in line['spans']:
-            color = span['color']
-            size = round(span['size'], 2)
-            font = span['font']
-            txt_len = len(span['text'])
-            __inc_dict_val(counter, (color, size, font), txt_len)
-    c, s, ft = max(counter, key=counter.get)
-    return c, s, ft

magic_pdf/libs/vis_utils.py DELETED Viewed

@@ -1,308 +0,0 @@
-from magic_pdf.libs.commons import fitz
-import os
-def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, save_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(save_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(save_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    color_map = {
-        'image': fitz.pdfcolor["yellow"],
-        'text': fitz.pdfcolor['blue'],
-        "table": fitz.pdfcolor['green']
-    }
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        width = raw_pdf_doc[page_idx].rect.width
-        height = raw_pdf_doc[page_idx].rect.height
-        new_page = doc.new_page(width=width, height=height)
-        shape = new_page.new_shape()
-        for order, block in enumerate(v['preproc_blocks']):
-            rect = fitz.Rect(block['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=color_map['text'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-        for img in v['images']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['yellow'])
-            shape.finish()
-            shape.commit()
-        for img in v['image_backup']:
-            # 原始box画上去
-            rect = fitz.Rect(img['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['yellow'],  fill=None)
-            shape.finish()
-            shape.commit()
-        for tb in v['droped_text_block']:
-            # 原始box画上去
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)
-            shape.finish()
-            shape.commit()
-        # TODO table
-        for tb in v['tables']:
-            rect = fitz.Rect(tb['bbox'])
-            shape = new_page.new_shape()
-            shape.draw_rect(rect)
-            shape.finish(color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)
-            shape.finish()
-            shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    if is_new_pdf:
-        doc.save(save_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-def debug_show_bbox(raw_pdf_doc: fitz.Document, page_idx: int, bboxes: list, droped_bboxes:list,  expect_drop_bboxes:list, save_path: str, expected_page_id:int):
-    """
-    以覆盖的方式写个临时的pdf，用于debug
-    """
-    if page_idx!=expected_page_id:
-        return
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-    width = raw_pdf_doc[page_idx].rect.width
-    height = raw_pdf_doc[page_idx].rect.height
-    new_page = doc.new_page(width=width, height=height)
-    shape = new_page.new_shape()
-    for bbox in bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in droped_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in expect_drop_bboxes:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    doc.save(save_path)
-    doc.close()
-def debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
-    save_path = "./tmp/debug.pdf"
-    if os.path.exists(save_path):
-        # 删除已经存在的文件
-        os.remove(save_path)
-    # 创建一个新的空白 PDF 文件
-    doc = fitz.open('')
-    width = page.rect.width
-    height = page.rect.height
-    new_page = doc.new_page(width=width, height=height)
-    shape = new_page.new_shape()
-    for bbox in bboxes1:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes2:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
-        shape.finish()
-        shape.commit()
-    for bbox in bboxes3:
-        # 原始box画上去
-        rect = fitz.Rect(*bbox[0:4])
-        shape = new_page.new_shape()
-        shape.draw_rect(rect)
-        shape.finish(color=fitz.pdfcolor['red'], fill=None)
-        shape.finish()
-        shape.commit()
-    parent_dir = os.path.dirname(save_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    doc.save(save_path)
-    doc.close()
-def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict:dict, header, footer, pdf_path: str):
-    """
-    在page上画出bbox，保存到save_path
-    """
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    for k, v in paras_dict.items():
-        page_idx = v['page_idx']
-        layouts = v['layout_bboxes']
-        page = doc[page_idx]
-        shape = page.new_shape()
-        for order, layout in enumerate(layouts):
-            border_offset = 1
-            rect_box = layout['layout_bbox']
-            layout_label = layout['layout_label']
-            fill_color = fitz.pdfcolor['pink'] if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
-            """
-            draw order text on layout box
-            """
-            font_size = 10
-            shape.insert_text((rect_box[0] + 1, rect_box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0))
-        """画上footer header"""
-        if header:
-            shape.draw_rect(fitz.Rect(header))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        if footer:
-            shape.draw_rect(fitz.Rect(footer))
-            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
-        shape.commit()
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()
-@DeprecationWarning
-def draw_layout_on_page(raw_pdf_doc: fitz.Document,  page_idx: int, page_layout: list, pdf_path: str):
-    """
-    把layout的box用红色边框花在pdf_path的page_idx上
-    """
-    def draw(shape, layout, fill_color=fitz.pdfcolor['pink']):
-        border_offset = 1
-        rect_box = layout['layout_bbox']
-        layout_label = layout['layout_label']
-        sub_layout = layout['sub_layout']
-        if len(sub_layout)==0:
-            fill_color = fill_color if layout_label=='U' else None
-            rect_box = [rect_box[0]+1, rect_box[1]-border_offset, rect_box[2]-1, rect_box[3]+border_offset]
-            rect = fitz.Rect(*rect_box)
-            shape.draw_rect(rect)
-            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.2)
-            # if layout_label=='U':
-            #     bad_boxes = layout.get("bad_boxes", [])
-            #     for bad_box in bad_boxes:
-            #         rect = fitz.Rect(*bad_box)
-            #         shape.draw_rect(rect)
-            #         shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2)
-        # else:
-        #     rect = fitz.Rect(*rect_box)
-        #     shape.draw_rect(rect)
-        #     shape.finish(color=fitz.pdfcolor['blue'])
-        for sub_layout in sub_layout:
-            draw(shape, sub_layout)
-        shape.commit()
-    # 检查文件是否存在
-    is_new_pdf = False
-    if os.path.exists(pdf_path):
-        # 打开现有的 PDF 文件
-        doc = fitz.open(pdf_path)
-    else:
-        # 创建一个新的空白 PDF 文件
-        is_new_pdf = True
-        doc = fitz.open('')
-    page = doc[page_idx]
-    shape = page.new_shape()
-    for order, layout in enumerate(page_layout):
-        draw(shape, layout, fitz.pdfcolor['yellow'])
-    # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12,
-    #                      color=(0, 0, 0))
-    # shape.finish(color=fitz.pdfcolor['black'])
-    # shape.commit()
-    parent_dir = os.path.dirname(pdf_path)
-    if not os.path.exists(parent_dir):
-        os.makedirs(parent_dir)
-    if is_new_pdf:
-        doc.save(pdf_path)
-    else:
-        doc.saveIncr()
-    doc.close()

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl