PyPI - magic-pdf - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

magic-pdf 0.10.0 (py3-none-any.whl) → 0.10.2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/filter/pdf_meta_scan.py +3 -17
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
magic_pdf/para/para_split_v3.py +7 -2
magic_pdf/pdf_parse_union_core_v2.py +97 -124
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0

magic_pdf/data/data_reader_writer/filebase.py CHANGED Viewed

@@ -55,5 +55,8 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
+        if not os.path.exists(os.path.dirname(fn_path)):
+            os.makedirs(os.path.dirname(fn_path), exist_ok=True)
         with open(fn_path, 'wb') as f:
             f.write(data)

magic_pdf/filter/pdf_meta_scan.py CHANGED Viewed

@@ -1,13 +1,12 @@
 """输入： s3路径，每行一个 输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置."""
-import sys
 from collections import Counter
-import click
+import fitz
 from loguru import logger
 from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
+from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.pdf_check import detect_invalid_chars
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         return res
-@click.command()
-@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
-@click.option('--s3-profile', help='s3上的profile')
-def main(s3_pdf_path: str, s3_profile: str):
-    """"""
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
-        logger.exception(e)
 if __name__ == '__main__':
-    main()
+    pass
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"

magic_pdf/libs/commons.py CHANGED Viewed

@@ -1,34 +1,8 @@
-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-import fitz # 1.23.9中已经切换到rebase
-# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
 def join_path(*args):
     return '/'.join(str(s).rstrip('/') for s in args)
-#配置全局的errlog_path，方便demo同步引用
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径，应该在业务代码中定义
 def get_top_percent_list(num_list, percent):
     """
     获取列表中前百分之多少的元素
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
     return top_percent_list
-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
 def mymax(alist: list):
     if len(alist) == 0:
         return 0  # 空是0， 0*0也是0大小q
     else:
         return max(alist)
-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # 解析配置文件
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # 获取 AWS 账户相关信息
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-    return ak, sk, endpoint, addressing_style
 def parse_bucket_key(s3_full_path: str):
     """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
         s3_full_path = s3_full_path[1:]
     bucket, key = s3_full_path.split("/", 1)
     return bucket, key
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-def get_docx_model_output(pdf_model_output, page_id):
-    model_output_json = pdf_model_output[page_id]
-    return model_output_json
-def list_dir(dir_path:str, s3_profile:str):
-    """
-    列出dir_path下的所有文件
-    """
-    ret = []
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                                            config=Config(s3={'addressing_style': addressing_style}))
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents)==0:
-                        break
-                    marker = contents[-1]['Key']
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                #size = info['Size']
-                if path!="":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-            return ret
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-def get_img_s3_client(save_path:str, image_s3_config:str):
-    """
-    """
-    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-    return img_s3_client
-if __name__=="__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)

magic_pdf/libs/draw_bbox.py CHANGED Viewed

@@ -1,8 +1,7 @@
+import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
-                                               ContentType)
+from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
 from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.model.magic_model import MagicModel

magic_pdf/libs/markdown_utils.py CHANGED Viewed

@@ -1,24 +1,3 @@
-import re
-def escape_special_markdown_char(pymu_blocks):
-    """
-    转义正文里对markdown语法有特殊意义的字符
-    """
-    special_chars = ["*", "`", "~", "$"]
-    for blk in pymu_blocks:
-        for line in blk['lines']:
-            for span in line['spans']:
-                for char in special_chars:
-                    span_text = span['text']
-                    span_type = span.get("_type", None)
-                    if span_type in ['inline-equation', 'interline-equation']:
-                        continue
-                    elif span_text:
-                        span['text'] = span['text'].replace(char, "\\" + char)
-    return pymu_blocks
 def ocr_escape_special_markdown_char(content):
     """

magic_pdf/libs/pdf_image_tools.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from io import BytesIO
 import cv2
+import fitz
 import numpy as np
 from PIL import Image
 from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import fitz, join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.10.0"
1	+ __version__ = "0.10.2"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
                 mat = fitz.Matrix(dpi / 72, dpi / 72)
                 pm = page.get_pixmap(matrix=mat, alpha=False)
-                # If the width or height exceeds 9000 after scaling, do not scale further.
-                if pm.width > 9000 or pm.height > 9000:
+                # If the width or height exceeds 4500 after scaling, do not scale further.
+                if pm.width > 4500 or pm.height > 4500:
                     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
                 img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -1,16 +1,12 @@
 import enum
-import json
 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                     bbox_relative_pos, box_area, calculate_iou,
                                     calculate_overlap_area_in_bbox1_area_ratio,
                                     get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
     def get_model_list(self, page_no):
         return self.__model_list[page_no]
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))

magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py CHANGED Viewed

@@ -1,5 +1,3 @@
-import math
 import numpy as np
 from loguru import logger
@@ -214,6 +212,9 @@ def get_ocr_result_list(ocr_res, useful_list):
         if len(box_ocr_res) == 2:
             p1, p2, p3, p4 = box_ocr_res[0]
             text, score = box_ocr_res[1]
+            # logger.info(f"text: {text}, score: {score}")
+            if score < 0.6:  # 过滤低置信度的结果
+                continue
         else:
             p1, p2, p3, p4 = box_ocr_res
             text, score = "", 1
@@ -249,32 +250,6 @@ def get_ocr_result_list(ocr_res, useful_list):
     return ocr_result_list
-def calculate_angle_degrees(poly):
-    # 定义对角线的顶点
-    diagonal1 = (poly[0], poly[2])
-    diagonal2 = (poly[1], poly[3])
-    # 计算对角线的斜率
-    def slope(p1, p2):
-        return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
-    slope1 = slope(diagonal1[0], diagonal1[1])
-    slope2 = slope(diagonal2[0], diagonal2[1])
-    # 计算对角线与x轴的夹角（以弧度为单位）
-    angle1_radians = math.atan(slope1)
-    angle2_radians = math.atan(slope2)
-    # 将弧度转换为角度
-    angle1_degrees = math.degrees(angle1_radians)
-    angle2_degrees = math.degrees(angle2_radians)
-    # 取两条对角线与x轴夹角的平均值
-    average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
-    # logger.info(f"average_angle_degrees: {average_angle_degrees}")
-    return average_angle_degrees
 def calculate_is_angle(poly):
     p1, p2, p3, p4 = poly
     height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2

magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py CHANGED Viewed

@@ -63,7 +63,7 @@ class ModifiedPaddleOCR(PaddleOCR):
         if det and rec:
             ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                 img = preprocess_image(img)
                 dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
                 if not dt_boxes and not rec_res:
@@ -75,7 +75,7 @@ class ModifiedPaddleOCR(PaddleOCR):
             return ocr_res
         elif det and not rec:
             ocr_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                 img = preprocess_image(img)
                 dt_boxes, elapse = self.text_detector(img)
                 if dt_boxes is None:
@@ -96,7 +96,7 @@ class ModifiedPaddleOCR(PaddleOCR):
         else:
             ocr_res = []
             cls_res = []
-            for idx, img in enumerate(imgs):
+            for img in imgs:
                 if not isinstance(img, list):
                     img = preprocess_image(img)
                     img = [img]

magic_pdf/para/para_split_v3.py CHANGED Viewed

@@ -271,13 +271,18 @@ def __merge_2_text_blocks(block1, block2):
                     first_span = first_line['spans'][0]
                     if len(first_span['content']) > 0:
                         span_start_with_num = first_span['content'][0].isdigit()
+                        span_start_with_big_char = first_span['content'][0].isupper()
                         if (
-                            abs(block2['bbox_fs'][2] - last_line['bbox'][2])
-                            < line_height
+                            # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height
+                            abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height
+                            # 上一个block的最后一个span不是以特定符号结尾
                             and not last_span['content'].endswith(LINE_STOP_FLAG)
                             # 两个block宽度差距超过2倍也不合并
                             and abs(block1_weight - block2_weight) < min_block_weight
+                            # 下一个block的第一个字符是数字
                             and not span_start_with_num
+                            # 下一个block的第一个字符是大写字母
+                            and not span_start_with_big_char
                         ):
                             if block1['page_num'] != block2['page_num']:
                                 for line in block1['lines']:

magic-pdf 0.10.0 (py3-none-any.whl) → 0.10.2 (py3-none-any.whl)

magic-pdf 0.10.0 (py3-none-any.whl) → 0.10.2 (py3-none-any.whl)