PyPI - magic-pdf - Versions diffs - 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl - Mend

magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79) hide show

magic_pdf/dict2md/ocr_mkcontent.py +16 -22
magic_pdf/filter/pdf_meta_scan.py +5 -19
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_check.py +52 -25
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/pp_structure_v2.py +23 -3
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +50 -29
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +11 -9
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +2 -2
magic_pdf/para/para_split_v3.py +21 -7
magic_pdf/pdf_parse_union_core_v2.py +134 -146
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/METADATA +3 -78
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/RECORD +28 -79
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/WHEEL +0 -0
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.1.dist-info → magic_pdf-0.10.3.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -5,7 +5,6 @@ from loguru import logger
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.para.para_split_v3 import ListLineTag
@@ -30,6 +29,13 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
     for page_info in pdf_info_dict:
         paras_of_layout = page_info.get('para_blocks')
         if not paras_of_layout:
+            markdown_with_para_and_pagination.append({
+                'page_no':
+                    page_no,
+                'md_content':
+                    '',
+            })
+            page_no += 1
             continue
         page_markdown = ocr_mk_markdown_with_para_core_v2(
             paras_of_layout, 'mm', img_buket_path)
@@ -136,14 +142,11 @@ def merge_para_with_text(para_block):
             para_text += '  \n'
         line_text = ''
-        line_lang = ''
         for span in line['spans']:
             span_type = span['type']
             if span_type == ContentType.Text:
                 line_text += span['content'].strip()
-        if line_text != '':
-            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
             span_type = span['type']
@@ -157,27 +160,18 @@ def merge_para_with_text(para_block):
             content = content.strip()
             if content != '':
-                langs = ['zh', 'ja', 'ko']
-                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
-                        para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
-                    elif span_type == ContentType.InlineEquation:
-                        para_text += f' {content} '
-                else:
-                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
-                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
-                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
-                            para_text += content[:-1]
-                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit():
-                            para_text += content
-                        else:  # 西方文本语境下 content间需要空格分隔
-                            para_text += f'{content} '
-                    elif span_type == ContentType.InterlineEquation:
-                        para_text += content
+                if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                    # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
+                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                        para_text += content[:-1]
+                    else:  # content间需要空格分隔
+                        para_text += f'{content} '
+                elif span_type == ContentType.InterlineEquation:
+                    para_text += content
             else:
                 continue
     # 连写字符拆分
-    para_text = __replace_ligatures(para_text)
+    # para_text = __replace_ligatures(para_text)
     return para_text

magic_pdf/filter/pdf_meta_scan.py CHANGED Viewed

@@ -1,15 +1,14 @@
 """输入： s3路径，每行一个 输出： pdf文件元信息，包括每一页上的所有图片的长宽高，bbox位置."""
-import sys
 from collections import Counter
-import click
+import fitz
 from loguru import logger
 from magic_pdf.config.drop_reason import DropReason
-from magic_pdf.libs.commons import fitz, get_top_percent_list, mymax, read_file
+from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.pdf_check import detect_invalid_chars
+from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
 scan_max_page = 50
 junk_limit_min = 10
@@ -324,7 +323,7 @@ def get_language(doc: fitz.Document):
 def check_invalid_chars(pdf_bytes):
     """乱码检测."""
-    return detect_invalid_chars(pdf_bytes)
+    return detect_invalid_chars_by_pymupdf(pdf_bytes)
 def pdf_meta_scan(pdf_bytes: bytes):
@@ -384,21 +383,8 @@ def pdf_meta_scan(pdf_bytes: bytes):
         return res
-@click.command()
-@click.option('--s3-pdf-path', help='s3上pdf文件的路径')
-@click.option('--s3-profile', help='s3上的profile')
-def main(s3_pdf_path: str, s3_profile: str):
-    """"""
-    try:
-        file_content = read_file(s3_pdf_path, s3_profile)
-        pdf_meta_scan(file_content)
-    except Exception as e:
-        print(f'ERROR: {s3_pdf_path}, {e}', file=sys.stderr)
-        logger.exception(e)
 if __name__ == '__main__':
-    main()
+    pass
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf"
     # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf"

magic_pdf/libs/commons.py CHANGED Viewed

@@ -1,34 +1,8 @@
-import datetime
-import json
-import os, re, configparser
-import subprocess
-import time
-import boto3
-from loguru import logger
-from boto3.s3.transfer import TransferConfig
-from botocore.config import Config
-import fitz # 1.23.9中已经切换到rebase
-# import fitz_old as fitz  # 使用1.23.9之前的pymupdf库
-def get_delta_time(input_time):
-    return round(time.time() - input_time, 2)
 def join_path(*args):
     return '/'.join(str(s).rstrip('/') for s in args)
-#配置全局的errlog_path，方便demo同步引用
-error_log_path = "s3://llm-pdf-text/err_logs/"
-# json_dump_path = "s3://pdf_books_temp/json_dump/" # 这条路径仅用于临时本地测试,不能提交到main
-json_dump_path = "s3://llm-pdf-text/json_dump/"
-# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/" # 基础库不应该有这些存在的路径，应该在业务代码中定义
 def get_top_percent_list(num_list, percent):
     """
     获取列表中前百分之多少的元素
@@ -48,51 +22,12 @@ def get_top_percent_list(num_list, percent):
     return top_percent_list
-def formatted_time(time_stamp):
-    dt_object = datetime.datetime.fromtimestamp(time_stamp)
-    output_time = dt_object.strftime("%Y-%m-%d-%H:%M:%S")
-    return output_time
 def mymax(alist: list):
     if len(alist) == 0:
         return 0  # 空是0， 0*0也是0大小q
     else:
         return max(alist)
-def parse_aws_param(profile):
-    if isinstance(profile, str):
-        # 解析配置文件
-        config_file = join_path(os.path.expanduser("~"), ".aws", "config")
-        credentials_file = join_path(os.path.expanduser("~"), ".aws", "credentials")
-        config = configparser.ConfigParser()
-        config.read(credentials_file)
-        config.read(config_file)
-        # 获取 AWS 账户相关信息
-        ak = config.get(profile, "aws_access_key_id")
-        sk = config.get(profile, "aws_secret_access_key")
-        if profile == "default":
-            s3_str = config.get(f"{profile}", "s3")
-        else:
-            s3_str = config.get(f"profile {profile}", "s3")
-        end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if end_match:
-            endpoint = end_match.group(1)
-        else:
-            raise ValueError(f"aws 配置文件中没有找到 endpoint_url")
-        style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE)
-        if style_match:
-            addressing_style = style_match.group(1)
-        else:
-            addressing_style = "path"
-    elif isinstance(profile, dict):
-        ak = profile["ak"]
-        sk = profile["sk"]
-        endpoint = profile["endpoint"]
-        addressing_style = "auto"
-    return ak, sk, endpoint, addressing_style
 def parse_bucket_key(s3_full_path: str):
     """
@@ -106,99 +41,3 @@ def parse_bucket_key(s3_full_path: str):
         s3_full_path = s3_full_path[1:]
     bucket, key = s3_full_path.split("/", 1)
     return bucket, key
-def read_file(pdf_path: str, s3_profile):
-    if pdf_path.startswith("s3://"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                           config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'}))
-        bucket_name, bucket_key = parse_bucket_key(pdf_path)
-        res = cli.get_object(Bucket=bucket_name, Key=bucket_key)
-        file_content = res["Body"].read()
-        return file_content
-    else:
-        with open(pdf_path, "rb") as f:
-            return f.read()
-def get_docx_model_output(pdf_model_output, page_id):
-    model_output_json = pdf_model_output[page_id]
-    return model_output_json
-def list_dir(dir_path:str, s3_profile:str):
-    """
-    列出dir_path下的所有文件
-    """
-    ret = []
-    if dir_path.startswith("s3"):
-        ak, sk, end_point, addressing_style = parse_aws_param(s3_profile)
-        s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path)
-        bucket, path = s3info[0][0], s3info[0][1]
-        try:
-            cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point,
-                                            config=Config(s3={'addressing_style': addressing_style}))
-            def list_obj_scluster():
-                marker = None
-                while True:
-                    list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path)
-                    if marker:
-                        list_kwargs['Marker'] = marker
-                    response = cli.list_objects(**list_kwargs)
-                    contents = response.get("Contents", [])
-                    yield from contents
-                    if not response.get("IsTruncated") or len(contents)==0:
-                        break
-                    marker = contents[-1]['Key']
-            for info in list_obj_scluster():
-                file_path = info['Key']
-                #size = info['Size']
-                if path!="":
-                    afile = file_path[len(path):]
-                    if afile.endswith(".json"):
-                        ret.append(f"s3://{bucket}/{file_path}")
-            return ret
-        except Exception as e:
-            logger.exception(e)
-            exit(-1)
-    else: #本地的目录，那么扫描本地目录并返会这个目录里的所有jsonl文件
-        for root, dirs, files in os.walk(dir_path):
-            for file in files:
-                if file.endswith(".json"):
-                    ret.append(join_path(root, file))
-        ret.sort()
-        return ret
-def get_img_s3_client(save_path:str, image_s3_config:str):
-    """
-    """
-    if save_path.startswith("s3://"):  # 放这里是为了最少创建一个s3 client
-        ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config)
-        img_s3_client = boto3.client(
-            service_name="s3",
-            aws_access_key_id=ak,
-            aws_secret_access_key=sk,
-            endpoint_url=end_point,
-            config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}),
-        )
-    else:
-        img_s3_client = None
-    return img_s3_client
-if __name__=="__main__":
-    s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/"
-    s3_profile = "langchao"
-    ret = list_dir(s3_path, s3_profile)
-    print(ret)

magic_pdf/libs/draw_bbox.py CHANGED Viewed

@@ -1,8 +1,7 @@
+import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
-                                               ContentType)
+from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
 from magic_pdf.data.dataset import PymuDocDataset
-from magic_pdf.libs.commons import fitz  # PyMuPDF
 from magic_pdf.model.magic_model import MagicModel

magic_pdf/libs/markdown_utils.py CHANGED Viewed

@@ -1,24 +1,3 @@
-import re
-def escape_special_markdown_char(pymu_blocks):
-    """
-    转义正文里对markdown语法有特殊意义的字符
-    """
-    special_chars = ["*", "`", "~", "$"]
-    for blk in pymu_blocks:
-        for line in blk['lines']:
-            for span in line['spans']:
-                for char in special_chars:
-                    span_text = span['text']
-                    span_type = span.get("_type", None)
-                    if span_type in ['inline-equation', 'interline-equation']:
-                        continue
-                    elif span_text:
-                        span['text'] = span['text'].replace(char, "\\" + char)
-    return pymu_blocks
 def ocr_escape_special_markdown_char(content):
     """

magic_pdf/libs/pdf_check.py CHANGED Viewed

@@ -1,9 +1,9 @@
-from io import BytesIO
-import re
 import fitz
 import numpy as np
 from loguru import logger
-from pdfminer.high_level import extract_text
+# import re
+# from io import BytesIO
+# from pdfminer.high_level import extract_text
 def calculate_sample_count(total_page: int):
@@ -14,7 +14,7 @@ def calculate_sample_count(total_page: int):
     return select_page_cnt
-def extract_pages(src_pdf_bytes: bytes):
+def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
     pdf_docs = fitz.open("pdf", src_pdf_bytes)
     total_page = len(pdf_docs)
     if total_page == 0:
@@ -33,30 +33,57 @@ def extract_pages(src_pdf_bytes: bytes):
     return sample_docs
-def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
-    """"
-    检测PDF中是否包含非法字符
+# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+#     """"
+#     检测PDF中是否包含非法字符
+#     """
+#     '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
+#     sample_docs = extract_pages(src_pdf_bytes)
+#     sample_pdf_bytes = sample_docs.tobytes()
+#     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+#     text = extract_text(sample_pdf_file_like_object)
+#     text = text.replace("\n", "")
+#     # logger.info(text)
+#     '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
+#     cid_pattern = re.compile(r'\(cid:\d+\)')
+#     matches = cid_pattern.findall(text)
+#     cid_count = len(matches)
+#     cid_len = sum(len(match) for match in matches)
+#     text_len = len(text)
+#     if text_len == 0:
+#         cid_chars_radio = 0
+#     else:
+#         cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
+#     logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
+#     '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
+#     if cid_chars_radio > 0.05:
+#         return False  # 乱码文档
+#     else:
+#         return True   # 正常文档
+def count_replacement_characters(text: str) -> int:
+    """
+    统计字符串中 0xfffd 字符的数量。
     """
-    '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
+    return text.count('\ufffd')
+def detect_invalid_chars_by_pymupdf(src_pdf_bytes: bytes) -> bool:
     sample_docs = extract_pages(src_pdf_bytes)
-    sample_pdf_bytes = sample_docs.tobytes()
-    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-    text = extract_text(sample_pdf_file_like_object)
-    text = text.replace("\n", "")
-    # logger.info(text)
-    '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
-    cid_pattern = re.compile(r'\(cid:\d+\)')
-    matches = cid_pattern.findall(text)
-    cid_count = len(matches)
-    cid_len = sum(len(match) for match in matches)
-    text_len = len(text)
+    doc_text = ""
+    for page in sample_docs:
+        page_text = page.get_text('text', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)
+        doc_text += page_text
+    text_len = len(doc_text)
+    uffd_count = count_replacement_characters(doc_text)
     if text_len == 0:
-        cid_chars_radio = 0
+        uffd_chars_radio = 0
     else:
-        cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
-    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
-    '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
-    if cid_chars_radio > 0.05:
+        uffd_chars_radio = uffd_count / text_len
+    logger.info(f"uffd_count: {uffd_count}, text_len: {text_len}, uffd_chars_radio: {uffd_chars_radio}")
+    '''当一篇文章存在1%以上的文本是乱码时,认为该文档为乱码文档'''
+    if uffd_chars_radio > 0.01:
         return False  # 乱码文档
     else:
-        return True   # 正常文档
+        return True   # 正常文档

magic_pdf/libs/pdf_image_tools.py CHANGED Viewed

@@ -1,9 +1,10 @@
 from io import BytesIO
 import cv2
+import fitz
 import numpy as np
 from PIL import Image
 from magic_pdf.data.data_reader_writer import DataWriter
-from magic_pdf.libs.commons import fitz, join_path
+from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.hash_utils import compute_sha256

magic_pdf/libs/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.10.1"
+__version__ = "0.10.3"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -46,8 +46,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
                 mat = fitz.Matrix(dpi / 72, dpi / 72)
                 pm = page.get_pixmap(matrix=mat, alpha=False)
-                # If the width or height exceeds 9000 after scaling, do not scale further.
-                if pm.width > 9000 or pm.height > 9000:
+                # If the width or height exceeds 4500 after scaling, do not scale further.
+                if pm.width > 4500 or pm.height > 4500:
                     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
                 img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -1,16 +1,12 @@
 import enum
-import json
 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
-from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
-                                               FileBasedDataWriter)
 from magic_pdf.data.dataset import Dataset
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
                                     bbox_relative_pos, box_area, calculate_iou,
                                     calculate_overlap_area_in_bbox1_area_ratio,
                                     get_overlap_area)
-from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
@@ -1048,29 +1044,3 @@ class MagicModel:
     def get_model_list(self, page_no):
         return self.__model_list[page_no]
-if __name__ == '__main__':
-    drw = FileBasedDataReader(r'D:/project/20231108code-clean')
-    if 0:
-        pdf_file_path = r'linshixuqiu\19983-00.pdf'
-        model_file_path = r'linshixuqiu\19983-00_new.json'
-        pdf_bytes = drw.read(pdf_file_path)
-        model_json_txt = drw.read(model_file_path).decode()
-        model_list = json.loads(model_json_txt)
-        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
-        img_bucket_path = 'imgs'
-        img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open('pdf', pdf_bytes)
-        magic_model = MagicModel(model_list, pdf_docs)
-    if 1:
-        from magic_pdf.data.dataset import PymuDocDataset
-        model_list = json.loads(
-            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
-        )
-        pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
-        magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
-        for i in range(7):
-            print(magic_model.get_imgs(i))

magic_pdf/model/pp_structure_v2.py CHANGED Viewed

@@ -18,11 +18,31 @@ def region_to_bbox(region):
 class CustomPaddleModel:
-    def __init__(self, ocr: bool = False, show_log: bool = False, lang=None):
+    def __init__(self,
+                 ocr: bool = False,
+                 show_log: bool = False,
+                 lang=None,
+                 det_db_box_thresh=0.3,
+                 use_dilation=True,
+                 det_db_unclip_ratio=1.8
+    ):
         if lang is not None:
-            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log, lang=lang)
+            self.model = PPStructure(table=False,
+                                     ocr=True,
+                                     show_log=show_log,
+                                     lang=lang,
+                                     det_db_box_thresh=det_db_box_thresh,
+                                     use_dilation=use_dilation,
+                                     det_db_unclip_ratio=det_db_unclip_ratio,
+            )
         else:
-            self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
+            self.model = PPStructure(table=False,
+                                     ocr=True,
+                                     show_log=show_log,
+                                     det_db_box_thresh=det_db_box_thresh,
+                                     use_dilation=use_dilation,
+                                     det_db_unclip_ratio=det_db_unclip_ratio,
+            )
     def __call__(self, img):
         try:

magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py CHANGED Viewed

@@ -1,11 +1,55 @@
-import math
+import cv2
 import numpy as np
 from loguru import logger
+from io import BytesIO
+from PIL import Image
+import base64
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
 from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
+from ppocr.utils.utility import check_and_read
+def img_decode(content: bytes):
+    np_arr = np.frombuffer(content, dtype=np.uint8)
+    return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
+def check_img(img):
+    if isinstance(img, bytes):
+        img = img_decode(img)
+    if isinstance(img, str):
+        image_file = img
+        img, flag_gif, flag_pdf = check_and_read(image_file)
+        if not flag_gif and not flag_pdf:
+            with open(image_file, 'rb') as f:
+                img_str = f.read()
+                img = img_decode(img_str)
+            if img is None:
+                try:
+                    buf = BytesIO()
+                    image = BytesIO(img_str)
+                    im = Image.open(image)
+                    rgb = im.convert('RGB')
+                    rgb.save(buf, 'jpeg')
+                    buf.seek(0)
+                    image_bytes = buf.read()
+                    data_base64 = str(base64.b64encode(image_bytes),
+                                      encoding="utf-8")
+                    image_decode = base64.b64decode(data_base64)
+                    img_array = np.frombuffer(image_decode, np.uint8)
+                    img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
+                except:
+                    logger.error("error in loading image:{}".format(image_file))
+                    return None
+        if img is None:
+            logger.error("error in loading image:{}".format(image_file))
+            return None
+    if isinstance(img, np.ndarray) and len(img.shape) == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    return img
 def bbox_to_points(bbox):
     """ 将bbox格式转换为四个顶点的数组 """
@@ -214,6 +258,9 @@ def get_ocr_result_list(ocr_res, useful_list):
         if len(box_ocr_res) == 2:
             p1, p2, p3, p4 = box_ocr_res[0]
             text, score = box_ocr_res[1]
+            # logger.info(f"text: {text}, score: {score}")
+            if score < 0.6:  # 过滤低置信度的结果
+                continue
         else:
             p1, p2, p3, p4 = box_ocr_res
             text, score = "", 1
@@ -249,32 +296,6 @@ def get_ocr_result_list(ocr_res, useful_list):
     return ocr_result_list
-def calculate_angle_degrees(poly):
-    # 定义对角线的顶点
-    diagonal1 = (poly[0], poly[2])
-    diagonal2 = (poly[1], poly[3])
-    # 计算对角线的斜率
-    def slope(p1, p2):
-        return (p2[1] - p1[1]) / (p2[0] - p1[0]) if p2[0] != p1[0] else float('inf')
-    slope1 = slope(diagonal1[0], diagonal1[1])
-    slope2 = slope(diagonal2[0], diagonal2[1])
-    # 计算对角线与x轴的夹角（以弧度为单位）
-    angle1_radians = math.atan(slope1)
-    angle2_radians = math.atan(slope2)
-    # 将弧度转换为角度
-    angle1_degrees = math.degrees(angle1_radians)
-    angle2_degrees = math.degrees(angle2_radians)
-    # 取两条对角线与x轴夹角的平均值
-    average_angle_degrees = abs((angle1_degrees + angle2_degrees) / 2)
-    # logger.info(f"average_angle_degrees: {average_angle_degrees}")
-    return average_angle_degrees
 def calculate_is_angle(poly):
     p1, p2, p3, p4 = poly
     height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2

magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl

magic-pdf 0.10.1__py3-none-any.whl → 0.10.3__py3-none-any.whl