PyPI - magic-pdf - Versions diffs - 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl - Mend

magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

magic_pdf/config/constants.py +5 -0
magic_pdf/data/data_reader_writer/base.py +13 -1
magic_pdf/data/dataset.py +175 -4
magic_pdf/dict2md/ocr_mkcontent.py +2 -2
magic_pdf/filter/__init__.py +32 -0
magic_pdf/filter/pdf_meta_scan.py +3 -2
magic_pdf/libs/draw_bbox.py +11 -10
magic_pdf/libs/pdf_check.py +30 -30
magic_pdf/libs/version.py +1 -1
magic_pdf/model/__init__.py +124 -0
magic_pdf/model/doc_analyze_by_custom_model.py +119 -60
magic_pdf/model/operators.py +190 -0
magic_pdf/model/pdf_extract_kit.py +20 -1
magic_pdf/model/sub_modules/model_init.py +13 -3
magic_pdf/model/sub_modules/model_utils.py +11 -5
magic_pdf/pdf_parse_by_ocr.py +4 -5
magic_pdf/pdf_parse_by_txt.py +4 -5
magic_pdf/pdf_parse_union_core_v2.py +10 -11
magic_pdf/pipe/AbsPipe.py +3 -2
magic_pdf/pipe/OCRPipe.py +54 -15
magic_pdf/pipe/TXTPipe.py +5 -4
magic_pdf/pipe/UNIPipe.py +82 -30
magic_pdf/pipe/operators.py +138 -0
magic_pdf/tools/common.py +108 -59
magic_pdf/user_api.py +47 -24
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/METADATA +7 -4
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/RECORD +31 -29
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/WHEEL +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-0.10.6.dist-info}/top_level.txt +0 -0

magic_pdf/config/constants.py CHANGED Viewed

@@ -51,3 +51,8 @@ class MODEL_NAME:
     UniMerNet_v2_Small = 'unimernet_small'
     RAPID_TABLE = 'rapid_table'
+PARSE_TYPE_TXT = 'txt'
+PARSE_TYPE_OCR = 'ocr'

magic_pdf/data/data_reader_writer/base.py CHANGED Viewed

@@ -48,4 +48,16 @@ class DataWriter(ABC):
             path (str): the target file where to write
             data (str): the data want to write
         """
-        self.write(path, data.encode())
+        def safe_encode(data: str, method: str):
+            try:
+                bit_data = data.encode(encoding=method, errors='replace')
+                return bit_data, True
+            except:  # noqa
+                return None, False
+        for method in ['utf-8', 'ascii']:
+            bit_data, flag = safe_encode(data, method)
+            if flag:
+                self.write(path, bit_data)
+                break

magic_pdf/data/dataset.py CHANGED Viewed

@@ -1,11 +1,13 @@
+import os
 from abc import ABC, abstractmethod
-from typing import Iterator
+from typing import Callable, Iterator
 import fitz
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.schemas import PageInfo
 from magic_pdf.data.utils import fitz_doc_to_image
+from magic_pdf.filter import classify
 class PageableData(ABC):
@@ -28,6 +30,32 @@ class PageableData(ABC):
         """
         pass
+    @abstractmethod
+    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
+        pass
+    @abstractmethod
+    def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
+        """
+        pass
 class Dataset(ABC):
     @abstractmethod
@@ -66,6 +94,43 @@ class Dataset(ABC):
         """
         pass
+    @abstractmethod
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        pass
+    @abstractmethod
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(self, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        pass
+    @abstractmethod
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        pass
+    @abstractmethod
+    def clone(self):
+        """clone this dataset
+        """
+        pass
 class PymuDocDataset(Dataset):
     def __init__(self, bits: bytes):
@@ -74,7 +139,8 @@ class PymuDocDataset(Dataset):
         Args:
             bits (bytes): the bytes of the pdf
         """
-        self._records = [Doc(v) for v in fitz.open('pdf', bits)]
+        self._raw_fitz = fitz.open('pdf', bits)
+        self._records = [Doc(v) for v in self._raw_fitz]
         self._data_bits = bits
         self._raw_data = bits
@@ -109,6 +175,43 @@ class PymuDocDataset(Dataset):
         """
         return self._records[page_id]
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        dir_name = os.path.dirname(file_path)
+        if dir_name not in ('', '.', '..'):
+            os.makedirs(dir_name, exist_ok=True)
+        self._raw_fitz.save(file_path)
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(dataset, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(self, *args, **kwargs)
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        return classify(self._data_bits)
+    def clone(self):
+        """clone this dataset
+        """
+        return PymuDocDataset(self._raw_data)
 class ImageDataset(Dataset):
     def __init__(self, bits: bytes):
@@ -118,7 +221,8 @@ class ImageDataset(Dataset):
             bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
         """
         pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-        self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
+        self._raw_fitz = fitz.open('pdf', pdf_bytes)
+        self._records = [Doc(v) for v in self._raw_fitz]
         self._raw_data = bits
         self._data_bits = pdf_bytes
@@ -153,14 +257,50 @@ class ImageDataset(Dataset):
         """
         return self._records[page_id]
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        dir_name = os.path.dirname(file_path)
+        if dir_name not in ('', '.', '..'):
+            os.makedirs(dir_name, exist_ok=True)
+        self._raw_fitz.save(file_path)
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(dataset, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(self, *args, **kwargs)
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        return SupportedPdfParseMethod.OCR
+    def clone(self):
+        """clone this dataset
+        """
+        return ImageDataset(self._raw_data)
 class Doc(PageableData):
     """Initialized with pymudoc object."""
     def __init__(self, doc: fitz.Page):
         self._doc = doc
     def get_image(self):
-        """Return the imge info.
+        """Return the image info.
         Returns:
             dict: {
@@ -192,3 +332,34 @@ class Doc(PageableData):
     def __getattr__(self, name):
         if hasattr(self._doc, name):
             return getattr(self._doc, name)
+    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
+        self._doc.draw_rect(
+            rect_coords,
+            color=color,
+            fill=fill,
+            fill_opacity=fill_opacity,
+            width=width,
+            overlay=overlay,
+        )
+    def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
+        """
+        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -165,8 +165,8 @@ def merge_para_with_text(para_block):
             if content:
                 langs = ['zh', 'ja', 'ko']
                 # logger.info(f'block_lang: {block_lang}, content: {content}')
-                if block_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
-                    if j == len(line['spans']) - 1:
+                if block_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔,但是如果是行内公式结尾，还是要加空格
+                    if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
                         para_text += content
                     else:
                         para_text += f'{content} '

magic_pdf/filter/__init__.py CHANGED Viewed

@@ -0,0 +1,32 @@
+from magic_pdf.config.drop_reason import DropReason
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
+from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
+def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
+    """根据pdf的元数据，判断是文本pdf，还是ocr pdf."""
+    pdf_meta = pdf_meta_scan(pdf_bytes)
+    if pdf_meta.get('_need_drop', False):  # 如果返回了需要丢弃的标志，则抛出异常
+        raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
+    else:
+        is_encrypted = pdf_meta['is_encrypted']
+        is_needs_password = pdf_meta['is_needs_password']
+        if is_encrypted or is_needs_password:  # 加密的，需要密码的，没有页面的，都不处理
+            raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
+        else:
+            is_text_pdf, results = do_classify(
+                pdf_meta['total_page'],
+                pdf_meta['page_width_pts'],
+                pdf_meta['page_height_pts'],
+                pdf_meta['image_info_per_page'],
+                pdf_meta['text_len_per_page'],
+                pdf_meta['imgs_per_page'],
+                pdf_meta['text_layout_per_page'],
+                pdf_meta['invalid_chars'],
+            )
+            if is_text_pdf:
+                return SupportedPdfParseMethod.TXT
+            else:
+                return SupportedPdfParseMethod.OCR

magic_pdf/filter/pdf_meta_scan.py CHANGED Viewed

@@ -8,7 +8,7 @@ from loguru import logger
 from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
+from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
 scan_max_page = 50
 junk_limit_min = 10
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
 def check_invalid_chars(pdf_bytes):
     """乱码检测."""
-    return detect_invalid_chars_by_pymupdf(pdf_bytes)
+    # return detect_invalid_chars_by_pymupdf(pdf_bytes)
+    return detect_invalid_chars(pdf_bytes)
 def pdf_meta_scan(pdf_bytes: bytes):

magic_pdf/libs/draw_bbox.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import fitz
 from magic_pdf.config.constants import CROSS_PAGE
-from magic_pdf.config.ocr_content_type import BlockType, CategoryId, ContentType
-from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.config.ocr_content_type import (BlockType, CategoryId,
+                                               ContentType)
+from magic_pdf.data.dataset import Dataset
 from magic_pdf.model.magic_model import MagicModel
@@ -194,7 +195,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
         )
     # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
+    pdf_docs.save(f'{out_path}/{filename}')
 def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
@@ -282,18 +283,17 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
         draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)
     # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
+    pdf_docs.save(f'{out_path}/{filename}')
-def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
+def draw_model_bbox(model_list, dataset: Dataset, out_path, filename):
     dropped_bbox_list = []
     tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
     imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
     titles_list = []
     texts_list = []
     interequations_list = []
-    pdf_docs = fitz.open('pdf', pdf_bytes)
-    magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
+    magic_model = MagicModel(model_list, dataset)
     for i in range(len(model_list)):
         page_dropped_list = []
         tables_body, tables_caption, tables_footnote = [], [], []
@@ -337,7 +337,8 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
         dropped_bbox_list.append(page_dropped_list)
         imgs_footnote_list.append(imgs_footnote)
-    for i, page in enumerate(pdf_docs):
+    for i in range(len(dataset)):
+        page = dataset.get_page(i)
         draw_bbox_with_number(
             i, dropped_bbox_list, page, [158, 158, 158], True
         )  # color !
@@ -352,7 +353,7 @@ def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
         draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
     # Save the PDF
-    pdf_docs.save(f'{out_path}/{filename}_model.pdf')
+    dataset.dump_to_file(f'{out_path}/{filename}')
 def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
@@ -390,7 +391,7 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
     for i, page in enumerate(pdf_docs):
         draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
-    pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
+    pdf_docs.save(f'{out_path}/{filename}')
 def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):

magic_pdf/libs/pdf_check.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import fitz
 import numpy as np
 from loguru import logger
-# import re
-# from io import BytesIO
-# from pdfminer.high_level import extract_text
+import re
+from io import BytesIO
+from pdfminer.high_level import extract_text
 def calculate_sample_count(total_page: int):
@@ -33,33 +33,33 @@ def extract_pages(src_pdf_bytes: bytes) -> fitz.Document:
     return sample_docs
-# def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
-#     """"
-#     检测PDF中是否包含非法字符
-#     """
-#     '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
-#     sample_docs = extract_pages(src_pdf_bytes)
-#     sample_pdf_bytes = sample_docs.tobytes()
-#     sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
-#     text = extract_text(sample_pdf_file_like_object)
-#     text = text.replace("\n", "")
-#     # logger.info(text)
-#     '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
-#     cid_pattern = re.compile(r'\(cid:\d+\)')
-#     matches = cid_pattern.findall(text)
-#     cid_count = len(matches)
-#     cid_len = sum(len(match) for match in matches)
-#     text_len = len(text)
-#     if text_len == 0:
-#         cid_chars_radio = 0
-#     else:
-#         cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
-#     logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
-#     '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
-#     if cid_chars_radio > 0.05:
-#         return False  # 乱码文档
-#     else:
-#         return True   # 正常文档
+def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
+    """"
+    检测PDF中是否包含非法字符
+    """
+    '''pdfminer比较慢,需要先随机抽取10页左右的sample'''
+    sample_docs = extract_pages(src_pdf_bytes)
+    sample_pdf_bytes = sample_docs.tobytes()
+    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
+    text = extract_text(sample_pdf_file_like_object)
+    text = text.replace("\n", "")
+    # logger.info(text)
+    '''乱码文本用pdfminer提取出来的文本特征是(cid:xxx)'''
+    cid_pattern = re.compile(r'\(cid:\d+\)')
+    matches = cid_pattern.findall(text)
+    cid_count = len(matches)
+    cid_len = sum(len(match) for match in matches)
+    text_len = len(text)
+    if text_len == 0:
+        cid_chars_radio = 0
+    else:
+        cid_chars_radio = cid_count/(cid_count + text_len - cid_len)
+    logger.info(f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_radio: {cid_chars_radio}")
+    '''当一篇文章存在5%以上的文本是乱码时,认为该文档为乱码文档'''
+    if cid_chars_radio > 0.05:
+        return False  # 乱码文档
+    else:
+        return True   # 正常文档
 def count_replacement_characters(text: str) -> int:

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.10.5"
1	+ __version__ = "0.10.6"

magic_pdf/model/__init__.py CHANGED Viewed

@@ -1,2 +1,126 @@
+from typing import Callable
+from abc import ABC, abstractmethod
+from magic_pdf.data.data_reader_writer import DataWriter
+from magic_pdf.data.dataset import Dataset
+from magic_pdf.pipe.operators import PipeResult
 __use_inside_model__ = True
 __model_mode__ = "full"
+class InferenceResultBase(ABC):
+    @abstractmethod
+    def __init__(self, inference_results: list, dataset: Dataset):
+        """Initialized method.
+        Args:
+            inference_results (list): the inference result generated by model
+            dataset (Dataset): the dataset related with model inference result
+        """
+        self._infer_res = inference_results
+        self._dataset = dataset
+    @abstractmethod
+    def draw_model(self, file_path: str) -> None:
+        """Draw model inference result.
+        Args:
+            file_path (str): the output file path
+        """
+        pass
+    @abstractmethod
+    def dump_model(self, writer: DataWriter, file_path: str):
+        """Dump model inference result to file.
+        Args:
+            writer (DataWriter): writer handle
+            file_path (str): the location of target file
+        """
+        pass
+    @abstractmethod
+    def get_infer_res(self):
+        """Get the inference result.
+        Returns:
+            list: the inference result generated by model
+        """
+        pass
+    @abstractmethod
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(inference_result, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        pass
+    @abstractmethod
+    def pipe_auto_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result.
+            step1: classify the dataset type
+            step2: based the result of step1, using `pipe_txt_mode` or `pipe_ocr_mode`
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (str, optional): Defaults to None.
+        Returns:
+            PipeResult: the result
+        """
+        pass
+    @abstractmethod
+    def pipe_txt_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        """Post-proc the model inference result, Extract the text using the
+        third library, such as `pymupdf`
+        Args:
+            imageWriter (DataWriter): the image writer handle
+            start_page_id (int, optional): Defaults to 0. Let user select some pages He/She want to process
+            end_page_id (int, optional):  Defaults to the last page index of dataset. Let user select some pages He/She want to process
+            debug_mode (bool, optional): Defaults to False. will dump more log if enabled
+            lang (str, optional): Defaults to None.
+        Returns:
+            PipeResult: the result
+        """
+        pass
+    @abstractmethod
+    def pipe_ocr_mode(
+        self,
+        imageWriter: DataWriter,
+        start_page_id=0,
+        end_page_id=None,
+        debug_mode=False,
+        lang=None,
+    ) -> PipeResult:
+        pass

magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl

magic-pdf 0.10.5__py3-none-any.whl → 0.10.6__py3-none-any.whl