PyPI - magic-pdf - Versions diffs - 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67)

magic_pdf/config/constants.py +7 -0
magic_pdf/config/exceptions.py +7 -0
magic_pdf/data/data_reader_writer/base.py +13 -1
magic_pdf/data/data_reader_writer/filebase.py +1 -1
magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
magic_pdf/data/dataset.py +188 -5
magic_pdf/data/read_api.py +59 -12
magic_pdf/data/utils.py +35 -0
magic_pdf/dict2md/ocr_mkcontent.py +16 -15
magic_pdf/filter/__init__.py +32 -0
magic_pdf/filter/pdf_meta_scan.py +3 -2
magic_pdf/libs/clean_memory.py +11 -4
magic_pdf/libs/config_reader.py +9 -0
magic_pdf/libs/draw_bbox.py +19 -22
magic_pdf/libs/language.py +3 -0
magic_pdf/libs/pdf_check.py +30 -30
magic_pdf/libs/version.py +1 -1
magic_pdf/model/__init__.py +1 -1
magic_pdf/model/batch_analyze.py +275 -0
magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
magic_pdf/model/magic_model.py +4 -435
magic_pdf/model/model_list.py +1 -0
magic_pdf/model/pdf_extract_kit.py +35 -5
magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
magic_pdf/model/sub_modules/model_init.py +43 -7
magic_pdf/model/sub_modules/model_utils.py +17 -5
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
magic_pdf/operators/__init__.py +94 -0
magic_pdf/operators/models.py +154 -0
magic_pdf/operators/pipes.py +191 -0
magic_pdf/pdf_parse_union_core_v2.py +77 -27
magic_pdf/post_proc/__init__.py +1 -0
magic_pdf/post_proc/llm_aided.py +133 -0
magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
magic_pdf/tools/cli.py +36 -11
magic_pdf/tools/common.py +120 -61
magic_pdf/utils/office_to_pdf.py +29 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
magic_pdf/para/__init__.py +0 -0
magic_pdf/pdf_parse_by_ocr.py +0 -23
magic_pdf/pdf_parse_by_txt.py +0 -24
magic_pdf/pipe/AbsPipe.py +0 -98
magic_pdf/pipe/OCRPipe.py +0 -41
magic_pdf/pipe/TXTPipe.py +0 -41
magic_pdf/pipe/UNIPipe.py +0 -98
magic_pdf/pipe/__init__.py +0 -0
magic_pdf/rw/AbsReaderWriter.py +0 -17
magic_pdf/rw/DiskReaderWriter.py +0 -74
magic_pdf/rw/S3ReaderWriter.py +0 -142
magic_pdf/rw/__init__.py +0 -0
magic_pdf/user_api.py +0 -121
magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0

magic_pdf/config/constants.py CHANGED Viewed

@@ -51,3 +51,10 @@ class MODEL_NAME:
     UniMerNet_v2_Small = 'unimernet_small'
     RAPID_TABLE = 'rapid_table'
+    YOLO_V11_LangDetect = 'yolo_v11n_langdetect'
+PARSE_TYPE_TXT = 'txt'
+PARSE_TYPE_OCR = 'ocr'

magic_pdf/config/exceptions.py CHANGED Viewed

@@ -30,3 +30,10 @@ class EmptyData(Exception):
     def __str__(self):
         return f'Empty data: {self.msg}'
+class CUDA_NOT_AVAILABLE(Exception):
+    def __init__(self, msg):
+        self.msg = msg
+    def __str__(self):
+        return f'CUDA not available: {self.msg}'

magic_pdf/data/data_reader_writer/base.py CHANGED Viewed

@@ -48,4 +48,16 @@ class DataWriter(ABC):
             path (str): the target file where to write
             data (str): the data want to write
         """
-        self.write(path, data.encode())
+        def safe_encode(data: str, method: str):
+            try:
+                bit_data = data.encode(encoding=method, errors='replace')
+                return bit_data, True
+            except:  # noqa
+                return None, False
+        for method in ['utf-8', 'ascii']:
+            bit_data, flag = safe_encode(data, method)
+            if flag:
+                self.write(path, bit_data)
+                break

magic_pdf/data/data_reader_writer/filebase.py CHANGED Viewed

@@ -55,7 +55,7 @@ class FileBasedDataWriter(DataWriter):
         if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
             fn_path = os.path.join(self._parent_dir, path)
-        if not os.path.exists(os.path.dirname(fn_path)):
+        if not os.path.exists(os.path.dirname(fn_path)) and os.path.dirname(fn_path) != "":
             os.makedirs(os.path.dirname(fn_path), exist_ok=True)
         with open(fn_path, 'wb') as f:

magic_pdf/data/data_reader_writer/multi_bucket_s3.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -22,10 +22,10 @@ class MultiS3Mixin:
         """
         if len(default_prefix) == 0:
             raise InvalidConfig('default_prefix must be provided')
-        arr = default_prefix.strip("/").split("/")
+        arr = default_prefix.strip('/').split('/')
         self.default_bucket = arr[0]
-        self.default_prefix = "/".join(arr[1:])
+        self.default_prefix = '/'.join(arr[1:])
         found_default_bucket_config = False
         for conf in s3_configs:
@@ -103,7 +103,8 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
-            path = os.path.join(self.default_prefix, path)
+            if self.default_prefix:
+                path = self.default_prefix + '/' + path
         return s3_reader.read_at(path, offset, limit)
@@ -139,5 +140,6 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
-            path = os.path.join(self.default_prefix, path)
+            if self.default_prefix:
+                path = self.default_prefix + '/' + path
         return s3_writer.write(path, data)

magic_pdf/data/dataset.py CHANGED Viewed

@@ -1,11 +1,14 @@
+import os
 from abc import ABC, abstractmethod
-from typing import Iterator
+from typing import Callable, Iterator
 import fitz
+from loguru import logger
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.data.schemas import PageInfo
 from magic_pdf.data.utils import fitz_doc_to_image
+from magic_pdf.filter import classify
 class PageableData(ABC):
@@ -28,6 +31,32 @@ class PageableData(ABC):
         """
         pass
+    @abstractmethod
+    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
+        pass
+    @abstractmethod
+    def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
+        """
+        pass
 class Dataset(ABC):
     @abstractmethod
@@ -66,18 +95,65 @@ class Dataset(ABC):
         """
         pass
+    @abstractmethod
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        pass
+    @abstractmethod
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(self, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        pass
+    @abstractmethod
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        pass
+    @abstractmethod
+    def clone(self):
+        """clone this dataset
+        """
+        pass
 class PymuDocDataset(Dataset):
-    def __init__(self, bits: bytes):
+    def __init__(self, bits: bytes, lang=None):
         """Initialize the dataset, which wraps the pymudoc documents.
         Args:
             bits (bytes): the bytes of the pdf
         """
-        self._records = [Doc(v) for v in fitz.open('pdf', bits)]
+        self._raw_fitz = fitz.open('pdf', bits)
+        self._records = [Doc(v) for v in self._raw_fitz]
         self._data_bits = bits
         self._raw_data = bits
+        if lang == '':
+            self._lang = None
+        elif lang == 'auto':
+            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
+            self._lang = auto_detect_lang(bits)
+            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
+        else:
+            self._lang = lang
+            logger.info(f"lang: {lang}")
     def __len__(self) -> int:
         """The page number of the pdf."""
         return len(self._records)
@@ -109,6 +185,45 @@ class PymuDocDataset(Dataset):
         """
         return self._records[page_id]
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        dir_name = os.path.dirname(file_path)
+        if dir_name not in ('', '.', '..'):
+            os.makedirs(dir_name, exist_ok=True)
+        self._raw_fitz.save(file_path)
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(dataset, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        if 'lang' in kwargs and self._lang is not None:
+            kwargs['lang'] = self._lang
+        return proc(self, *args, **kwargs)
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        return classify(self._data_bits)
+    def clone(self):
+        """clone this dataset
+        """
+        return PymuDocDataset(self._raw_data)
 class ImageDataset(Dataset):
     def __init__(self, bits: bytes):
@@ -118,7 +233,8 @@ class ImageDataset(Dataset):
             bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
         """
         pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
-        self._records = [Doc(v) for v in fitz.open('pdf', pdf_bytes)]
+        self._raw_fitz = fitz.open('pdf', pdf_bytes)
+        self._records = [Doc(v) for v in self._raw_fitz]
         self._raw_data = bits
         self._data_bits = pdf_bytes
@@ -153,14 +269,50 @@ class ImageDataset(Dataset):
         """
         return self._records[page_id]
+    def dump_to_file(self, file_path: str):
+        """Dump the file
+        Args:
+            file_path (str): the file path
+        """
+        dir_name = os.path.dirname(file_path)
+        if dir_name not in ('', '.', '..'):
+            os.makedirs(dir_name, exist_ok=True)
+        self._raw_fitz.save(file_path)
+    def apply(self, proc: Callable, *args, **kwargs):
+        """Apply callable method which.
+        Args:
+            proc (Callable): invoke proc as follows:
+                proc(dataset, *args, **kwargs)
+        Returns:
+            Any: return the result generated by proc
+        """
+        return proc(self, *args, **kwargs)
+    def classify(self) -> SupportedPdfParseMethod:
+        """classify the dataset
+        Returns:
+            SupportedPdfParseMethod: _description_
+        """
+        return SupportedPdfParseMethod.OCR
+    def clone(self):
+        """clone this dataset
+        """
+        return ImageDataset(self._raw_data)
 class Doc(PageableData):
     """Initialized with pymudoc object."""
     def __init__(self, doc: fitz.Page):
         self._doc = doc
     def get_image(self):
-        """Return the imge info.
+        """Return the image info.
         Returns:
             dict: {
@@ -192,3 +344,34 @@ class Doc(PageableData):
     def __getattr__(self, name):
         if hasattr(self._doc, name):
             return getattr(self._doc, name)
+    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
+        """draw rectangle.
+        Args:
+            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
+            fill (list[float] | None): fill the board with RGB, None means will not fill with color
+            fill_opacity (float): opacity of the fill, range from [0, 1]
+            width (float): the width of board
+            overlay (bool): fill the color in foreground or background. True means fill in background.
+        """
+        self._doc.draw_rect(
+            rect_coords,
+            color=color,
+            fill=fill,
+            fill_opacity=fill_opacity,
+            width=width,
+            overlay=overlay,
+        )
+    def insert_text(self, coord, content, fontsize, color):
+        """insert text.
+        Args:
+            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
+            content (str): the text content
+            fontsize (int): font size of the text
+            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
+        """
+        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)

magic_pdf/data/read_api.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import json
 import os
+import tempfile
+import shutil
 from pathlib import Path
 from magic_pdf.config.exceptions import EmptyData, InvalidParams
 from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
                                                MultiBucketS3DataReader)
 from magic_pdf.data.dataset import ImageDataset, PymuDocDataset
+from magic_pdf.utils.office_to_pdf import convert_file_to_pdf, ConvertToPdfError
 def read_jsonl(
     s3_path_or_local: str, s3_client: MultiBucketS3DataReader | None = None
@@ -58,23 +60,68 @@ def read_local_pdfs(path: str) -> list[PymuDocDataset]:
         list[PymuDocDataset]: each pdf file will converted to a PymuDocDataset
     """
     if os.path.isdir(path):
-        reader = FileBasedDataReader(path)
-        return [
-            PymuDocDataset(reader.read(doc_path.name))
-            for doc_path in Path(path).glob('*.pdf')
-        ]
+        reader = FileBasedDataReader()
+        ret = []
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = file.split('.')
+                if suffix[-1] == 'pdf':
+                    ret.append( PymuDocDataset(reader.read(os.path.join(root, file))))
+        return ret
     else:
         reader = FileBasedDataReader()
         bits = reader.read(path)
         return [PymuDocDataset(bits)]
+def read_local_office(path: str) -> list[PymuDocDataset]:
+    """Read ms-office file (ppt, pptx, doc, docx) from path or directory.
-def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
+    Args:
+        path (str): ms-office file or directory that contains ms-office files
+    Returns:
+        list[PymuDocDataset]: each ms-office file will converted to a PymuDocDataset
+    Raises:
+        ConvertToPdfError: Failed to convert ms-office file to pdf via libreoffice
+        FileNotFoundError: File not Found
+        Exception: Unknown Exception raised
+    """
+    suffixes = ['.ppt', '.pptx', '.doc', '.docx']
+    fns = []
+    ret = []
+    if os.path.isdir(path):
+        for root, _, files in os.walk(path):
+            for file in files:
+                suffix = Path(file).suffix
+                if suffix in suffixes:
+                    fns.append((os.path.join(root, file)))
+    else:
+        fns.append(path)
+    reader = FileBasedDataReader()
+    temp_dir = tempfile.mkdtemp()
+    for fn in fns:
+        try:
+            convert_file_to_pdf(fn, temp_dir)
+        except ConvertToPdfError as e:
+            raise e
+        except FileNotFoundError as e:
+            raise e
+        except Exception as e:
+            raise e
+        fn_path = Path(fn)
+        pdf_fn = f"{temp_dir}/{fn_path.stem}.pdf"
+        ret.append(PymuDocDataset(reader.read(pdf_fn)))
+    shutil.rmtree(temp_dir)
+    return ret
+def read_local_images(path: str, suffixes: list[str]=['.png', '.jpg']) -> list[ImageDataset]:
     """Read images from path or directory.
     Args:
         path (str): image file path or directory that contains image files
-        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['jpg', 'png']
+        suffixes (list[str]): the suffixes of the image files used to filter the files. Example: ['.jpg', '.png']
     Returns:
         list[ImageDataset]: each image file will converted to a ImageDataset
@@ -82,12 +129,12 @@ def read_local_images(path: str, suffixes: list[str]) -> list[ImageDataset]:
     if os.path.isdir(path):
         imgs_bits = []
         s_suffixes = set(suffixes)
-        reader = FileBasedDataReader(path)
+        reader = FileBasedDataReader()
         for root, _, files in os.walk(path):
             for file in files:
-                suffix = file.split('.')
-                if suffix[-1] in s_suffixes:
-                    imgs_bits.append(reader.read(file))
+                suffix = Path(file).suffix
+                if suffix in s_suffixes:
+                    imgs_bits.append(reader.read(os.path.join(root, file)))
         return [ImageDataset(bits) for bits in imgs_bits]
     else:
         reader = FileBasedDataReader()

magic_pdf/data/utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import fitz
 import numpy as np
+from loguru import logger
 from magic_pdf.utils.annotations import ImportPIL
@@ -30,3 +31,37 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
     img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
     return img_dict
+@ImportPIL
+def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
+    from PIL import Image
+    images = []
+    with fitz.open('pdf', pdf_bytes) as doc:
+        pdf_page_num = doc.page_count
+        end_page_id = (
+            end_page_id
+            if end_page_id is not None and end_page_id >= 0
+            else pdf_page_num - 1
+        )
+        if end_page_id > pdf_page_num - 1:
+            logger.warning('end_page_id is out of range, use images length')
+            end_page_id = pdf_page_num - 1
+        for index in range(0, doc.page_count):
+            if start_page_id <= index <= end_page_id:
+                page = doc[index]
+                mat = fitz.Matrix(dpi / 72, dpi / 72)
+                pm = page.get_pixmap(matrix=mat, alpha=False)
+                # If the width or height exceeds 4500 after scaling, do not scale further.
+                if pm.width > 4500 or pm.height > 4500:
+                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+                img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
+                img = np.array(img)
+                img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
+            else:
+                img_dict = {'img': [], 'width': 0, 'height': 0}
+            images.append(img_dict)
+    return images

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -7,7 +7,7 @@ from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.para.para_split_v3 import ListLineTag
+from magic_pdf.post_proc.para_split_v3 import ListLineTag
 def __is_hyphen_at_line_end(line):
@@ -61,7 +61,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
         if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
-            para_text = f'# {merge_para_with_text(para_block)}'
+            title_level = get_title_level(para_block)
+            para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
         elif para_type == BlockType.InterlineEquation:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
@@ -125,16 +126,6 @@ def detect_language(text):
         return 'empty'
-# 连写字符拆分
-def __replace_ligatures(text: str):
-    text = re.sub(r'ﬁ', 'fi', text)  # 替换 fi 连写符
-    text = re.sub(r'ﬂ', 'fl', text)  # 替换 fl 连写符
-    text = re.sub(r'ﬀ', 'ff', text)  # 替换 ff 连写符
-    text = re.sub(r'ﬃ', 'ffi', text)  # 替换 ffi 连写符
-    text = re.sub(r'ﬄ', 'ffl', text)  # 替换 ffl 连写符
-    return text
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:
@@ -165,8 +156,8 @@ def merge_para_with_text(para_block):
             if content:
                 langs = ['zh', 'ja', 'ko']
                 # logger.info(f'block_lang: {block_lang}, content: {content}')
-                if block_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
-                    if j == len(line['spans']) - 1:
+                if block_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔,但是如果是行内公式结尾，还是要加空格
+                    if j == len(line['spans']) - 1 and span_type not in [ContentType.InlineEquation]:
                         para_text += content
                     else:
                         para_text += f'{content} '
@@ -196,10 +187,11 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
             'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.Title:
+        title_level = get_title_level(para_block)
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'text_level': 1,
+            'text_level': title_level,
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
@@ -299,3 +291,12 @@ def union_make(pdf_info_dict: list,
         return '\n\n'.join(output_content)
     elif make_mode == MakeMode.STANDARD_FORMAT:
         return output_content
+def get_title_level(block):
+    title_level = block.get('level', 1)
+    if title_level > 4:
+        title_level = 4
+    elif title_level < 1:
+        title_level = 1
+    return title_level

magic_pdf/filter/__init__.py CHANGED Viewed

@@ -0,0 +1,32 @@
+from magic_pdf.config.drop_reason import DropReason
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.filter.pdf_classify_by_type import classify as do_classify
+from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
+def classify(pdf_bytes: bytes) -> SupportedPdfParseMethod:
+    """根据pdf的元数据，判断是文本pdf，还是ocr pdf."""
+    pdf_meta = pdf_meta_scan(pdf_bytes)
+    if pdf_meta.get('_need_drop', False):  # 如果返回了需要丢弃的标志，则抛出异常
+        raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
+    else:
+        is_encrypted = pdf_meta['is_encrypted']
+        is_needs_password = pdf_meta['is_needs_password']
+        if is_encrypted or is_needs_password:  # 加密的，需要密码的，没有页面的，都不处理
+            raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
+        else:
+            is_text_pdf, results = do_classify(
+                pdf_meta['total_page'],
+                pdf_meta['page_width_pts'],
+                pdf_meta['page_height_pts'],
+                pdf_meta['image_info_per_page'],
+                pdf_meta['text_len_per_page'],
+                pdf_meta['imgs_per_page'],
+                pdf_meta['text_layout_per_page'],
+                pdf_meta['invalid_chars'],
+            )
+            if is_text_pdf:
+                return SupportedPdfParseMethod.TXT
+            else:
+                return SupportedPdfParseMethod.OCR

magic_pdf/filter/pdf_meta_scan.py CHANGED Viewed

@@ -8,7 +8,7 @@ from loguru import logger
 from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.libs.commons import get_top_percent_list, mymax
 from magic_pdf.libs.language import detect_lang
-from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf
+from magic_pdf.libs.pdf_check import detect_invalid_chars_by_pymupdf, detect_invalid_chars
 scan_max_page = 50
 junk_limit_min = 10
@@ -323,7 +323,8 @@ def get_language(doc: fitz.Document):
 def check_invalid_chars(pdf_bytes):
     """乱码检测."""
-    return detect_invalid_chars_by_pymupdf(pdf_bytes)
+    # return detect_invalid_chars_by_pymupdf(pdf_bytes)
+    return detect_invalid_chars(pdf_bytes)
 def pdf_meta_scan(pdf_bytes: bytes):

magic_pdf/libs/clean_memory.py CHANGED Viewed

@@ -3,8 +3,15 @@ import torch
 import gc
-def clean_memory():
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-        torch.cuda.ipc_collect()
+def clean_memory(device='cuda'):
+    if device == 'cuda':
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+    elif str(device).startswith("npu"):
+        import torch_npu
+        if torch_npu.npu.is_available():
+            torch_npu.npu.empty_cache()
+    elif str(device).startswith("mps"):
+        torch.mps.empty_cache()
     gc.collect()

magic_pdf/libs/config_reader.py CHANGED Viewed

@@ -116,6 +116,15 @@ def get_formula_config():
     else:
         return formula_config
+def get_llm_aided_config():
+    config = read_config()
+    llm_aided_config = config.get('llm-aided-config')
+    if llm_aided_config is None:
+        logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default")
+        return None
+    else:
+        return llm_aided_config
 if __name__ == '__main__':
     ak, sk, endpoint = get_s3_config('llm-raw')

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl