PyPI - magic-pdf - Versions diffs - 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl - Mend

magic-pdf 1.2.2 (py3-none-any.whl) → 1.3.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

magic_pdf/data/batch_build_dataset.py ADDED Viewed

@@ -0,0 +1,156 @@
+import concurrent.futures
+import fitz
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.data.utils import fitz_doc_to_image  # PyMuPDF
+def partition_array_greedy(arr, k):
+    """Partition an array into k parts using a simple greedy approach.
+    Parameters:
+    -----------
+    arr : list
+        The input array of integers
+    k : int
+        Number of partitions to create
+    Returns:
+    --------
+    partitions : list of lists
+        The k partitions of the array
+    """
+    # Handle edge cases
+    if k <= 0:
+        raise ValueError('k must be a positive integer')
+    if k > len(arr):
+        k = len(arr)  # Adjust k if it's too large
+    if k == 1:
+        return [list(range(len(arr)))]
+    if k == len(arr):
+        return [[i] for i in range(len(arr))]
+    # Sort the array in descending order
+    sorted_indices = sorted(range(len(arr)), key=lambda i: arr[i][1], reverse=True)
+    # Initialize k empty partitions
+    partitions = [[] for _ in range(k)]
+    partition_sums = [0] * k
+    # Assign each element to the partition with the smallest current sum
+    for idx in sorted_indices:
+        # Find the partition with the smallest sum
+        min_sum_idx = partition_sums.index(min(partition_sums))
+        # Add the element to this partition
+        partitions[min_sum_idx].append(idx)  # Store the original index
+        partition_sums[min_sum_idx] += arr[idx][1]
+    return partitions
+def process_pdf_batch(pdf_jobs, idx):
+    """Process a batch of PDF pages using multiple threads.
+    Parameters:
+    -----------
+    pdf_jobs : list of tuples
+        List of (pdf_path, page_num) tuples
+    output_dir : str or None
+        Directory to save images to
+    num_threads : int
+        Number of threads to use
+    **kwargs :
+        Additional arguments for process_pdf_page
+    Returns:
+    --------
+    images : list
+        List of processed images
+    """
+    images = []
+    for pdf_path, _ in pdf_jobs:
+        doc = fitz.open(pdf_path)
+        tmp = []
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            tmp.append(fitz_doc_to_image(page))
+        images.append(tmp)
+    return (idx, images)
+def batch_build_dataset(pdf_paths, k, lang=None):
+    """Process multiple PDFs by partitioning them into k balanced parts and
+    processing each part in parallel.
+    Parameters:
+    -----------
+    pdf_paths : list
+        List of paths to PDF files
+    k : int
+        Number of partitions to create
+    output_dir : str or None
+        Directory to save images to
+    threads_per_worker : int
+        Number of threads to use per worker
+    **kwargs :
+        Additional arguments for process_pdf_page
+    Returns:
+    --------
+    all_images : list
+        List of all processed images
+    """
+    # Get page counts for each PDF
+    pdf_info = []
+    total_pages = 0
+    for pdf_path in pdf_paths:
+        try:
+            doc = fitz.open(pdf_path)
+            num_pages = len(doc)
+            pdf_info.append((pdf_path, num_pages))
+            total_pages += num_pages
+            doc.close()
+        except Exception as e:
+            print(f'Error opening {pdf_path}: {e}')
+    # Partition the jobs based on page countEach job has 1 page
+    partitions = partition_array_greedy(pdf_info, k)
+    # Process each partition in parallel
+    all_images_h = {}
+    with concurrent.futures.ProcessPoolExecutor(max_workers=k) as executor:
+        # Submit one task per partition
+        futures = []
+        for sn, partition in enumerate(partitions):
+            # Get the jobs for this partition
+            partition_jobs = [pdf_info[idx] for idx in partition]
+            # Submit the task
+            future = executor.submit(
+                process_pdf_batch,
+                partition_jobs,
+                sn
+            )
+            futures.append(future)
+        # Process results as they complete
+        for i, future in enumerate(concurrent.futures.as_completed(futures)):
+            try:
+                idx, images = future.result()
+                all_images_h[idx] = images
+            except Exception as e:
+                print(f'Error processing partition: {e}')
+    results = [None] * len(pdf_paths)
+    for i in range(len(partitions)):
+        partition = partitions[i]
+        for j in range(len(partition)):
+            with open(pdf_info[partition[j]][0], 'rb') as f:
+                pdf_bytes = f.read()
+            dataset = PymuDocDataset(pdf_bytes, lang=lang)
+            dataset.set_images(all_images_h[i][j])
+            results[partition[j]] = dataset
+    return results

magic_pdf/data/dataset.py CHANGED Viewed

@@ -97,10 +97,10 @@ class Dataset(ABC):
     @abstractmethod
     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.
-        Args:
-            file_path (str): the file path
+        Args:
+            file_path (str): the file path
         """
         pass
@@ -119,7 +119,7 @@ class Dataset(ABC):
     @abstractmethod
     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.
         Returns:
             SupportedPdfParseMethod: _description_
@@ -128,8 +128,7 @@ class Dataset(ABC):
     @abstractmethod
     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         pass
@@ -144,16 +143,19 @@ class PymuDocDataset(Dataset):
         self._records = [Doc(v) for v in self._raw_fitz]
         self._data_bits = bits
         self._raw_data = bits
+        self._classify_result = None
         if lang == '':
             self._lang = None
         elif lang == 'auto':
-            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
+            from magic_pdf.model.sub_modules.language_detection.utils import \
+                auto_detect_lang
             self._lang = auto_detect_lang(bits)
-            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
+            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
         else:
             self._lang = lang
-            logger.info(f"lang: {lang}")
+            logger.info(f'lang: {lang}')
     def __len__(self) -> int:
         """The page number of the pdf."""
         return len(self._records)
@@ -186,12 +188,12 @@ class PymuDocDataset(Dataset):
         return self._records[page_id]
     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.
-        Args:
-            file_path (str): the file path
+        Args:
+            file_path (str): the file path
         """
         dir_name = os.path.dirname(file_path)
         if dir_name not in ('', '.', '..'):
             os.makedirs(dir_name, exist_ok=True)
@@ -212,21 +214,25 @@ class PymuDocDataset(Dataset):
         return proc(self, *args, **kwargs)
     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.
         Returns:
             SupportedPdfParseMethod: _description_
         """
-        return classify(self._data_bits)
+        if self._classify_result is None:
+            self._classify_result = classify(self._data_bits)
+        return self._classify_result
     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return PymuDocDataset(self._raw_data)
+    def set_images(self, images):
+        for i in range(len(self._records)):
+            self._records[i].set_image(images[i])
 class ImageDataset(Dataset):
-    def __init__(self, bits: bytes):
+    def __init__(self, bits: bytes, lang=None):
         """Initialize the dataset, which wraps the pymudoc documents.
         Args:
@@ -238,6 +244,17 @@ class ImageDataset(Dataset):
         self._raw_data = bits
         self._data_bits = pdf_bytes
+        if lang == '':
+            self._lang = None
+        elif lang == 'auto':
+            from magic_pdf.model.sub_modules.language_detection.utils import \
+                auto_detect_lang
+            self._lang = auto_detect_lang(bits)
+            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
+        else:
+            self._lang = lang
+            logger.info(f'lang: {lang}')
     def __len__(self) -> int:
         """The length of the dataset."""
         return len(self._records)
@@ -270,10 +287,10 @@ class ImageDataset(Dataset):
         return self._records[page_id]
     def dump_to_file(self, file_path: str):
-        """Dump the file
+        """Dump the file.
-        Args:
-            file_path (str): the file path
+        Args:
+            file_path (str): the file path
         """
         dir_name = os.path.dirname(file_path)
         if dir_name not in ('', '.', '..'):
@@ -293,7 +310,7 @@ class ImageDataset(Dataset):
         return proc(self, *args, **kwargs)
     def classify(self) -> SupportedPdfParseMethod:
-        """classify the dataset
+        """classify the dataset.
         Returns:
             SupportedPdfParseMethod: _description_
@@ -301,15 +318,19 @@ class ImageDataset(Dataset):
         return SupportedPdfParseMethod.OCR
     def clone(self):
-        """clone this dataset
-        """
+        """clone this dataset."""
         return ImageDataset(self._raw_data)
+    def set_images(self, images):
+        for i in range(len(self._records)):
+            self._records[i].set_image(images[i])
 class Doc(PageableData):
     """Initialized with pymudoc object."""
     def __init__(self, doc: fitz.Page):
         self._doc = doc
+        self._img = None
     def get_image(self):
         """Return the image info.
@@ -321,7 +342,17 @@ class Doc(PageableData):
                 height: int
             }
         """
-        return fitz_doc_to_image(self._doc)
+        if self._img is None:
+            self._img = fitz_doc_to_image(self._doc)
+        return self._img
+    def set_image(self, img):
+        """Store a pre-rendered image for this page.
+        The value is kept only when no image is cached yet; an image that
+        was already rendered or set earlier is never overwritten.
+        Args:
+            img: the image to cache. NOTE(review): get_image caches the
+                dict returned by fitz_doc_to_image, so callers presumably
+                pass that dict rather than a bare np.ndarray -- confirm.
+        """
+        if self._img is None:
+            self._img = img
     def get_doc(self) -> fitz.Page:
         """Get the pymudoc object.

magic_pdf/data/utils.py CHANGED Viewed

@@ -1,12 +1,15 @@
+import multiprocessing as mp
+import threading
+from concurrent.futures import (ProcessPoolExecutor, ThreadPoolExecutor,
+                                as_completed)
 import fitz
 import numpy as np
 from loguru import logger
-from magic_pdf.utils.annotations import ImportPIL
-@ImportPIL
 def fitz_doc_to_image(doc, dpi=200) -> dict:
     """Convert fitz.Document to image, Then convert the image to numpy array.
@@ -17,7 +20,6 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
     Returns:
         dict:  {'img': numpy array, 'width': width, 'height': height }
     """
-    from PIL import Image
     mat = fitz.Matrix(dpi / 72, dpi / 72)
     pm = doc.get_pixmap(matrix=mat, alpha=False)
@@ -25,16 +27,14 @@ def fitz_doc_to_image(doc, dpi=200) -> dict:
     if pm.width > 4500 or pm.height > 4500:
         pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
-    img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
-    img = np.array(img)
+    # Convert pixmap samples directly to numpy array
+    img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
     img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
     return img_dict
-@ImportPIL
 def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
-    from PIL import Image
     images = []
     with fitz.open('pdf', pdf_bytes) as doc:
         pdf_page_num = doc.page_count
@@ -57,11 +57,110 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id
                 if pm.width > 4500 or pm.height > 4500:
                     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
-                img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
-                img = np.array(img)
+                # Convert pixmap samples directly to numpy array
+                img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)
                 img_dict = {'img': img, 'width': pm.width, 'height': pm.height}
             else:
                 img_dict = {'img': [], 'width': 0, 'height': 0}
             images.append(img_dict)
     return images
+def convert_page(bytes_page):
+    pdfs = fitz.open('pdf', bytes_page)
+    page = pdfs[0]
+    return fitz_doc_to_image(page)
+def parallel_process_pdf_safe(pages, num_workers=None, **kwargs):
+    """Process PDF pages in parallel with serialization-safe approach."""
+    if num_workers is None:
+        num_workers = mp.cpu_count()
+    # Process the extracted page data in parallel
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        # Process the page data
+        results = list(
+            executor.map(convert_page, pages)
+        )
+    return results
+def threaded_process_pdf(pdf_path, num_threads=4, **kwargs):
+    """Process all pages of a PDF using multiple threads.
+    Parameters:
+    -----------
+    pdf_path : str
+        Path to the PDF file
+    num_threads : int
+        Number of threads to use
+    **kwargs :
+        Additional arguments for fitz_doc_to_image
+    Returns:
+    --------
+    images : list
+        List of processed images, in page order
+    """
+    # Open the PDF
+    doc = fitz.open(pdf_path)
+    num_pages = len(doc)
+    # Create a list to store results in the correct order
+    results = [None] * num_pages
+    # Create a thread pool
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        # Submit all tasks
+        futures = {}
+        for page_num in range(num_pages):
+            page = doc[page_num]
+            future = executor.submit(fitz_doc_to_image, page, **kwargs)
+            futures[future] = page_num
+        # Process results as they complete with progress bar
+        for future in as_completed(futures):
+            page_num = futures[future]
+            try:
+                results[page_num] = future.result()
+            except Exception as e:
+                print(f'Error processing page {page_num}: {e}')
+                results[page_num] = None
+    # Close the document
+    doc.close()
+if __name__ == '__main__':
+    pdf = fitz.open('/tmp/[MS-DOC].pdf')
+    pdf_page = [fitz.open() for i in range(pdf.page_count)]
+    [pdf_page[i].insert_pdf(pdf, from_page=i, to_page=i) for i in range(pdf.page_count)]
+    pdf_page = [v.tobytes() for v in pdf_page]
+    results = parallel_process_pdf_safe(pdf_page, num_workers=16)
+    # threaded_process_pdf('/tmp/[MS-DOC].pdf', num_threads=16)
+    """ benchmark results of multi-threaded processing (fitz page to image)
+    total page nums: 578
+    thread nums,    time cost
+    1               7.351 sec
+    2               6.334 sec
+    4               5.968 sec
+    8               6.728 sec
+    16              8.085 sec
+    """
+    """ benchmark results of multi-processor processing (fitz page to image)
+    total page nums: 578
+    processor nums,    time cost
+    1                  17.170 sec
+    2                  10.170 sec
+    4                  7.841 sec
+    8                  7.900 sec
+    16                 7.984 sec
+    """

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -208,12 +208,13 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
             'text': merge_para_with_text(para_block),
         }
     elif para_type == BlockType.Title:
-        title_level = get_title_level(para_block)
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'text_level': title_level,
         }
+        title_level = get_title_level(para_block)
+        if title_level != 0:
+            para_content['text_level'] = title_level
     elif para_type == BlockType.InterlineEquation:
         para_content = {
             'type': 'equation',
@@ -319,5 +320,5 @@ def get_title_level(block):
     if title_level > 4:
         title_level = 4
     elif title_level < 1:
-        title_level = 1
+        title_level = 0
     return title_level

magic_pdf/libs/pdf_image_tools.py CHANGED Viewed

@@ -44,14 +44,19 @@ def cut_image_to_pil_image(bbox: tuple, page: fitz.Page, mode="pillow"):
     # 截取图片
     pix = page.get_pixmap(clip=rect, matrix=zoom)
-    # 将字节数据转换为文件对象
-    image_file = BytesIO(pix.tobytes(output='png'))
-    # 使用 Pillow 打开图像
-    pil_image = Image.open(image_file)
     if mode == "cv2":
-        image_result = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
+        # 直接转换为numpy数组供cv2使用
+        img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+        # PyMuPDF使用RGB顺序，而cv2使用BGR顺序
+        if pix.n == 3 or pix.n == 4:
+            image_result = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
+        else:
+            image_result = img_array
     elif mode == "pillow":
-        image_result = pil_image
+        # 将字节数据转换为文件对象
+        image_file = BytesIO(pix.tobytes(output='png'))
+        # 使用 Pillow 打开图像
+        image_result = Image.open(image_file)
     else:
         raise ValueError(f"mode: {mode} is not supported.")

magic_pdf/libs/performance_stats.py CHANGED Viewed

@@ -48,7 +48,18 @@ def measure_time(func):
         start_time = time.time()
         result = func(*args, **kwargs)
         execution_time = time.time() - start_time
-        PerformanceStats.add_execution_time(func.__name__, execution_time)
+        # 获取更详细的函数标识
+        if hasattr(func, "__self__"):  # 实例方法
+            class_name = func.__self__.__class__.__name__
+            full_name = f"{class_name}.{func.__name__}"
+        elif hasattr(func, "__qualname__"):  # 类方法或静态方法
+            full_name = func.__qualname__
+        else:
+            module_name = func.__module__
+            full_name = f"{module_name}.{func.__name__}"
+        PerformanceStats.add_execution_time(full_name, execution_time)
         return result
     return wrapper

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "1.2.2"
1	+ __version__ = "1.3.1"

magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

magic-pdf 1.2.2 (py3-none-any.whl) → 1.3.1 (py3-none-any.whl)