natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -11,7 +11,7 @@ from natural_pdf.elements.base import DirectionalMixin
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout

-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility

 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
@@ -1108,7 +1108,7 @@ class Region(DirectionalMixin):

         # Determine rendering resolution from parameters
         final_resolution = ocr_params.get("resolution")
-        if final_resolution is None and hasattr(self.page,
+        if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
             final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
         elif final_resolution is None:
             final_resolution = 150
@@ -1191,6 +1191,7 @@ class Region(DirectionalMixin):
             ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
             element_data["_char_dicts"] = [ocr_char_dict]
             from natural_pdf.elements.text import TextElement
+
             elem = TextElement(element_data, self.page)
             created_elements.append(elem)
             self.page._element_mgr.add_element(elem, element_type="words")
@@ -1692,7 +1693,7 @@ class Region(DirectionalMixin):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
-    ) -> "Region":
+    ) -> "Region":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements within this region
         using a user-provided callback function.
@@ -1722,9 +1723,9 @@ class Region(DirectionalMixin):

         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
-            elements=target_elements,
+            elements=target_elements,  # Pass the ElementCollection directly
             correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})",
+            caller_info=f"Region({self.bbox})",  # Pass caller info
         )

-        return self
+        return self  # Return self for chaining
natural_pdf/exporters/base.py
ADDED
@@ -0,0 +1,61 @@
+import abc
+import logging
+from typing import Union, List, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+
+logger = logging.getLogger(__name__)
+
+
+class FinetuneExporter(abc.ABC):
+    """
+    Abstract base class for exporting data suitable for fine-tuning models.
+    """
+
+    @abc.abstractmethod
+    def __init__(self, **kwargs):
+        """
+        Initialize the exporter with format-specific options.
+        """
+        pass
+
+    @abc.abstractmethod
+    def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
+        """
+        Exports the data from the source PDF(s) to the specified output directory
+        in a format suitable for fine-tuning a specific model type.
+
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+            **kwargs: Additional export-time arguments.
+        """
+        pass
+
+    def _resolve_source_pdfs(
+        self, source: Union["PDF", "PDFCollection", List["PDF"]]
+    ) -> List["PDF"]:
+        """
+        Helper to consistently resolve the input source to a list of PDF objects.
+        """
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
+        from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+
+        pdfs_to_process: List["PDF"] = []
+        if isinstance(source, PDF):
+            pdfs_to_process = [source]
+        elif isinstance(source, PDFCollection):
+            pdfs_to_process = source.pdfs
+        elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
+            pdfs_to_process = source
+        else:
+            raise TypeError(
+                f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
+            )
+
+        if not pdfs_to_process:
+            logger.warning("No PDF documents provided in the source.")
+
+        return pdfs_to_process
natural_pdf/exporters/paddleocr.py
ADDED
@@ -0,0 +1,345 @@
+import os
+import logging
+import random
+import shutil
+from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
+from tqdm import tqdm
+
+from natural_pdf.exporters.base import FinetuneExporter
+
+# Need to import this utility
+from natural_pdf.utils.identifiers import generate_short_path_hash
+
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.elements.text import TextElement
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]"  # Match manifest-import etc.
+
+
+class PaddleOCRRecognitionExporter(FinetuneExporter):
+    """
+    Exports data for fine-tuning a PaddleOCR text recognition model.
+
+    Creates a directory structure with cropped text images and label files
+    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
+    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
+    """
+
+    def __init__(
+        self,
+        resolution: int = 150,
+        padding: int = 2,
+        selector: Optional[str] = None,
+        corrected_only: bool = False,
+        split_ratio: Optional[float] = 0.9,
+        include_guide: bool = True,
+        random_seed: Optional[int] = 42,
+    ):
+        """
+        Initialize the PaddleOCR Recognition Exporter.
+
+        Args:
+            resolution: DPI resolution for rendering text region images (default: 150).
+            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
+            selector: CSS-like selector to filter which TextElements to export.
+                If None and corrected_only is False, all 'text' elements are considered.
+            corrected_only: If True, overrides selector and exports only elements likely
+                originating from a correction manifest (selector="text[source^=manifest]").
+                (default: False).
+            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
+                If None, creates a single `label.txt` file (default: 0.9).
+            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
+                in the output directory (default: True).
+            random_seed: Seed for the random number generator used for train/val split shuffling,
+                ensuring reproducibility (default: 42).
+        """
+        if corrected_only and selector:
+            logger.warning(
+                f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
+                f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
+            )
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif corrected_only:
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif selector:
+            self.selector = selector
+        else:
+            self.selector = "text"  # Default to all text elements if nothing else specified
+
+        self.resolution = resolution
+        self.padding = padding
+        self.split_ratio = split_ratio
+        self.include_guide = include_guide
+        self.random_seed = random_seed
+
+        logger.info(
+            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
+            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
+        )
+
+    def export(
+        self,
+        source: Union["PDF", "PDFCollection", List["PDF"]],
+        output_dir: str,
+        **kwargs,  # Allow for potential future args
+    ):
+        """
+        Exports text elements from the source PDF(s) to the specified output directory
+        in PaddleOCR text recognition format.
+
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+                The directory will be created if it doesn't exist.
+            **kwargs: Optional keyword arguments (currently unused).
+        """
+        # --- 1. Setup and Validation ---
+        pdfs_to_process = self._resolve_source_pdfs(source)
+        if not pdfs_to_process:
+            logger.error("No valid PDF sources found. Aborting export.")
+            return
+
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            images_dir = os.path.join(output_dir, "images")
+            os.makedirs(images_dir, exist_ok=True)
+        except OSError as e:
+            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
+            raise
+
+        # --- 2. Collect Elements and Render Images ---
+        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
+        char_set: Set[str] = set()
+        elements_processed = 0
+        elements_skipped = 0
+
+        logger.info(
+            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
+        )
+
+        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
+            # Need to ensure pdf.path exists and is string
+            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
+                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
+                continue
+            pdf_hash = generate_short_path_hash(pdf.path)
+            try:
+                # Find elements using the specified selector
+                # Need to check if pdf has find_all method
+                if not hasattr(pdf, "find_all"):
+                    logger.warning(
+                        f"PDF object {pdf.path} does not have find_all method. Skipping."
+                    )
+                    continue
+
+                elements = pdf.find_all(
+                    self.selector, apply_exclusions=False
+                )  # Usually want all text, even if excluded
+                if not elements:
+                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
+                    continue
+
+                for i, element in enumerate(
+                    tqdm(
+                        elements,
+                        desc=f"Exporting '{os.path.basename(pdf.path)}'",
+                        leave=False,
+                        position=1,
+                    )
+                ):
+                    # Ensure it's a TextElement with necessary methods/attributes
+                    # Removed check for to_image as it's called after expand()
+                    if not (
+                        hasattr(element, "page")
+                        and hasattr(element, "text")
+                        and hasattr(element, "expand")
+                    ):
+                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
+                        elements_skipped += 1
+                        continue
+
+                    element_text = element.text
+                    # Skip elements with no text, non-string text, or newlines
+                    if (
+                        not element_text
+                        or not isinstance(element_text, str)
+                        or "\n" in element_text
+                    ):
+                        if "\n" in str(element_text):
+                            reason = "contains newline"
+                        elif not element_text:
+                            reason = "empty text"
+                        else:
+                            reason = "invalid text type"
+                        logger.debug(
+                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
+                        )
+                        elements_skipped += 1
+                        continue
+
+                    # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
+                    page_index = getattr(element.page, "index", 0)
+                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
+                    relative_image_path = os.path.join("images", image_filename)
+                    absolute_image_path = os.path.join(output_dir, relative_image_path)
+
+                    try:
+                        # Expand region, render, and save image
+                        region = element.expand(self.padding)
+                        img = region.to_image(
+                            resolution=self.resolution, crop_only=True, include_highlights=False
+                        )
+                        img.save(absolute_image_path, "PNG")
+
+                        # Add to labels and character set
+                        labels.append(
+                            (relative_image_path.replace(os.path.sep, "/"), element_text)
+                        )  # Use forward slashes for labels
+                        char_set.update(element_text)
+                        elements_processed += 1
+
+                    except Exception as e:
+                        page_num_str = getattr(
+                            element.page, "number", "N/A"
+                        )  # Get page number safely
+                        logger.error(
+                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
+                            exc_info=False,  # Keep log cleaner
+                        )
+                        elements_skipped += 1
+
+            except Exception as e:
+                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
+                # Continue with other PDFs if possible
+
+        if elements_processed == 0:
+            logger.error(
+                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
+            )
+            # Clean up potentially created directories? Or leave them empty? Let's leave them.
+            return
+
+        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
+
+        # --- 3. Generate Dictionary File (`dict.txt`) ---
+        dict_path = os.path.join(output_dir, "dict.txt")
+        try:
+            # Log the character set before sorting/writing
+            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
+            # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
+            # for recognition models, but this might depend on the specific base model.
+            # Start with just the characters found.
+            sorted_chars = sorted(list(char_set), reverse=True)
+            with open(dict_path, "w", encoding="utf-8") as f_dict:
+                for char in sorted_chars:
+                    # Ensure we don't write empty strings or just newlines as dictionary entries
+                    if char and char != "\n":
+                        f_dict.write(char + "\n")
+            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
+        except Exception as e:
+            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
+            raise  # Re-raise as this is critical
+
+        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
+        if self.split_ratio is not None and 0 < self.split_ratio < 1:
+            if self.random_seed is not None:
+                random.seed(self.random_seed)
+            random.shuffle(labels)
+            split_index = int(len(labels) * self.split_ratio)
+            train_labels = labels[:split_index]
+            val_labels = labels[split_index:]
+
+            try:
+                train_path = os.path.join(output_dir, "train.txt")
+                with open(train_path, "w", encoding="utf-8") as f_train:
+                    for img_path, text in train_labels:
+                        f_train.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created training label file with {len(train_labels)} entries: {train_path}"
+                )
+
+                val_path = os.path.join(output_dir, "val.txt")
+                with open(val_path, "w", encoding="utf-8") as f_val:
+                    for img_path, text in val_labels:
+                        f_val.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
+                )
+            except Exception as e:
+                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
+                raise
+        else:
+            # Create a single label file
+            label_path = os.path.join(output_dir, "label.txt")
+            try:
+                with open(label_path, "w", encoding="utf-8") as f_label:
+                    for img_path, text in labels:
+                        f_label.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
+            except Exception as e:
+                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
+                raise
+
+        # --- 5. Include Guide Notebook ---
+        if self.include_guide:
+            self._copy_guide_notebook(output_dir)
+
+        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")
+
+    def _copy_guide_notebook(self, output_dir: str):
+        """Locates, converts (md->ipynb), and copies the guide notebook."""
+        try:
+            # Try importing conversion library
+            import jupytext
+            from nbformat import write as write_notebook
+        except ImportError:
+            logger.warning(
+                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
+                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
+            )
+            return
+
+        try:
+            # Locate the template .md file relative to this script
+            exporter_dir = os.path.dirname(os.path.abspath(__file__))
+            # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
+            template_dir = os.path.abspath(
+                os.path.join(exporter_dir, "..", "templates", "finetune")
+            )
+            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
+            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")
+
+            if not os.path.exists(template_md_path):
+                logger.error(
+                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
+                )
+                # Try path relative to workspace root as fallback if run from project root
+                alt_template_path = os.path.abspath(
+                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
+                )
+                if os.path.exists(alt_template_path):
+                    template_md_path = alt_template_path
+                    logger.info(f"Found guide template at alternate path: {template_md_path}")
+                else:
+                    logger.error(
+                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
+                    )
+                    return
+
+            # Convert Markdown to Notebook object using jupytext
+            logger.debug(f"Reading guide template from: {template_md_path}")
+            notebook = jupytext.read(template_md_path)  # Reads md and returns NotebookNode
+
+            # Write the Notebook object to the output .ipynb file
+            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
+            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
+                write_notebook(notebook, f_nb)
+
+            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")
+
+        except Exception as e:
+            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)
natural_pdf/ocr/__init__.py
CHANGED
@@ -11,7 +11,13 @@ logger = logging.getLogger("natural_pdf.ocr")

 # Import the base classes that are always available
 from .engine import OCREngine
-from .ocr_options import
+from .ocr_options import (
+    OCROptions,
+    BaseOCROptions,
+    EasyOCROptions,
+    PaddleOCROptions,
+    SuryaOCROptions,
+)
 from .ocr_manager import OCRManager
 from .ocr_factory import OCRFactory

@@ -22,13 +28,14 @@ __all__ = [
     "OCROptions",
     "BaseOCROptions",
     "EasyOCROptions",
-    "PaddleOCROptions",
+    "PaddleOCROptions",
     "SuryaOCROptions",
     "OCRFactory",
     "get_engine",
-    "list_available_engines"
+    "list_available_engines",
 ]

+
 def get_engine(engine_name=None, **kwargs):
     """
     Get OCR engine by name with graceful handling of missing dependencies.
@@ -40,27 +47,27 @@ def get_engine(engine_name=None, **kwargs):

     Returns:
         OCREngine instance
-
+
     Raises:
         ImportError: If the requested engine's dependencies aren't installed
         ValueError: If the engine_name is unknown
     """
     logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
-
+
     try:
         if engine_name is None or engine_name == "default":
             # Use the factory to get the best available engine
             engine = OCRFactory.get_recommended_engine(**kwargs)
             logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
             return engine
-
+
         # Use the factory to create a specific engine
         normalized_name = engine_name.lower()
         if normalized_name in ["easyocr", "paddle", "surya"]:
             return OCRFactory.create_engine(normalized_name, **kwargs)
         else:
             raise ValueError(f"Unknown OCR engine: {engine_name}")
-
+
     except ImportError as e:
         logger.error(f"OCR engine dependency error: {e}")
         raise
@@ -68,10 +75,11 @@ def get_engine(engine_name=None, **kwargs):
         logger.error(f"Error initializing OCR engine: {e}")
         raise

+
 def list_available_engines():
     """
     List all available OCR engines.
-
+
     Returns:
         Dict[str, bool]: Dictionary mapping engine names to availability status
     """