PyPI - natural-pdf - Versions diffs - 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl - Mend

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

docs/finetuning/index.md +176 -0
docs/ocr/index.md +34 -47
docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
docs/tutorials/02-finding-elements.ipynb +42 -42
docs/tutorials/03-extracting-blocks.ipynb +17 -17
docs/tutorials/04-table-extraction.ipynb +12 -12
docs/tutorials/05-excluding-content.ipynb +30 -30
docs/tutorials/06-document-qa.ipynb +28 -28
docs/tutorials/07-layout-analysis.ipynb +63 -35
docs/tutorials/07-working-with-regions.ipynb +55 -51
docs/tutorials/07-working-with-regions.md +2 -2
docs/tutorials/08-spatial-navigation.ipynb +60 -60
docs/tutorials/09-section-extraction.ipynb +113 -113
docs/tutorials/10-form-field-extraction.ipynb +78 -50
docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
docs/tutorials/12-ocr-integration.ipynb +149 -131
docs/tutorials/12-ocr-integration.md +0 -13
docs/tutorials/13-semantic-search.ipynb +313 -873
natural_pdf/__init__.py +21 -22
natural_pdf/analyzers/layout/gemini.py +280 -0
natural_pdf/analyzers/layout/layout_manager.py +28 -1
natural_pdf/analyzers/layout/layout_options.py +11 -0
natural_pdf/analyzers/layout/yolo.py +6 -2
natural_pdf/collections/pdf_collection.py +24 -0
natural_pdf/core/element_manager.py +18 -13
natural_pdf/core/page.py +174 -36
natural_pdf/core/pdf.py +156 -42
natural_pdf/elements/base.py +9 -17
natural_pdf/elements/collections.py +99 -38
natural_pdf/elements/region.py +77 -37
natural_pdf/elements/text.py +5 -0
natural_pdf/exporters/__init__.py +4 -0
natural_pdf/exporters/base.py +61 -0
natural_pdf/exporters/paddleocr.py +345 -0
natural_pdf/ocr/__init__.py +57 -36
natural_pdf/ocr/engine.py +160 -49
natural_pdf/ocr/engine_easyocr.py +178 -157
natural_pdf/ocr/engine_paddle.py +114 -189
natural_pdf/ocr/engine_surya.py +87 -144
natural_pdf/ocr/ocr_factory.py +125 -0
natural_pdf/ocr/ocr_manager.py +65 -89
natural_pdf/ocr/ocr_options.py +8 -13
natural_pdf/ocr/utils.py +113 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +34 -0
natural_pdf/utils/identifiers.py +33 -0
natural_pdf/utils/packaging.py +485 -0
natural_pdf/utils/text_extraction.py +44 -64
natural_pdf/utils/visualization.py +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
natural_pdf/templates/ocr_debug.html +0 -517
tests/test_loading.py +0 -50
tests/test_optional_deps.py +0 -298
{natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0

natural_pdf/exporters/paddleocr.py ADDED Viewed

@@ -0,0 +1,345 @@
+import os
+import logging
+import random
+import shutil
+from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
+from tqdm import tqdm
+from natural_pdf.exporters.base import FinetuneExporter
+# Need to import this utility
+from natural_pdf.utils.identifiers import generate_short_path_hash
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.elements.text import TextElement
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+# Selector substituted by PaddleOCRRecognitionExporter whenever corrected_only=True.
+# NOTE(review): "^=" is presumably a starts-with match on the element's `source` - confirm against the selector parser.
+DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]"  # Match manifest-import etc.
+class PaddleOCRRecognitionExporter(FinetuneExporter):
+    """
+    Exports data for fine-tuning a PaddleOCR text recognition model.
+    Creates a directory structure with cropped text images and label files
+    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
+    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
+    """
+    def __init__(
+        self,
+        resolution: int = 150,
+        padding: int = 2,
+        selector: Optional[str] = None,
+        corrected_only: bool = False,
+        split_ratio: Optional[float] = 0.9,
+        include_guide: bool = True,
+        random_seed: Optional[int] = 42,
+    ):
+        """
+        Initialize the PaddleOCR Recognition Exporter.
+        Args:
+            resolution: DPI resolution for rendering text region images (default: 150).
+            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
+            selector: CSS-like selector to filter which TextElements to export.
+                      If None and corrected_only is False, all 'text' elements are considered.
+            corrected_only: If True, overrides selector and exports only elements likely
+                            originating from a correction manifest (selector="text[source^=manifest]").
+                            (default: False).
+            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
+                         If None, creates a single `label.txt` file (default: 0.9).
+            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
+                           in the output directory (default: True).
+            random_seed: Seed for the random number generator used for train/val split shuffling,
+                         ensuring reproducibility (default: 42).
+        """
+        if corrected_only and selector:
+            logger.warning(
+                f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
+                f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
+            )
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif corrected_only:
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif selector:
+            self.selector = selector
+        else:
+            self.selector = "text"  # Default to all text elements if nothing else specified
+        self.resolution = resolution
+        self.padding = padding
+        self.split_ratio = split_ratio
+        self.include_guide = include_guide
+        self.random_seed = random_seed
+        logger.info(
+            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
+            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
+        )
+    def export(
+        self,
+        source: Union["PDF", "PDFCollection", List["PDF"]],
+        output_dir: str,
+        **kwargs,  # Allow for potential future args
+    ):
+        """
+        Exports text elements from the source PDF(s) to the specified output directory
+        in PaddleOCR text recognition format.
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+                        The directory will be created if it doesn't exist.
+            **kwargs: Optional keyword arguments (currently unused).
+        """
+        # --- 1. Setup and Validation ---
+        pdfs_to_process = self._resolve_source_pdfs(source)
+        if not pdfs_to_process:
+            logger.error("No valid PDF sources found. Aborting export.")
+            return
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            images_dir = os.path.join(output_dir, "images")
+            os.makedirs(images_dir, exist_ok=True)
+        except OSError as e:
+            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
+            raise
+        # --- 2. Collect Elements and Render Images ---
+        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
+        char_set: Set[str] = set()
+        elements_processed = 0
+        elements_skipped = 0
+        logger.info(
+            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
+        )
+        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
+            # Need to ensure pdf.path exists and is string
+            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
+                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
+                continue
+            pdf_hash = generate_short_path_hash(pdf.path)
+            try:
+                # Find elements using the specified selector
+                # Need to check if pdf has find_all method
+                if not hasattr(pdf, "find_all"):
+                    logger.warning(
+                        f"PDF object {pdf.path} does not have find_all method. Skipping."
+                    )
+                    continue
+                elements = pdf.find_all(
+                    self.selector, apply_exclusions=False
+                )  # Usually want all text, even if excluded
+                if not elements:
+                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
+                    continue
+                for i, element in enumerate(
+                    tqdm(
+                        elements,
+                        desc=f"Exporting '{os.path.basename(pdf.path)}'",
+                        leave=False,
+                        position=1,
+                    )
+                ):
+                    # Ensure it's a TextElement with necessary methods/attributes
+                    # Removed check for to_image as it's called after expand()
+                    if not (
+                        hasattr(element, "page")
+                        and hasattr(element, "text")
+                        and hasattr(element, "expand")
+                    ):
+                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
+                        elements_skipped += 1
+                        continue
+                    element_text = element.text
+                    # Skip elements with no text, non-string text, or newlines
+                    if (
+                        not element_text
+                        or not isinstance(element_text, str)
+                        or "\n" in element_text
+                    ):
+                        if "\n" in str(element_text):
+                            reason = "contains newline"
+                        elif not element_text:
+                            reason = "empty text"
+                        else:
+                            reason = "invalid text type"
+                        logger.debug(
+                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
+                        )
+                        elements_skipped += 1
+                        continue
+                    # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
+                    page_index = getattr(element.page, "index", 0)
+                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
+                    relative_image_path = os.path.join("images", image_filename)
+                    absolute_image_path = os.path.join(output_dir, relative_image_path)
+                    try:
+                        # Expand region, render, and save image
+                        region = element.expand(self.padding)
+                        img = region.to_image(
+                            resolution=self.resolution, crop_only=True, include_highlights=False
+                        )
+                        img.save(absolute_image_path, "PNG")
+                        # Add to labels and character set
+                        labels.append(
+                            (relative_image_path.replace(os.path.sep, "/"), element_text)
+                        )  # Use forward slashes for labels
+                        char_set.update(element_text)
+                        elements_processed += 1
+                    except Exception as e:
+                        page_num_str = getattr(
+                            element.page, "number", "N/A"
+                        )  # Get page number safely
+                        logger.error(
+                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
+                            exc_info=False,  # Keep log cleaner
+                        )
+                        elements_skipped += 1
+            except Exception as e:
+                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
+                # Continue with other PDFs if possible
+        if elements_processed == 0:
+            logger.error(
+                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
+            )
+            # Clean up potentially created directories? Or leave them empty? Let's leave them.
+            return
+        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
+        # --- 3. Generate Dictionary File (`dict.txt`) ---
+        dict_path = os.path.join(output_dir, "dict.txt")
+        try:
+            # Log the character set before sorting/writing
+            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
+            # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
+            # for recognition models, but this might depend on the specific base model.
+            # Start with just the characters found.
+            sorted_chars = sorted(list(char_set), reverse=True)
+            with open(dict_path, "w", encoding="utf-8") as f_dict:
+                for char in sorted_chars:
+                    # Ensure we don't write empty strings or just newlines as dictionary entries
+                    if char and char != "\n":
+                        f_dict.write(char + "\n")
+            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
+        except Exception as e:
+            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
+            raise  # Re-raise as this is critical
+        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
+        if self.split_ratio is not None and 0 < self.split_ratio < 1:
+            if self.random_seed is not None:
+                random.seed(self.random_seed)
+            random.shuffle(labels)
+            split_index = int(len(labels) * self.split_ratio)
+            train_labels = labels[:split_index]
+            val_labels = labels[split_index:]
+            try:
+                train_path = os.path.join(output_dir, "train.txt")
+                with open(train_path, "w", encoding="utf-8") as f_train:
+                    for img_path, text in train_labels:
+                        f_train.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created training label file with {len(train_labels)} entries: {train_path}"
+                )
+                val_path = os.path.join(output_dir, "val.txt")
+                with open(val_path, "w", encoding="utf-8") as f_val:
+                    for img_path, text in val_labels:
+                        f_val.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
+                )
+            except Exception as e:
+                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
+                raise
+        else:
+            # Create a single label file
+            label_path = os.path.join(output_dir, "label.txt")
+            try:
+                with open(label_path, "w", encoding="utf-8") as f_label:
+                    for img_path, text in labels:
+                        f_label.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
+            except Exception as e:
+                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
+                raise
+        # --- 5. Include Guide Notebook ---
+        if self.include_guide:
+            self._copy_guide_notebook(output_dir)
+        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")
+    def _copy_guide_notebook(self, output_dir: str):
+        """Locates, converts (md->ipynb), and copies the guide notebook."""
+        try:
+            # Try importing conversion library
+            import jupytext
+            from nbformat import write as write_notebook
+        except ImportError:
+            logger.warning(
+                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
+                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
+            )
+            return
+        try:
+            # Locate the template .md file relative to this script
+            exporter_dir = os.path.dirname(os.path.abspath(__file__))
+            # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
+            template_dir = os.path.abspath(
+                os.path.join(exporter_dir, "..", "templates", "finetune")
+            )
+            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
+            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")
+            if not os.path.exists(template_md_path):
+                logger.error(
+                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
+                )
+                # Try path relative to workspace root as fallback if run from project root
+                alt_template_path = os.path.abspath(
+                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
+                )
+                if os.path.exists(alt_template_path):
+                    template_md_path = alt_template_path
+                    logger.info(f"Found guide template at alternate path: {template_md_path}")
+                else:
+                    logger.error(
+                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
+                    )
+                    return
+            # Convert Markdown to Notebook object using jupytext
+            logger.debug(f"Reading guide template from: {template_md_path}")
+            notebook = jupytext.read(template_md_path)  # Reads md and returns NotebookNode
+            # Write the Notebook object to the output .ipynb file
+            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
+            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
+                write_notebook(notebook, f_nb)
+            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")
+        except Exception as e:
+            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)

natural_pdf/ocr/__init__.py CHANGED Viewed

@@ -8,58 +8,79 @@ import logging
 # Set up module logger
 logger = logging.getLogger("natural_pdf.ocr")
+# Import the base classes that are always available
 from .engine import OCREngine
-from .engine_paddle import PaddleOCREngine
-from .engine_surya import SuryaOCREngine
+from .ocr_options import (
+    OCROptions,
+    BaseOCROptions,
+    EasyOCROptions,
+    PaddleOCROptions,
+    SuryaOCROptions,
+)
 from .ocr_manager import OCRManager
-from .ocr_options import OCROptions
+from .ocr_factory import OCRFactory
+# Add all public symbols that should be available when importing this module
 __all__ = [
     "OCRManager",
     "OCREngine",
     "OCROptions",
-    "EasyOCREngine",
-    "PaddleOCREngine",
-    "SuryaOCREngine",
+    "BaseOCROptions",
+    "EasyOCROptions",
+    "PaddleOCROptions",
+    "SuryaOCROptions",
+    "OCRFactory",
+    "get_engine",
+    "list_available_engines",
 ]
-DEFAULT_ENGINE = SuryaOCREngine
 def get_engine(engine_name=None, **kwargs):
     """
-    Get OCR engine by name.
+    Get OCR engine by name with graceful handling of missing dependencies.
     Args:
-        engine_name: Name of the engine to use ('easyocr', 'paddleocr', etc.)
-                     If None, the default engine is used (PaddleOCR if available, otherwise EasyOCR)
+        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya')
+                     If None, the best available engine is used
         **kwargs: Additional arguments to pass to the engine constructor
     Returns:
         OCREngine instance
+    Raises:
+        ImportError: If the requested engine's dependencies aren't installed
+        ValueError: If the engine_name is unknown
+    """
+    logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
+    try:
+        # None and "default" both leave the choice of engine to the factory.
+        if engine_name is None or engine_name == "default":
+            # Use the factory to get the best available engine
+            engine = OCRFactory.get_recommended_engine(**kwargs)
+            logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
+            return engine
+        # Use the factory to create a specific engine
+        # Names are lower-cased first, so matching is case-insensitive.
+        normalized_name = engine_name.lower()
+        if normalized_name in ["easyocr", "paddle", "surya"]:
+            return OCRFactory.create_engine(normalized_name, **kwargs)
+        else:
+            raise ValueError(f"Unknown OCR engine: {engine_name}")
+    # Failures (including the ValueError above) are logged and then re-raised unchanged.
+    except ImportError as e:
+        logger.error(f"OCR engine dependency error: {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Error initializing OCR engine: {e}")
+        raise
+def list_available_engines():
+    """
+    List all available OCR engines.
+    Returns:
+        Dict[str, bool]: Dictionary mapping engine names to availability status
     """
-    logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
-    if engine_name is None or engine_name == "default":
-        engine = DEFAULT_ENGINE(**kwargs)
-        logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
-        return engine
-    if engine_name.lower() == "easyocr":
-        logger.info("Initializing EasyOCR engine")
-        return EasyOCREngine(**kwargs)
-    if engine_name.lower() == "paddleocr":
-        try:
-            from .engine_paddle import PaddleOCREngine
-            logger.info("Initializing PaddleOCR engine")
-            return PaddleOCREngine(**kwargs)
-        except ImportError:
-            logger.error("PaddleOCR is not installed")
-            raise ImportError(
-                "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
-            )
-    logger.error(f"Unknown OCR engine: {engine_name}")
-    raise ValueError(f"Unknown OCR engine: {engine_name}")
+    return OCRFactory.list_available_engines()

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl