PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.7py3-none-any.whl → 0.1.8py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +2 -0
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +321 -15
natural_pdf/core/element_manager.py +67 -0
natural_pdf/core/page.py +227 -64
natural_pdf/core/pdf.py +387 -378
natural_pdf/elements/collections.py +272 -41
natural_pdf/elements/region.py +99 -15
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_manager.py +85 -25
natural_pdf/ocr/ocr_options.py +33 -10
natural_pdf/ocr/utils.py +14 -3
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/text_extraction.py +52 -1
natural_pdf/utils/tqdm_utils.py +43 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/core/page.py CHANGED Viewed

@@ -6,14 +6,19 @@ import logging
 import os
 import re
 import tempfile
+import time # Import time
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+import concurrent.futures # Added import
+from tqdm.auto import tqdm # Added tqdm import
+import threading
 import pdfplumber
 from PIL import Image, ImageDraw
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
+from natural_pdf.utils.locks import pdf_render_lock  # Import from utils instead
 if TYPE_CHECKING:
     import pdfplumber
@@ -46,10 +51,20 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
 from natural_pdf.qa import DocumentQA, get_qa_engine
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
+# --- Classification Imports --- #
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.classification.manager import ClassificationManager # For type hint
+# --- End Classification Imports --- #
+from natural_pdf.utils.locks import pdf_render_lock # Import the lock
+from natural_pdf.elements.base import Element # Import base element
+from natural_pdf.classification.mixin import ClassificationMixin # Import classification mixin
+from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
 logger = logging.getLogger(__name__)
-class Page:
+class Page(ClassificationMixin, ExtractionMixin):
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.
@@ -73,14 +88,21 @@ class Page:
         self._text_styles = None  # Lazy-loaded text style analyzer results
         self._exclusions = []  # List to store exclusion functions/regions
+        # --- ADDED --- Metadata store for mixins
+        self.metadata: Dict[str, Any] = {}
+        # --- END ADDED ---
         # Region management
         self._regions = {
             "detected": [],  # Layout detection results
             "named": {},  # Named regions (name -> region)
         }
-        # Initialize ElementManager
-        self._element_mgr = ElementManager(self, font_attrs)
+        # Initialize ElementManager, passing font_attrs
+        self._element_mgr = ElementManager(self, font_attrs=font_attrs)
+        # self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
+        # --- NEW --- Central registry for analysis results
+        self.analyses: Dict[str, Any] = {}
         # --- Get OCR Manager Instance ---
         if (
@@ -115,6 +137,8 @@ class Page:
         # Initialize the internal variable with a single underscore
         self._layout_analyzer = None
+        self._load_elements()
     @property
     def pdf(self) -> "PDF":
         """Provides public access to the parent PDF object."""
@@ -1257,38 +1281,48 @@ class Page:
         """
         image = None
         render_resolution = resolution if resolution is not None else scale * 72
+        thread_id = threading.current_thread().name
+        logger.debug(f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image...")
+        lock_wait_start = time.monotonic()
         try:
-            if include_highlights:
-                # Delegate rendering to the central service
-                image = self._highlighter.render_page(
-                    page_index=self.index,
-                    scale=scale,  # Note: scale is used by highlighter internally for drawing
-                    labels=labels,
-                    legend_position=legend_position,
-                    render_ocr=render_ocr,
-                    resolution=render_resolution,  # Pass the calculated resolution
-                    **kwargs,
-                )
-            else:
-                # Get the base page image directly from pdfplumber if no highlights needed
-                # Use the underlying pdfplumber page object
-                img_object = self._page.to_image(resolution=render_resolution, **kwargs)
-                # Access the PIL image directly (assuming pdfplumber structure)
-                image = (
-                    img_object.annotated
-                    if hasattr(img_object, "annotated")
-                    else img_object._repr_png_()
-                )
-                if isinstance(image, bytes):  # Handle cases where it returns bytes
-                    from io import BytesIO
+            # Acquire the global PDF rendering lock
+            with pdf_render_lock:
+                lock_acquired_time = time.monotonic()
+                logger.debug(f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render...")
+                if include_highlights:
+                    # Delegate rendering to the central service
+                    image = self._highlighter.render_page(
+                        page_index=self.index,
+                        scale=scale,  # Note: scale is used by highlighter internally for drawing
+                        labels=labels,
+                        legend_position=legend_position,
+                        render_ocr=render_ocr,
+                        resolution=render_resolution,  # Pass the calculated resolution
+                        **kwargs,
+                    )
+                else:
+                    # Get the base page image directly from pdfplumber if no highlights needed
+                    # Use the underlying pdfplumber page object
+                    img_object = self._page.to_image(resolution=render_resolution, **kwargs)
+                    # Access the PIL image directly (assuming pdfplumber structure)
+                    image = (
+                        img_object.annotated
+                        if hasattr(img_object, "annotated")
+                        else img_object._repr_png_()
+                    )
+                    if isinstance(image, bytes):  # Handle cases where it returns bytes
+                        from io import BytesIO
-                    image = Image.open(BytesIO(image)).convert(
-                        "RGB"
-                    )  # Convert to RGB for consistency
+                        image = Image.open(BytesIO(image)).convert(
+                            "RGB"
+                        )  # Convert to RGB for consistency
         except Exception as e:
             logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
             return None  # Return None on error
+        finally:
+            render_end_time = time.monotonic()
+            logger.debug(f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s")
         if image is None:
             return None
@@ -1384,6 +1418,7 @@ class Page:
         resolution: Optional[int] = None,
         detect_only: bool = False,
         apply_exclusions: bool = True,
+        replace: bool = True,
     ) -> "Page":
         """
         Apply OCR to THIS page and add results to page elements via PDF.apply_ocr.
@@ -1397,13 +1432,21 @@ class Page:
             resolution: DPI resolution for rendering page image before OCR.
             apply_exclusions: If True (default), render page image for OCR
                               with excluded areas masked (whited out).
+            detect_only: If True, only detect text bounding boxes, don't perform OCR.
+            replace: If True (default), remove any existing OCR elements before
+                    adding new ones. If False, add new OCR elements to existing ones.
         Returns:
-            List of created TextElements derived from OCR results for this page.
+            Self for method chaining.
         """
         if not hasattr(self._parent, "apply_ocr"):
             logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr'. Cannot apply OCR.")
-            return []  # Return empty list for consistency
+            return self  # Return self for chaining
+        # Remove existing OCR elements if replace is True
+        if replace and hasattr(self, "_element_mgr"):
+            logger.info(f"Page {self.number}: Removing existing OCR elements before applying new OCR.")
+            self._element_mgr.remove_ocr_elements()
         logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr.")
         try:
@@ -1419,18 +1462,13 @@ class Page:
                 resolution=resolution,
                 detect_only=detect_only,
                 apply_exclusions=apply_exclusions,
+                replace=replace,  # Pass the replace parameter to PDF.apply_ocr
             )
         except Exception as e:
             logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
-            return []
+            return self  # Return self for chaining
-        # Return the OCR elements specifically added to this page
-        ocr_elements = [el for el in self.words if getattr(el, "source", None) == "ocr"]
-        logger.debug(
-            f"Page {self.number}: apply_ocr completed. Found {len(ocr_elements)} OCR elements."
-        )
-        # Note: The method is typed to return Page for chaining, but the log indicates
-        # finding elements. Let's stick to returning self for chaining consistency.
+        # Return self for chaining
         return self
     def extract_ocr_elements(
@@ -1471,11 +1509,13 @@ class Page:
         try:
             # Get base image without highlights using the determined resolution
-            image = self.to_image(resolution=final_resolution, include_highlights=False)
-            if not image:
-                logger.error(f"  Failed to render page {self.number} to image for OCR extraction.")
-                return []
-            logger.debug(f"  Rendered image size: {image.width}x{image.height}")
+            # Use the global PDF rendering lock
+            with pdf_render_lock:
+                image = self.to_image(resolution=final_resolution, include_highlights=False)
+                if not image:
+                    logger.error(f"  Failed to render page {self.number} to image for OCR extraction.")
+                    return []
+                logger.debug(f"  Rendered image size: {image.width}x{image.height}")
         except Exception as e:
             logger.error(f"  Failed to render page {self.number} to image: {e}", exc_info=True)
             return []
@@ -2027,43 +2067,166 @@ class Page:
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
+        progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
     ) -> "Page":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements on this page
-        using a user-provided callback function.
+        using a user-provided callback function, potentially in parallel.
-        Finds text elements on this page whose 'source' attribute starts
-        with 'ocr' and calls the `correction_callback` for each, passing the
-        element itself.
-        The `correction_callback` should contain the logic to:
-        1. Determine if the element needs correction.
-        2. Perform the correction (e.g., call an LLM).
-        3. Return the new text (`str`) or `None`.
-        If the callback returns a string, the element's `.text` is updated.
-        Metadata updates (source, confidence, etc.) should happen within the callback.
+        Finds text elements on this page matching the selector
+        `text[source=ocr]` (exclusions are not applied) and calls the
+        `correction_callback` for each, passing the element itself.
+        The element's `.text` is updated only when the callback returns a
+        string that differs from the current text. A `None` result leaves the
+        element unchanged; any other return type is logged and ignored.
+        Exceptions raised by the callback are logged and counted, and the
+        remaining elements are still processed.
         Args:
             correction_callback: A function accepting an element and returning
                                  `Optional[str]` (new text or None).
+            max_workers: The maximum number of threads to use for parallel execution.
+                         If None or not greater than 1, runs sequentially.
+            progress_callback: Optional callback function to call after processing each element.
+                         With max_workers > 1 it is invoked from the worker threads.
+                         Exceptions it raises are logged and do not stop processing.
         Returns:
             Self for method chaining.
         """
+        # functools.partial objects and callable instances have no __name__,
+        # so fall back to repr() instead of raising AttributeError in a log call.
+        callback_name = getattr(correction_callback, "__name__", repr(correction_callback))
         logger.info(
-            f"Page {self.number}: Starting OCR correction process using callback '{correction_callback.__name__}'"
+            f"Page {self.number}: Starting OCR correction with callback '{callback_name}' (max_workers={max_workers})"
+        )
+        # NOTE(review): this is an exact match on source == 'ocr'; the previous
+        # implementation used the prefix selector `text[source^=ocr]`. Confirm
+        # that the narrower match is intended.
+        target_elements_collection = self.find_all(
+            selector="text[source=ocr]", apply_exclusions=False
         )
+        target_elements = target_elements_collection.elements # Get the list
-        # Find OCR elements specifically on this page
-        # Note: We typically want to correct even if the element falls in an excluded area
-        target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
+        if not target_elements:
+            logger.info(f"Page {self.number}: No OCR elements found to correct.")
+            return self
+        processed_count = 0
+        updated_count = 0
+        error_count = 0
+        def _preview(element, limit):
+            # Log-safe snippet of the element text; tolerates a missing or None text.
+            return (getattr(element, 'text', None) or '')[:limit]
+        # Define the task to be run by the worker thread or sequentially.
+        # Returns a (element, corrected_text_or_None, exception_or_None) tuple.
+        def _process_element_task(element):
+            try:
+                # Call the user-provided callback
+                corrected_text = correction_callback(element)
+                # Validate result type
+                if corrected_text is not None and not isinstance(corrected_text, str):
+                    logger.warning(f"Page {self.number}: Correction callback for element '{_preview(element, 20)}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update.")
+                    return element, None, None # Treat as no correction
+                return element, corrected_text, None  # Return element, result, no error
+            except Exception as e:
+                logger.error(
+                    f"Page {self.number}: Error applying correction callback to element '{_preview(element, 30)}...' ({getattr(element, 'bbox', None)}): {e}",
+                    exc_info=False # Keep log concise
+                )
+                return element, None, e # Return element, no result, error
+            finally:
+                # --- Call progress callback here --- #
+                if progress_callback:
+                    try:
+                        progress_callback()
+                    except Exception as cb_e:
+                        # Log error in callback itself, but don't stop processing
+                        logger.error(f"Page {self.number}: Error executing progress_callback: {cb_e}", exc_info=False)
+        # Choose execution strategy based on max_workers
+        if max_workers is not None and max_workers > 1:
+            # --- Parallel execution --- #
+            logger.info(f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers.")
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all tasks
+                future_to_element = {executor.submit(_process_element_task, element): element for element in target_elements}
+                # Process results as they complete (progress_callback called by worker).
+                # Element text is only mutated here, in the submitting thread.
+                for future in concurrent.futures.as_completed(future_to_element):
+                    processed_count += 1
+                    try:
+                        element, corrected_text, error = future.result()
+                        if error is not None:
+                            error_count += 1
+                            # Error already logged in worker
+                        elif corrected_text is not None:
+                            # Apply correction if text changed
+                            current_text = getattr(element, 'text', None)
+                            if corrected_text != current_text:
+                                element.text = corrected_text
+                                updated_count += 1
+                    except Exception as exc:
+                        # Catch errors from future.result() itself
+                        element = future_to_element[future] # Find original element
+                        logger.error(f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}", exc_info=True)
+                        error_count += 1
+                        # Note: progress_callback was already called in the worker's finally block
+        else:
+            # --- Sequential execution --- #
+            logger.info(f"Page {self.number}: Running OCR correction sequentially.")
+            for element in target_elements:
+                # Call the task function directly (it handles progress_callback)
+                processed_count += 1
+                _element, corrected_text, error = _process_element_task(element)
+                if error is not None:
+                    error_count += 1
+                elif corrected_text is not None:
+                    # Apply correction if text changed
+                    current_text = getattr(_element, 'text', None)
+                    if corrected_text != current_text:
+                        _element.text = corrected_text
+                        updated_count += 1
-        # Delegate to the utility function
-        _apply_ocr_correction_to_elements(
-            elements=target_elements,  # Pass the ElementCollection directly
-            correction_callback=correction_callback,
-            caller_info=f"Page({self.number})",  # Pass caller info
+        logger.info(
+            f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
         )
-        return self  # Return self for chaining
+        return self # Return self for chaining
+    # --- Classification Mixin Implementation --- #
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """
+        Return the classification manager obtained from the parent PDF.
+
+        Looks the manager up via `self.pdf.get_manager('classification')`.
+
+        Raises:
+            AttributeError: If this page has no `pdf` attribute, the PDF has no
+                `get_manager` method, or `get_manager` raises ValueError,
+                RuntimeError or AttributeError (re-raised with the original
+                exception chained as the cause).
+        """
+        parent_ready = hasattr(self, 'pdf') and hasattr(self.pdf, 'get_manager')
+        if not parent_ready:
+            raise AttributeError("ClassificationManager cannot be accessed: Parent PDF or get_manager method missing.")
+        try:
+            # Use the PDF's manager registry accessor
+            manager = self.pdf.get_manager('classification')
+        except (ValueError, RuntimeError, AttributeError) as lookup_error:
+            # Surface every lookup failure as AttributeError for callers
+            raise AttributeError(f"Failed to get ClassificationManager from PDF: {lookup_error}") from lookup_error
+        return manager
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
+        """
+        Build the input handed to a classification model for this page.
+
+        Args:
+            model_type: 'text' to classify the page text, 'vision' to classify
+                a rendered image of the page.
+            **kwargs: Only `resolution` is read here (used for 'vision';
+                defaults to 150).
+
+        Returns:
+            The extracted page text for 'text', or the value returned by
+            `to_image` for 'vision'.
+
+        Raises:
+            ValueError: If no non-whitespace text is found ('text'), the page
+                image could not be rendered ('vision'), or `model_type` is
+                neither of the two.
+            AttributeError: For 'vision', if the classification manager cannot
+                be obtained.
+        """
+        if model_type == 'text':
+            text_content = self.extract_text(layout=False, use_exclusions=False) # Simple join, ignore exclusions for classification
+            if not text_content or text_content.isspace():
+                raise ValueError("Cannot classify page with 'text' model: No text content found.")
+            return text_content
+        if model_type == 'vision':
+            # The manager itself is not used below; the lookup is kept so that a
+            # missing manager still raises AttributeError before any rendering.
+            self._get_classification_manager()
+            # `kwargs` always exists as a local, so read the option directly.
+            resolution = kwargs.get('resolution', 150)
+            # Use to_image, ensuring no highlights interfere
+            img = self.to_image(
+                resolution=resolution,
+                include_highlights=False,
+                labels=False,
+                exclusions=None # Don't mask exclusions for classification input image
+            )
+            if img is None:
+                raise ValueError("Cannot classify page with 'vision' model: Failed to render image.")
+            return img
+        raise ValueError(f"Unsupported model_type for classification: {model_type}")
+    def _get_metadata_storage(self) -> Dict[str, Any]:
+        """
+        Return this page's `metadata` dict, creating an empty one first when
+        the attribute is missing or set to None.
+        """
+        storage = getattr(self, 'metadata', None)
+        if storage is None:
+            # Attach a fresh dict so later calls return the same object
+            storage = self.metadata = {}
+        return storage
+    # --- Content Extraction ---

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.7py3-none-any.whl → 0.1.8py3-none-any.whl