PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52) hide show

docs/categorizing-documents/index.md +168 -0
docs/data-extraction/index.md +87 -0
docs/element-selection/index.ipynb +218 -164
docs/element-selection/index.md +20 -0
docs/index.md +19 -0
docs/ocr/index.md +63 -16
docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
docs/tutorials/02-finding-elements.ipynb +123 -46
docs/tutorials/03-extracting-blocks.ipynb +24 -19
docs/tutorials/04-table-extraction.ipynb +17 -12
docs/tutorials/05-excluding-content.ipynb +37 -32
docs/tutorials/06-document-qa.ipynb +36 -31
docs/tutorials/07-layout-analysis.ipynb +45 -40
docs/tutorials/07-working-with-regions.ipynb +61 -60
docs/tutorials/08-spatial-navigation.ipynb +76 -71
docs/tutorials/09-section-extraction.ipynb +160 -155
docs/tutorials/10-form-field-extraction.ipynb +71 -66
docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
docs/tutorials/12-ocr-integration.ipynb +3420 -312
docs/tutorials/12-ocr-integration.md +68 -106
docs/tutorials/13-semantic-search.ipynb +641 -251
natural_pdf/__init__.py +2 -0
natural_pdf/classification/manager.py +343 -0
natural_pdf/classification/mixin.py +149 -0
natural_pdf/classification/results.py +62 -0
natural_pdf/collections/mixins.py +63 -0
natural_pdf/collections/pdf_collection.py +321 -15
natural_pdf/core/element_manager.py +67 -0
natural_pdf/core/page.py +227 -64
natural_pdf/core/pdf.py +387 -378
natural_pdf/elements/collections.py +272 -41
natural_pdf/elements/region.py +99 -15
natural_pdf/elements/text.py +5 -2
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +134 -0
natural_pdf/extraction/mixin.py +246 -0
natural_pdf/extraction/result.py +37 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_manager.py +85 -25
natural_pdf/ocr/ocr_options.py +33 -10
natural_pdf/ocr/utils.py +14 -3
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/selectors/parser.py +363 -238
natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/text_extraction.py +52 -1
natural_pdf/utils/tqdm_utils.py +43 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -4,12 +4,24 @@ import logging
 import os
 import re  # Added for safe path generation
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
+import concurrent.futures # Import concurrent.futures
+import time # Import time for logging timestamps
+import threading # Import threading for logging thread information
 from PIL import Image
 from tqdm import tqdm
+from tqdm.auto import tqdm as auto_tqdm
+from tqdm.notebook import tqdm as notebook_tqdm
+from natural_pdf.utils.tqdm_utils import get_tqdm
+# Get the appropriate tqdm class once
+tqdm = get_tqdm()
 # Set up logger early
+# Configure logging to include thread information
+# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(threadName)s - %(name)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 from natural_pdf.core.pdf import PDF
@@ -36,9 +48,11 @@ except ImportError as e:
     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
 from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
+# Import the ApplyMixin
+from natural_pdf.collections.mixins import ApplyMixin
-class PDFCollection(SearchableMixin):  # Inherit from the mixin
+class PDFCollection(SearchableMixin, ApplyMixin):  # Inherit from ApplyMixin
     def __init__(
         self,
         source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -237,30 +251,214 @@ class PDFCollection(SearchableMixin):  # Inherit from the mixin
     def __repr__(self) -> str:
         # Removed search status
-        return f"<PDFCollection(count={len(self)})>"
+        return f"<PDFCollection(count={len(self._pdfs)})>"
     @property
     def pdfs(self) -> List["PDF"]:
         """Returns the list of PDF objects held by the collection."""
         return self._pdfs
-    def apply_ocr(self, *args, **kwargs):
-        PDF = self._get_pdf_class()
-        # Delegate to individual PDF objects
-        logger.info("Applying OCR to relevant PDFs in collection...")
-        results = []
+    def find_all(
+        self,
+        selector: str,
+        apply_exclusions: bool = True,  # Added explicit parameter
+        regex: bool = False,            # Added explicit parameter
+        case: bool = True,             # Added explicit parameter
+        **kwargs
+    ) -> "ElementCollection":
+        """
+        Find all elements matching the selector across all PDFs in the collection.
+        This creates an ElementCollection that can span multiple PDFs. Note that
+        some ElementCollection methods have limitations when spanning PDFs.
+        Args:
+            selector: CSS-like selector string to query elements
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
+            regex: Whether to use regex for text search in :contains (default: False)
+            case: Whether to do case-sensitive text search (default: True)
+            **kwargs: Additional keyword arguments passed to the find_all method of each PDF
+        Returns:
+            ElementCollection containing all matching elements across all PDFs
+        """
+        from natural_pdf.elements.collections import ElementCollection
+        # Collect elements from all PDFs
+        all_elements = []
         for pdf in self._pdfs:
-            # We need to figure out which pages belong to which PDF if batching here
-            # For now, simpler to call on each PDF
             try:
-                # Assume apply_ocr exists on PDF and accepts similar args
-                pdf.apply_ocr(*args, **kwargs)
+                # Explicitly pass the relevant arguments down
+                elements = pdf.find_all(
+                    selector,
+                    apply_exclusions=apply_exclusions,
+                    regex=regex,
+                    case=case,
+                    **kwargs
+                )
+                all_elements.extend(elements.elements)
+            except Exception as e:
+                logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
+        return ElementCollection(all_elements)
+    def apply_ocr(
+        self,
+        engine: Optional[str] = None,
+        languages: Optional[List[str]] = None,
+        min_confidence: Optional[float] = None,
+        device: Optional[str] = None,
+        resolution: Optional[int] = None,
+        apply_exclusions: bool = True,
+        detect_only: bool = False,
+        replace: bool = True,
+        options: Optional[Any] = None,
+        pages: Optional[Union[slice, List[int]]] = None,
+        max_workers: Optional[int] = None,
+    ) -> "PDFCollection":
+        """
+        Apply OCR to all PDFs in the collection, potentially in parallel.
+        Args:
+            engine: OCR engine to use (e.g., 'easyocr', 'paddleocr', 'surya')
+            languages: List of language codes for OCR
+            min_confidence: Minimum confidence threshold for text detection
+            device: Device to use for OCR (e.g., 'cpu', 'cuda')
+            resolution: DPI resolution for page rendering
+            apply_exclusions: Whether to apply exclusion regions
+            detect_only: If True, only detect text regions without extracting text
+            replace: If True, replace existing OCR elements
+            options: Engine-specific options
+            pages: Specific pages to process (None for all pages)
+            max_workers: Maximum number of threads to process PDFs concurrently.
+                         If None or 1, processing is sequential. (default: None)
+        Returns:
+            Self for method chaining
+        """
+        PDF = self._get_pdf_class()
+        logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")
+        # Worker function takes PDF object again
+        def _process_pdf(pdf: PDF):
+            """Helper function to apply OCR to a single PDF, handling errors."""
+            thread_id = threading.current_thread().name # Get thread name for logging
+            pdf_path = pdf.path # Get path for logging
+            logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
+            start_time = time.monotonic()
+            try:
+                pdf.apply_ocr( # Call apply_ocr on the original PDF object
+                    pages=pages,
+                    engine=engine,
+                    languages=languages,
+                    min_confidence=min_confidence,
+                    device=device,
+                    resolution=resolution,
+                    apply_exclusions=apply_exclusions,
+                    detect_only=detect_only,
+                    replace=replace,
+                    options=options,
+                    # Note: We might want a max_workers here too for page rendering?
+                    # For now, PDF.apply_ocr doesn't have it.
+                )
+                end_time = time.monotonic()
+                logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
+                return pdf_path, None
             except Exception as e:
-                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+                end_time = time.monotonic()
+                logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
+                return pdf_path, e # Return path and error
+        # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        if max_workers is not None and max_workers > 1:
+            futures = []
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="OCRWorker") as executor:
+                for pdf in self._pdfs:
+                    # Submit the PDF object to the worker function
+                    futures.append(executor.submit(_process_pdf, pdf))
+            # Use the selected tqdm class with as_completed for progress tracking
+            progress_bar = tqdm(
+                concurrent.futures.as_completed(futures),
+                total=len(self._pdfs),
+                desc="Applying OCR (Parallel)",
+                unit="pdf"
+            )
+            for future in progress_bar:
+                pdf_path, error = future.result() # Get result (or exception)
+                if error:
+                    progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
+                # Progress is updated automatically by tqdm
+        else: # Sequential processing (max_workers is None or 1)
+            logger.info("Applying OCR sequentially...")
+            # Use the selected tqdm class for sequential too for consistency
+            # Iterate over PDF objects directly for sequential
+            for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
+                _process_pdf(pdf) # Call helper directly with PDF object
+        logger.info("Finished applying OCR across the collection.")
         return self
-    # --- Advanced Method Placeholders ---
-    # Placeholder for categorize removed as find_relevant is now implemented
+    def correct_ocr(
+        self,
+        correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
+        progress_callback: Optional[Callable[[], None]] = None,
+    ) -> "PDFCollection":
+        """
+        Apply OCR correction to all relevant elements across all pages and PDFs
+        in the collection using a single progress bar.
+        Args:
+            correction_callback: Function to apply to each OCR element.
+                                 It receives the element and should return
+                                 the corrected text (str) or None.
+            max_workers: Max threads to use for parallel execution within each page.
+            progress_callback: Optional callback function to call after processing each element.
+        Returns:
+            Self for method chaining.
+        """
+        PDF = self._get_pdf_class() # Ensure PDF class is available
+        if not callable(correction_callback):
+            raise TypeError("`correction_callback` must be a callable function.")
+        logger.info(f"Gathering OCR elements from {len(self._pdfs)} PDFs for correction...")
+        # 1. Gather all target elements using the collection's find_all
+        #    Crucially, set apply_exclusions=False to include elements in headers/footers etc.
+        all_ocr_elements = self.find_all("text[source=ocr]", apply_exclusions=False).elements
+        if not all_ocr_elements:
+            logger.info("No OCR elements found in the collection to correct.")
+            return self
+        total_elements = len(all_ocr_elements)
+        logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")
+        # 2. Initialize the progress bar
+        progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
+        # 3. Iterate through PDFs and delegate to PDF.correct_ocr
+        #    PDF.correct_ocr handles page iteration and passing the progress callback down.
+        for pdf in self._pdfs:
+            if not pdf.pages:
+                continue
+            try:
+                pdf.correct_ocr(
+                    correction_callback=correction_callback,
+                    max_workers=max_workers,
+                    progress_callback=progress_bar.update # Pass the bar's update method
+                )
+            except Exception as e:
+                 logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
+                 # Decide if we should stop or continue? For now, continue.
+        progress_bar.close()
+        return self
     def categorize(self, categories: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
@@ -309,3 +507,111 @@ class PDFCollection(SearchableMixin):  # Inherit from the mixin
                 #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
                 #     continue
                 yield page
+    # --- Classification Method --- #
+    def classify_all(
+        self,
+        categories: List[str],
+        model: str = "text",
+        max_workers: Optional[int] = None,
+        **kwargs,
+    ) -> "PDFCollection":
+        """
+        Classify all pages across all PDFs in the collection, potentially in parallel.
+        This method uses the unified `classify_all` approach, delegating page
+        classification to each PDF's `classify_pages` method.
+        It displays a progress bar tracking individual pages.
+        Args:
+            categories: A list of string category names.
+            model: Model identifier ('text', 'vision', or specific HF ID).
+            max_workers: Maximum number of threads to process PDFs concurrently.
+                         If None or 1, processing is sequential.
+            **kwargs: Additional arguments passed down to `pdf.classify_pages` and
+                      subsequently to `page.classify` (e.g., device,
+                      confidence_threshold, resolution).
+        Returns:
+            Self for method chaining.
+        Raises:
+            ValueError: If categories list is empty.
+            ClassificationError: If classification fails for any page (will stop processing).
+            ImportError: If classification dependencies are missing.
+        """
+        PDF = self._get_pdf_class()
+        if not categories:
+            raise ValueError("Categories list cannot be empty.")
+        logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
+        # Calculate total pages for the progress bar
+        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
+        if total_pages == 0:
+            logger.warning("No pages found in the PDF collection to classify.")
+            return self
+        progress_bar = tqdm(
+            total=total_pages,
+            desc=f"Classifying Pages (model: {model})",
+            unit="page"
+        )
+        # Worker function
+        def _process_pdf_classification(pdf: PDF):
+            thread_id = threading.current_thread().name
+            pdf_path = pdf.path
+            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            start_time = time.monotonic()
+            try:
+                # Call classify_pages on the PDF, passing the progress callback
+                pdf.classify_pages(
+                    categories=categories,
+                    model=model,
+                    progress_callback=progress_bar.update,
+                    **kwargs
+                )
+                end_time = time.monotonic()
+                logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
+                return pdf_path, None # Return path and no error
+            except Exception as e:
+                end_time = time.monotonic()
+                # Error is logged within classify_pages, but log summary here
+                logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
+                # Close progress bar immediately on error to avoid hanging
+                progress_bar.close()
+                # Re-raise the exception to stop the entire collection processing
+                raise
+        # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        try:
+            if max_workers is not None and max_workers > 1:
+                logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
+                futures = []
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="ClassifyWorker") as executor:
+                    for pdf in self._pdfs:
+                        futures.append(executor.submit(_process_pdf_classification, pdf))
+                    # Wait for all futures to complete (progress updated by callback)
+                    # Exceptions are raised by future.result() if worker failed
+                    for future in concurrent.futures.as_completed(futures):
+                         future.result() # Raise exception if worker failed
+            else: # Sequential processing
+                logger.info("Classifying PDFs sequentially.")
+                for pdf in self._pdfs:
+                    _process_pdf_classification(pdf)
+            logger.info("Finished classification across the collection.")
+        finally:
+             # Ensure progress bar is closed even if errors occurred elsewhere
+             if not progress_bar.disable and progress_bar.n < progress_bar.total:
+                 progress_bar.close()
+             elif progress_bar.disable is False:
+                  progress_bar.close()
+        return self
+    # --- End Classification Method --- #

natural_pdf/core/element_manager.py CHANGED Viewed

@@ -539,3 +539,70 @@ class ElementManager:
         """Get all region elements."""
         self.load_elements()
         return self._elements.get("regions", [])
+    def remove_ocr_elements(self):
+        """
+        Remove all elements with source="ocr" from the elements dictionary.
+        This should be called before adding new OCR elements if replacement is desired.
+        Returns:
+            int: Number of OCR elements removed
+        """
+        # Load elements if not already loaded
+        self.load_elements()
+        removed_count = 0
+        # Filter out OCR elements from words
+        if "words" in self._elements:
+            original_len = len(self._elements["words"])
+            self._elements["words"] = [
+                word for word in self._elements["words"]
+                if getattr(word, "source", None) != "ocr"
+            ]
+            removed_count += original_len - len(self._elements["words"])
+        # Filter out OCR elements from chars
+        if "chars" in self._elements:
+            original_len = len(self._elements["chars"])
+            self._elements["chars"] = [
+                char for char in self._elements["chars"]
+                if (isinstance(char, dict) and char.get("source") != "ocr") or
+                   (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
+            ]
+            removed_count += original_len - len(self._elements["chars"])
+        logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
+        return removed_count
+    def remove_element(self, element, element_type="words"):
+        """
+        Remove a specific element from the managed elements.
+        Args:
+            element: The element to remove
+            element_type: The type of element ('words', 'chars', etc.)
+        Returns:
+            bool: True if removed successfully, False otherwise
+        """
+        # Load elements if not already loaded
+        self.load_elements()
+        # Check if the collection exists
+        if element_type not in self._elements:
+            logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
+            return False
+        # Try to remove the element
+        try:
+            if element in self._elements[element_type]:
+                self._elements[element_type].remove(element)
+                logger.debug(f"Removed element from {element_type}: {element}")
+                return True
+            else:
+                logger.debug(f"Element not found in {element_type}: {element}")
+                return False
+        except Exception as e:
+            logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
+            return False

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl