natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/collections.py +712 -109

```diff
@@ -1,27 +1,41 @@
 import logging
+from collections.abc import MutableSequence
+from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Dict,
     Generic,
+    Iterable,
     Iterator,
     List,
     Optional,
+    Sequence,
     Tuple,
+    Type,
     TypeVar,
     Union,
+    overload,
 )
 
 from pdfplumber.utils.geometry import objects_to_bbox
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
-
-
+from tqdm.auto import tqdm
+
+from natural_pdf.classification.manager import ClassificationManager
+from natural_pdf.classification.mixin import ClassificationMixin
+from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+from natural_pdf.core.pdf import PDF
+from natural_pdf.elements.base import Element
+from natural_pdf.elements.region import Region
+from natural_pdf.elements.text import TextElement
+from natural_pdf.export.mixin import ExportMixin
 from natural_pdf.ocr import OCROptions
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import the new utility
 
 logger = logging.getLogger(__name__)
 
@@ -33,7 +47,9 @@ T = TypeVar("T")
 P = TypeVar("P", bound="Page")
 
 
-class ElementCollection(Generic[T]):
+class ElementCollection(
+    Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+):
     """
     Collection of PDF elements with batch operations.
     """
@@ -55,10 +71,6 @@ class ElementCollection(Generic[T]):
         """Get an element by index."""
         return self._elements[index]
 
-    def __iter__(self):
-        """Iterate over elements."""
-        return iter(self._elements)
-
     def __repr__(self) -> str:
         """Return a string representation showing the element count."""
         element_type = "Mixed"
@@ -68,6 +80,20 @@ class ElementCollection(Generic[T]):
             element_type = types.pop()
         return f"<ElementCollection[{element_type}](count={len(self)})>"
 
+    def __add__(self, other: "ElementCollection") -> "ElementCollection":
+        if not isinstance(other, ElementCollection):
+            return NotImplemented
+        return ElementCollection(self._elements + other._elements)
+
+    def __setitem__(self, index, value):
+        self._elements[index] = value
+
+    def __delitem__(self, index):
+        del self._elements[index]
+
+    def insert(self, index, value):
+        self._elements.insert(index, value)
+
     @property
     def elements(self) -> List["Element"]:
         """Get the elements in this collection."""
```
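With `MutableSequence` added to the base classes, the removed `__iter__` is no longer needed (iteration, `index`, `count`, and the other sequence mixin methods come from the ABC via the existing `__getitem__`/`__len__`), and the new `__add__`/`__setitem__`/`__delitem__`/`insert` round out list-like behavior. A minimal sketch of the resulting semantics; the file path and selector are hypothetical, not taken from this release:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 is installed

page = PDF("example.pdf").pages[0]   # hypothetical file
words = page.find_all("text")        # an ElementCollection

combined = words + words             # __add__ concatenates into a new collection
combined.insert(0, words[0])         # MutableSequence-style insert
combined[1] = words[-1]              # __setitem__ replaces an element in place
del combined[0]                      # __delitem__ removes by index

for element in combined:             # iteration is now supplied by MutableSequence
    print(element)
```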
```diff
@@ -83,12 +109,53 @@ class ElementCollection(Generic[T]):
         """Get the last element in the collection."""
         return self._elements[-1] if self._elements else None
 
+    def _are_on_multiple_pages(self) -> bool:
+        """
+        Check if elements in this collection span multiple pages.
+
+        Returns:
+            True if elements are on different pages, False otherwise
+        """
+        if not self._elements:
+            return False
+
+        # Get the page index of the first element
+        if not hasattr(self._elements[0], "page"):
+            return False
+
+        first_page_idx = self._elements[0].page.index
+
+        # Check if any element is on a different page
+        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
+
+    def _are_on_multiple_pdfs(self) -> bool:
+        """
+        Check if elements in this collection span multiple PDFs.
+
+        Returns:
+            True if elements are from different PDFs, False otherwise
+        """
+        if not self._elements:
+            return False
+
+        # Get the PDF of the first element
+        if not hasattr(self._elements[0], "page") or not hasattr(self._elements[0].page, "pdf"):
+            return False
+
+        first_pdf = self._elements[0].page.pdf
+
+        # Check if any element is from a different PDF
+        return any(
+            hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
+            for e in self._elements
+        )
+
     def highest(self) -> Optional["Element"]:
         """
         Get element with the smallest top y-coordinate (highest on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with smallest top value or None if empty
@@ -96,7 +163,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine highest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine highest element across multiple pages")
 
@@ -107,7 +176,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest bottom y-coordinate (lowest on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with largest bottom value or None if empty
@@ -115,7 +184,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine lowest element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine lowest element across multiple pages")
 
@@ -126,7 +197,7 @@ class ElementCollection(Generic[T]):
         Get element with the smallest x0 coordinate (leftmost on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with smallest x0 value or None if empty
@@ -134,7 +205,9 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine leftmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine leftmost element across multiple pages")
 
@@ -145,7 +218,7 @@ class ElementCollection(Generic[T]):
         Get element with the largest x1 coordinate (rightmost on page).
 
         Raises:
-            ValueError: If elements are on multiple pages
+            ValueError: If elements are on multiple pages or multiple PDFs
 
         Returns:
             Element with largest x1 value or None if empty
@@ -153,31 +226,14 @@ class ElementCollection(Generic[T]):
         if not self._elements:
             return None
 
-        # Check if elements are on multiple pages
+        # Check if elements are on multiple pages or PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("Cannot determine rightmost element across multiple PDFs")
         if self._are_on_multiple_pages():
             raise ValueError("Cannot determine rightmost element across multiple pages")
 
         return max(self._elements, key=lambda e: e.x1)
 
-    def _are_on_multiple_pages(self) -> bool:
-        """
-        Check if elements in this collection span multiple pages.
-
-        Returns:
-            True if elements are on different pages, False otherwise
-        """
-        if not self._elements:
-            return False
-
-        # Get the page index of the first element
-        if not hasattr(self._elements[0], "page"):
-            return False
-
-        first_page_idx = self._elements[0].page.index
-
-        # Check if any element is on a different page
-        return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
-
     def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
         """
         Remove elements that are within any of the specified regions.
```
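The directional helpers (`highest`, `lowest`, `leftmost`, `rightmost`) now refuse to compare coordinates across documents as well as across pages. A sketch of the new guard, with hypothetical file names:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

words_a = PDF("a.pdf").pages[0].find_all("text")  # hypothetical files
words_b = PDF("b.pdf").pages[0].find_all("text")

print(words_a.highest())   # fine: one page of one PDF

mixed = words_a + words_b  # cross-PDF collection via the new __add__
try:
    mixed.rightmost()
except ValueError as err:
    print(err)  # "Cannot determine rightmost element across multiple PDFs"
```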
```diff
@@ -359,6 +415,9 @@ class ElementCollection(Generic[T]):
 
         Uses grouping logic based on parameters (defaulting to grouping by type).
 
+        Note: Elements must be from the same PDF for this operation to work properly,
+        as each PDF has its own highlighting service.
+
         Args:
             label: Optional explicit label for the entire collection. If provided,
                 all elements are highlighted as a single group with this label,
@@ -389,8 +448,12 @@ class ElementCollection(Generic[T]):
             AttributeError: If 'group_by' is provided but the attribute doesn't exist
                 on some elements.
             ValueError: If 'label_format' is provided but contains invalid keys for
-                element attributes.
+                element attributes, or if elements span multiple PDFs.
         """
+        # Check if elements span multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError("highlight() does not support elements from multiple PDFs")
+
         # 1. Prepare the highlight data based on parameters
         highlight_data_list = self._prepare_highlight_data(
             distinct=distinct,
@@ -761,7 +824,8 @@ class ElementCollection(Generic[T]):
         Generates a temporary preview image highlighting elements in this collection
         on their page, ignoring any persistent highlights.
 
-        Currently only supports collections where all elements are on the same page.
+        Currently only supports collections where all elements are on the same page
+        of the same PDF.
 
         Allows grouping and coloring elements based on attributes, similar to the
         persistent `highlight()` method, but only for this temporary view.
@@ -780,14 +844,20 @@ class ElementCollection(Generic[T]):
 
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
-            elements span multiple pages.
+            elements span multiple pages/PDFs.
 
         Raises:
-            ValueError: If the collection is empty or elements are on different pages.
+            ValueError: If the collection is empty or elements are on different pages/PDFs.
         """
         if not self._elements:
             raise ValueError("Cannot show an empty collection.")
 
+        # Check if elements are on multiple PDFs
+        if self._are_on_multiple_pdfs():
+            raise ValueError(
+                "show() currently only supports collections where all elements are from the same PDF."
+            )
+
         # Check if elements are on multiple pages
         if self._are_on_multiple_pages():
             raise ValueError(
@@ -1062,70 +1132,33 @@ class ElementCollection(Generic[T]):
             logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
             return None
 
-    def find_all(
-        self, selector: str, regex: bool = False, case: bool = True, **kwargs
-    ) -> "ElementCollection[T]":
+    def find(self, selector: str, **kwargs) -> "ElementCollection":
         """
-
+        Find elements in this collection matching the selector.
 
         Args:
-            selector: CSS-like selector string
-
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            A new ElementCollection containing only the matching elements from this collection.
+            selector: CSS-like selector string
+            apply_exclusions: Whether to exclude elements in exclusion regions
         """
-
-        return ElementCollection([])
-
-        try:
-            selector_obj = parse_selector(selector)
-        except Exception as e:
-            logger.error(f"Error parsing selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on parse error
-
-        # Pass regex and case flags to selector function generator
-        kwargs["regex"] = regex
-        kwargs["case"] = case
-
-        try:
-            filter_func = selector_to_filter_func(selector_obj, **kwargs)
-        except Exception as e:
-            logger.error(f"Error creating filter function for selector '{selector}': {e}")
-            return ElementCollection([])  # Return empty on filter creation error
-
-        matching_elements = [element for element in self._elements if filter_func(element)]
+        return self.apply(lambda element: element.find(selector, **kwargs))
 
-
-        # Sorting should be done explicitly on the collection if needed.
-
-        return ElementCollection(matching_elements)
-
-    def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+    def extract_each_text(self, **kwargs) -> List[str]:
         """
-
-
-        Args:
-            selector: CSS-like selector string.
-            regex: Whether to use regex for text search in :contains (default: False).
-            case: Whether to do case-sensitive text search (default: True).
-            **kwargs: Additional filter parameters passed to the selector function.
-
-        Returns:
-            The first matching element or None.
+        Extract text from each element in this region.
         """
-
-
+        return self.apply(
+            lambda element: element.extract_text(**kwargs) if element is not None else None
+        )
 
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "ElementCollection":
         """
         Applies corrections to OCR-generated text elements within this collection
-        using a user-provided callback function
+        using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
 
         Iterates through elements currently in the collection. If an element's
         'source' attribute starts with 'ocr', it calls the `correction_callback`
```
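Note the semantic change above: `find` on a collection no longer filters the collection itself; it now maps each member's own `find` via `ApplyMixin.apply`, producing one result per element, and `extract_each_text` follows the same per-element pattern. A sketch; the file and selectors are illustrative:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

page = PDF("report.pdf").pages[0]              # hypothetical file
regions = page.find_all("region[type=table]")  # illustrative selector

# One result per region: each region's own .find() is called
headers = regions.find("text[size>=10]")

# One string (or None) per region
texts = regions.extract_each_text()
for header, text in zip(headers, texts):
    print(header, len(text) if text else 0)
```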
```diff
@@ -1143,6 +1176,8 @@ class ElementCollection(Generic[T]):
         Args:
             correction_callback: A function accepting an element and returning
                 `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                correction on each page. If None, defaults are used.
 
         Returns:
             Self for method chaining.
@@ -1152,11 +1187,296 @@ class ElementCollection(Generic[T]):
             elements=self._elements,
             correction_callback=correction_callback,
             caller_info=f"ElementCollection(len={len(self._elements)})",  # Pass caller info
+            max_workers=max_workers,
         )
         return self  # Return self for chaining
 
+    def remove(self) -> int:
+        """
+        Remove all elements in this collection from their respective pages.
+
+        This method removes elements from the page's _element_mgr storage.
+        It's particularly useful for removing OCR elements before applying new OCR.
+
+        Returns:
+            int: Number of elements successfully removed
+        """
+        if not self._elements:
+            return 0
+
+        removed_count = 0
+
+        for element in self._elements:
+            # Each element should have a reference to its page
+            if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
+                element_mgr = element.page._element_mgr
+
+                # Determine element type
+                element_type = getattr(element, "object_type", None)
+                if element_type:
+                    # Convert to plural form expected by element_mgr
+                    if element_type == "word":
+                        element_type = "words"
+                    elif element_type == "char":
+                        element_type = "chars"
+                    elif element_type == "rect":
+                        element_type = "rects"
+                    elif element_type == "line":
+                        element_type = "lines"
+
+                # Try to remove from the element manager
+                if hasattr(element_mgr, "remove_element"):
+                    success = element_mgr.remove_element(element, element_type)
+                    if success:
+                        removed_count += 1
+                else:
+                    logger.warning("ElementManager does not have remove_element method")
+            else:
+                logger.warning(f"Element has no page or page has no _element_mgr: {element}")
+
+        return removed_count
+
+    # --- Classification Method --- #
+    def classify_all(
+        self,
+        categories: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
+        min_confidence: float = 0.0,
+        analysis_key: str = "classification",
+        multi_label: bool = False,
+        batch_size: int = 8,
+        max_workers: Optional[int] = None,
+        progress_bar: bool = True,
+        **kwargs,
+    ):
+        """Classifies all elements in the collection in batch.
+
+        Args:
+            categories: List of category labels.
+            model: Model ID (or alias 'text', 'vision').
+            using: Optional processing mode ('text' or 'vision'). Inferred if None.
+            min_confidence: Minimum confidence threshold.
+            analysis_key: Key for storing results in element.analyses.
+            multi_label: Allow multiple labels per item.
+            batch_size: Size of batches passed to the inference pipeline.
+            max_workers: (Not currently used for classification batching which is
+                handled by the underlying pipeline).
+            progress_bar: Display a progress bar.
+            **kwargs: Additional arguments for the ClassificationManager.
+        """
+        if not self.elements:
+            logger.info("ElementCollection is empty, skipping classification.")
+            return self
+
+        # Requires access to the PDF's manager. Assume first element has it.
+        first_element = self.elements[0]
+        manager_source = None
+        if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+            manager_source = first_element.page.pdf
+        elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+            manager_source = first_element.pdf
+
+        if not manager_source or not hasattr(manager_source, "get_manager"):
+            raise RuntimeError("Cannot access ClassificationManager via elements.")
+
+        try:
+            manager = manager_source.get_manager("classification")
+        except Exception as e:
+            raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+
+        if not manager or not manager.is_available():
+            raise RuntimeError("ClassificationManager is not available.")
+
+        # Determine engine type early for content gathering
+        inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
+
+        # Gather content from all elements
+        items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
+        original_elements: List[Any] = []
+        logger.info(
+            f"Gathering content for {len(self.elements)} elements for batch classification..."
+        )
+        for element in self.elements:
+            if not isinstance(element, ClassificationMixin):
+                logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                continue
+            try:
+                # Delegate content fetching to the element itself
+                content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                items_to_classify.append(content)
+                original_elements.append(element)
+            except (ValueError, NotImplementedError) as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Skipping element {element!r}: Error getting classification content - {e}"
+                )
+
+        if not items_to_classify:
+            logger.warning("No content could be gathered from elements for batch classification.")
+            return self
+
+        logger.info(
+            f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+        )
+
+        # Call manager's batch classify
+        batch_results: List[ClassificationResult] = manager.classify_batch(
+            item_contents=items_to_classify,
+            categories=categories,
+            model_id=model,
+            using=inferred_using,
+            min_confidence=min_confidence,
+            multi_label=multi_label,
+            batch_size=batch_size,
+            progress_bar=progress_bar,
+            **kwargs,
+        )
+
+        # Assign results back to elements
+        if len(batch_results) != len(original_elements):
+            logger.error(
+                f"Batch classification result count ({len(batch_results)}) mismatch "
+                f"with elements processed ({len(original_elements)}). Cannot assign results."
+            )
+            # Decide how to handle mismatch - maybe store errors?
+        else:
+            logger.info(
+                f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+            )
+            for element, result_obj in zip(original_elements, batch_results):
+                try:
+                    if not hasattr(element, "analyses") or element.analyses is None:
+                        element.analyses = {}
+                    element.analyses[analysis_key] = result_obj
+                except Exception as e:
+                    logger.warning(f"Failed to store classification result for {element!r}: {e}")
+
+        return self
+
+    # --- End Classification Method --- #
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all elements in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No elements found in collection")
+            return []
+
+        all_data = []
+
+        for i, element in enumerate(self.elements):
+            # Base element information
+            element_data = {
+                "element_index": i,
+                "element_type": getattr(element, "type", type(element).__name__),
+            }
+
+            # Add geometry if available
+            for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                if hasattr(element, attr):
+                    element_data[attr] = getattr(element, attr)
+
+            # Add page information if available
+            if hasattr(element, "page"):
+                page = element.page
+                if page:
+                    element_data["page_number"] = getattr(page, "number", None)
+                    element_data["pdf_path"] = (
+                        getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                    )
+
+            # Include extracted text if requested
+            if include_content and hasattr(element, "extract_text"):
+                try:
+                    element_data["content"] = element.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from element {i}: {e}")
+                    element_data["content"] = ""
 
-class PageCollection(Generic[P]):
+            # Save image if requested
+            if include_images and hasattr(element, "to_image"):
+                try:
+                    # Create identifier for the element
+                    pdf_name = "unknown"
+                    page_num = "unknown"
+
+                    if hasattr(element, "page") and element.page:
+                        page_num = element.page.number
+                        if hasattr(element.page, "pdf") and element.page.pdf:
+                            pdf_name = Path(element.page.pdf.path).stem
+
+                    # Create image filename
+                    element_type = element_data.get("element_type", "element").lower()
+                    image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    element.to_image(
+                        path=str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for element {i}: {e}")
+                    element_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(element, "analyses"):
+                for key in analysis_keys:
+                    if key not in element.analyses:
+                        # Skip this key if it doesn't exist - elements might have different analyses
+                        logger.warning(f"Analysis key '{key}' not found in element {i}")
+                        continue
+
+                    # Get the analysis result
+                    analysis_result = element.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to element data with the key as prefix
+                    for k, v in analysis_data.items():
+                        element_data[f"{key}.{k}"] = v
+
+            all_data.append(element_data)
+
+        return all_data
+
+
+class PageCollection(Generic[P], ApplyMixin):
     """
     A collection of PDF pages with cross-page operations.
 
```
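A usage sketch for the new batch classification; the file, categories, and mode are illustrative, and a classification backend must be installed. `classify_all` resolves the `ClassificationManager` through the first element's parent PDF, so the elements should come from a single document:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 with classification support

pdf = PDF("filings.pdf")                   # hypothetical file
regions = pdf.pages[0].find_all("region")  # elements using ClassificationMixin

regions.classify_all(
    categories=["table", "chart", "narrative"],  # illustrative labels
    using="vision",            # or "text"; inferred from the model if None
    analysis_key="doc_type",   # results land in element.analyses["doc_type"]
    batch_size=8,
    progress_bar=True,
)

for region in regions:
    print(region, region.analyses.get("doc_type"))
```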
```diff
@@ -1221,6 +1541,7 @@ class PageCollection(Generic[P]):
         device: Optional[str] = None,
         resolution: Optional[int] = None,  # DPI for rendering
         apply_exclusions: bool = True,  # New parameter
+        replace: bool = True,  # Whether to replace existing OCR elements
         # --- Engine-Specific Options ---
         options: Optional[Any] = None,  # e.g., EasyOCROptions(...)
     ) -> "PageCollection[P]":
@@ -1240,6 +1561,8 @@ class PageCollection(Generic[P]):
             apply_exclusions: If True (default), render page images for OCR with
                               excluded areas masked (whited out). If False, OCR
                               the raw page images without masking exclusions.
+            replace: If True (default), remove any existing OCR elements before
+                     adding new ones. If False, add new OCR elements to existing ones.
             options: An engine-specific options object (e.g., EasyOCROptions) or dict.
 
         Returns:
@@ -1277,45 +1600,134 @@ class PageCollection(Generic[P]):
             device=device,
             resolution=resolution,
             apply_exclusions=apply_exclusions,  # Pass down
+            replace=replace,  # Pass the replace parameter
             options=options,
         )
         # The PDF method modifies the Page objects directly by adding elements.
 
         return self  # Return self for chaining
 
-
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]: ...
+
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional[T]:
         """
-        Find the first element matching the selector across all pages.
+        Find the first element matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.
 
         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
 
         Returns:
-            First matching element or None
+            First matching element or None.
         """
+        # Input validation happens within page.find
         for page in self.pages:
-            element = page.find(selector, **kwargs)
+            element = page.find(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
             if element:
                 return element
         return None
 
-
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements matching the selector across all pages.
+        Find all elements matching the selector OR text across all pages in the collection.
+
+        Provide EITHER `selector` OR `text`, but not both.
 
         Args:
-            selector: CSS-like selector string
-
-
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional filter parameters.
 
         Returns:
-            ElementCollection with matching elements from all pages
+            ElementCollection with matching elements from all pages.
         """
         all_elements = []
+        # Input validation happens within page.find_all
         for page in self.pages:
-            elements = page.find_all(selector, **kwargs)
+            elements = page.find_all(
+                selector=selector,
+                text=text,
+                apply_exclusions=apply_exclusions,
+                regex=regex,
+                case=case,
+                **kwargs,
+            )
            if elements:
                 all_elements.extend(elements.elements)
 
```
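Two user-facing changes in this span: `apply_ocr` gains a `replace` flag (by default, prior OCR elements are removed before new ones are added, instead of accumulating duplicates), and `find`/`find_all` on a `PageCollection` gain a keyword-only `text` shortcut plus `regex`/`case` flags. A sketch; the file name is hypothetical and parameters outside these hunks (such as the OCR `engine` argument) are assumed from the wider API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

pages = PDF("scanned.pdf").pages  # a PageCollection, hypothetical file

# Re-running OCR no longer stacks duplicate word elements
pages.apply_ocr(engine="easyocr", resolution=300, replace=True)

# New keyword-only text search, equivalent to 'text:contains(...)'
first_total = pages.find(text="Total", case=False)

# Selector form still works; selector and text are mutually exclusive
all_totals = pages.find_all("text:contains(Total)")
print(first_total, len(all_totals))
```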
```diff
@@ -1324,10 +1736,12 @@ class PageCollection(Generic[P]):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        max_workers: Optional[int] = None,
     ) -> "PageCollection[P]":
         """
         Applies corrections to OCR-generated text elements across all pages
-        in this collection using a user-provided callback function
+        in this collection using a user-provided callback function, executed
+        in parallel if `max_workers` is specified.
 
         This method delegates to the parent PDF's `correct_ocr` method,
         targeting all pages within this collection.
@@ -1335,10 +1749,11 @@ class PageCollection(Generic[P]):
         Args:
             correction_callback: A function that accepts a single argument (an element
                                  object) and returns `Optional[str]` (new text or None).
+            max_workers: The maximum number of worker threads to use for parallel
+                         correction on each page. If None, defaults are used.
 
         Returns:
-
-            {'elements_checked': total_checked, 'corrections_applied': total_applied}
+            Self for method chaining.
 
         Raises:
             RuntimeError: If the collection is empty, pages lack a parent PDF reference,
@@ -1346,17 +1761,32 @@ class PageCollection(Generic[P]):
         """
         if not self.pages:
             logger.warning("Cannot correct OCR for an empty PageCollection.")
+            # Return self even if empty to maintain chaining consistency
+            return self
 
         # Assume all pages share the same parent PDF object
         parent_pdf = self.pages[0]._parent
+        if (
+            not parent_pdf
+            or not hasattr(parent_pdf, "correct_ocr")
+            or not callable(parent_pdf.correct_ocr)
+        ):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+            )
 
         page_indices = [p.index for p in self.pages]
         logger.info(
-            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}."
+            f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
         )
 
         # Delegate the call to the parent PDF object for the relevant pages
-
+        # Pass the max_workers parameter down
+        parent_pdf.correct_ocr(
+            correction_callback=correction_callback,
+            pages=page_indices,
+            max_workers=max_workers,  # Pass it here
+        )
 
         return self
 
```
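`PageCollection.correct_ocr` now validates the parent PDF reference, passes `max_workers` down for per-page parallelism, and returns `self` (the old docstring promised a stats dict that was never part of the chaining-friendly API). A sketch with a hypothetical correction callback; the `element.text` attribute is assumed from the element API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9

pages = PDF("scanned.pdf").pages  # hypothetical file with OCR elements

def fix_text(element):
    """Return corrected text, or None to leave the element unchanged."""
    text = element.text or ""
    cleaned = text.replace("|", "I")  # stand-in for a real correction model
    return cleaned if cleaned != text else None

# Applies the callback to OCR-sourced elements, parallelized per page
pages.correct_ocr(correction_callback=fix_text, max_workers=4)
```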
```diff
@@ -1660,3 +2090,176 @@ class PageCollection(Generic[P]):
             sections.append(region)
 
         return sections
+
+    def _gather_analysis_data(
+        self,
+        analysis_keys: List[str],
+        include_content: bool,
+        include_images: bool,
+        image_dir: Optional[Path],
+        image_format: str,
+        image_resolution: int,
+    ) -> List[Dict[str, Any]]:
+        """
+        Gather analysis data from all pages in the collection.
+
+        Args:
+            analysis_keys: Keys in the analyses dictionary to export
+            include_content: Whether to include extracted text
+            include_images: Whether to export images
+            image_dir: Directory to save images
+            image_format: Format to save images
+            image_resolution: Resolution for exported images
+
+        Returns:
+            List of dictionaries containing analysis data
+        """
+        if not self.elements:
+            logger.warning("No pages found in collection")
+            return []
+
+        all_data = []
+
+        for page in self.elements:
+            # Basic page information
+            page_data = {
+                "page_number": page.number,
+                "page_index": page.index,
+                "width": page.width,
+                "height": page.height,
+            }
+
+            # Add PDF information if available
+            if hasattr(page, "pdf") and page.pdf:
+                page_data["pdf_path"] = page.pdf.path
+                page_data["pdf_filename"] = Path(page.pdf.path).name
+
+            # Include extracted text if requested
+            if include_content:
+                try:
+                    page_data["content"] = page.extract_text(preserve_whitespace=True)
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page.number}: {e}")
+                    page_data["content"] = ""
+
+            # Save image if requested
+            if include_images:
+                try:
+                    # Create image filename
+                    pdf_name = "unknown"
+                    if hasattr(page, "pdf") and page.pdf:
+                        pdf_name = Path(page.pdf.path).stem
+
+                    image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                    image_path = image_dir / image_filename
+
+                    # Save image
+                    page.save_image(
+                        str(image_path), resolution=image_resolution, include_highlights=True
+                    )
+
+                    # Add relative path to data
+                    page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                except Exception as e:
+                    logger.error(f"Error saving image for page {page.number}: {e}")
+                    page_data["image_path"] = None
+
+            # Add analyses data
+            if hasattr(page, "analyses") and page.analyses:
+                for key in analysis_keys:
+                    if key not in page.analyses:
+                        raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                    # Get the analysis result
+                    analysis_result = page.analyses[key]
+
+                    # If the result has a to_dict method, use it
+                    if hasattr(analysis_result, "to_dict"):
+                        analysis_data = analysis_result.to_dict()
+                    else:
+                        # Otherwise, use the result directly if it's dict-like
+                        try:
+                            analysis_data = dict(analysis_result)
+                        except (TypeError, ValueError):
+                            # Last resort: convert to string
+                            analysis_data = {"raw_result": str(analysis_result)}
+
+                    # Add analysis data to page data with the key as prefix
+                    for k, v in analysis_data.items():
+                        page_data[f"{key}.{k}"] = v
+
+            all_data.append(page_data)
+
+        return all_data
+
+    # --- Deskew Method --- #
+
+    def deskew(
+        self,
+        resolution: int = 300,
+        detection_resolution: int = 72,
+        force_overwrite: bool = False,
+        **deskew_kwargs,
+    ) -> "PDF":  # Changed return type
+        """
+        Creates a new, in-memory PDF object containing deskewed versions of the pages
+        in this collection.
+
+        This method delegates the actual processing to the parent PDF object's
+        `deskew` method.
+
+        Important: The returned PDF is image-based. Any existing text, OCR results,
+        annotations, or other elements from the original pages will *not* be carried over.
+
+        Args:
+            resolution: DPI resolution for rendering the output deskewed pages.
+            detection_resolution: DPI resolution used for skew detection if angles are not
+                                  already cached on the page objects.
+            force_overwrite: If False (default), raises a ValueError if any target page
+                             already contains processed elements (text, OCR, regions) to
+                             prevent accidental data loss. Set to True to proceed anyway.
+            **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                             during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+        Returns:
+            A new PDF object representing the deskewed document.
+
+        Raises:
+            ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+            ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                        or if the collection is empty.
+            RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+        """
+        if not self.pages:
+            logger.warning("Cannot deskew an empty PageCollection.")
+            raise ValueError("Cannot deskew an empty PageCollection.")
+
+        # Assume all pages share the same parent PDF object
+        # Need to hint the type of _parent for type checkers
+        if TYPE_CHECKING:
+            parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+        else:
+            parent_pdf = self.pages[0]._parent
+
+        if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+            raise RuntimeError(
+                "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+            )
+
+        # Get the 0-based indices of the pages in this collection
+        page_indices = [p.index for p in self.pages]
+        logger.info(
+            f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+        )
+
+        # Delegate the call to the parent PDF object for the relevant pages
+        # Pass all relevant arguments through (no output_path anymore)
+        return parent_pdf.deskew(
+            pages=page_indices,
+            resolution=resolution,
+            detection_resolution=detection_resolution,
+            force_overwrite=force_overwrite,
+            **deskew_kwargs,
+        )
+
+    # --- End Deskew Method --- #
```
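A usage sketch for the new `deskew` delegation; the file name and `max_angle` value are illustrative, the optional `deskew`/`img2pdf` dependencies must be installed, and the follow-up `apply_ocr` call on the returned PDF is assumed from the wider API:

```python
from natural_pdf import PDF  # assumes natural-pdf 0.1.9 with deskew extras

pdf = PDF("tilted-scan.pdf")  # hypothetical file

# Returns a new, image-only PDF; text/OCR/regions are not carried over
straightened = pdf.pages.deskew(
    resolution=300,            # render DPI for the output pages
    detection_resolution=72,   # DPI used when detecting skew angles
    force_overwrite=True,      # proceed even if pages already hold elements
    max_angle=10,              # forwarded to deskew.determine_skew
)

straightened.apply_ocr(engine="easyocr")  # typical follow-up, since text was dropped
```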