natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/collections.py
@@ -1,32 +1,42 @@
  import logging
+ from collections.abc import MutableSequence
+ from pathlib import Path
  from typing import (
      TYPE_CHECKING,
      Any,
      Callable,
      Dict,
      Generic,
+     Iterable,
      Iterator,
      List,
      Optional,
+     Sequence,
      Tuple,
+     Type,
      TypeVar,
      Union,
-     Iterable,
+     overload,
  )

  from pdfplumber.utils.geometry import objects_to_bbox
- from tqdm.auto import tqdm
+ from PIL import Image, ImageDraw, ImageFont

  # New Imports
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+ from tqdm.auto import tqdm

+ from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.classification.mixin import ClassificationMixin
+ from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
+ from natural_pdf.core.pdf import PDF
+ from natural_pdf.elements.base import Element
+ from natural_pdf.elements.region import Region
  from natural_pdf.elements.text import TextElement
+ from natural_pdf.export.mixin import ExportMixin
  from natural_pdf.ocr import OCROptions
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
- from natural_pdf.classification.mixin import ClassificationMixin
- from natural_pdf.classification.manager import ClassificationManager
- from natural_pdf.collections.mixins import ApplyMixin
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

  logger = logging.getLogger(__name__)

@@ -38,7 +48,9 @@ T = TypeVar("T")
  P = TypeVar("P", bound="Page")


- class ElementCollection(Generic[T], ApplyMixin):
+ class ElementCollection(
+     Generic[T], ApplyMixin, ExportMixin, DirectionalCollectionMixin, MutableSequence
+ ):
      """
      Collection of PDF elements with batch operations.
      """
@@ -60,10 +72,6 @@ class ElementCollection(Generic[T], ApplyMixin):
          """Get an element by index."""
          return self._elements[index]

-     def __iter__(self):
-         """Iterate over elements."""
-         return iter(self._elements)
-
      def __repr__(self) -> str:
          """Return a string representation showing the element count."""
          element_type = "Mixed"
@@ -73,6 +81,20 @@ class ElementCollection(Generic[T], ApplyMixin):
                  element_type = types.pop()
          return f"<ElementCollection[{element_type}](count={len(self)})>"

+     def __add__(self, other: "ElementCollection") -> "ElementCollection":
+         if not isinstance(other, ElementCollection):
+             return NotImplemented
+         return ElementCollection(self._elements + other._elements)
+
+     def __setitem__(self, index, value):
+         self._elements[index] = value
+
+     def __delitem__(self, index):
+         del self._elements[index]
+
+     def insert(self, index, value):
+         self._elements.insert(index, value)
+
      @property
      def elements(self) -> List["Element"]:
          """Get the elements in this collection."""
@@ -125,9 +147,7 @@ class ElementCollection(Generic[T], ApplyMixin):

          # Check if any element is from a different PDF
          return any(
-             hasattr(e, "page") and
-             hasattr(e.page, "pdf") and
-             e.page.pdf is not first_pdf
+             hasattr(e, "page") and hasattr(e.page, "pdf") and e.page.pdf is not first_pdf
              for e in self._elements
          )

@@ -1113,62 +1133,23 @@ class ElementCollection(Generic[T], ApplyMixin):
              logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
              return None

-     def find_all(
-         self, selector: str, regex: bool = False, case: bool = True, **kwargs
-     ) -> "ElementCollection[T]":
+     def find(self, selector: str, **kwargs) -> "ElementCollection":
          """
-         Filter elements within this collection matching the selector.
+         Find elements in this collection matching the selector.

          Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             A new ElementCollection containing only the matching elements from this collection.
+             selector: CSS-like selector string
+             apply_exclusions: Whether to exclude elements in exclusion regions
          """
-         if not self._elements:
-             return ElementCollection([])
+         return self.apply(lambda element: element.find(selector, **kwargs))

-         try:
-             selector_obj = parse_selector(selector)
-         except Exception as e:
-             logger.error(f"Error parsing selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on parse error
-
-         # Pass regex and case flags to selector function generator
-         kwargs["regex"] = regex
-         kwargs["case"] = case
-
-         try:
-             filter_func = selector_to_filter_func(selector_obj, **kwargs)
-         except Exception as e:
-             logger.error(f"Error creating filter function for selector '{selector}': {e}")
-             return ElementCollection([])  # Return empty on filter creation error
-
-         matching_elements = [element for element in self._elements if filter_func(element)]
-
-         # Note: Unlike Page.find_all, this doesn't re-sort.
-         # Sorting should be done explicitly on the collection if needed.
-
-         return ElementCollection(matching_elements)
-
-     def find(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> Optional[T]:
+     def extract_each_text(self, **kwargs) -> List[str]:
          """
-         Find the first element within this collection matching the selector.
-
-         Args:
-             selector: CSS-like selector string.
-             regex: Whether to use regex for text search in :contains (default: False).
-             case: Whether to do case-sensitive text search (default: True).
-             **kwargs: Additional filter parameters passed to the selector function.
-
-         Returns:
-             The first matching element or None.
+         Extract text from each element in this region.
          """
-         results = self.find_all(selector, regex=regex, case=case, **kwargs)
-         return results.first
+         return self.apply(
+             lambda element: element.extract_text(**kwargs) if element is not None else None
+         )

      def correct_ocr(
          self,
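Behavioral note on the hunk above: `ElementCollection.find` no longer filters the collection itself (that was the removed `find_all` body); it now maps `.find(...)` over each member via `ApplyMixin.apply`, one result per member, and `extract_each_text` does the same for text. A sketch, assuming the members are `Region` objects that implement `find`/`extract_text` (file name illustrative):

from natural_pdf import PDF

pdf = PDF("example.pdf")             # hypothetical input file
page = pdf.pages[0]
page.analyze_layout()                # populate detected layout regions
regions = page.find_all("region")    # collection whose members are Regions

first_texts = regions.find("text")           # one first-match (or None) per region
texts = regions.extract_each_text()          # one string (or None) per region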
@@ -1214,23 +1195,23 @@ class ElementCollection(Generic[T], ApplyMixin):
      def remove(self) -> int:
          """
          Remove all elements in this collection from their respective pages.
-
+
          This method removes elements from the page's _element_mgr storage.
          It's particularly useful for removing OCR elements before applying new OCR.
-
+
          Returns:
              int: Number of elements successfully removed
          """
          if not self._elements:
              return 0
-
+
          removed_count = 0
-
+
          for element in self._elements:
              # Each element should have a reference to its page
              if hasattr(element, "page") and hasattr(element.page, "_element_mgr"):
                  element_mgr = element.page._element_mgr
-
+
                  # Determine element type
                  element_type = getattr(element, "object_type", None)
                  if element_type:
@@ -1243,7 +1224,7 @@ class ElementCollection(Generic[T], ApplyMixin):
                          element_type = "rects"
                      elif element_type == "line":
                          element_type = "lines"
-
+
                  # Try to remove from the element manager
                  if hasattr(element_mgr, "remove_element"):
                      success = element_mgr.remove_element(element, element_type)
@@ -1253,27 +1234,27 @@ class ElementCollection(Generic[T], ApplyMixin):
                      logger.warning("ElementManager does not have remove_element method")
              else:
                  logger.warning(f"Element has no page or page has no _element_mgr: {element}")
-
+
          return removed_count

      # --- Classification Method --- #
      def classify_all(
          self,
-         categories: List[str],
+         labels: List[str],
          model: Optional[str] = None,
          using: Optional[str] = None,
          min_confidence: float = 0.0,
-         analysis_key: str = 'classification',
+         analysis_key: str = "classification",
          multi_label: bool = False,
          batch_size: int = 8,
          max_workers: Optional[int] = None,
          progress_bar: bool = True,
-         **kwargs
+         **kwargs,
      ):
          """Classifies all elements in the collection in batch.

          Args:
-             categories: List of category labels.
+             labels: List of category labels.
              model: Model ID (or alias 'text', 'vision').
              using: Optional processing mode ('text' or 'vision'). Inferred if None.
              min_confidence: Minimum confidence threshold.
@@ -1292,21 +1273,21 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Requires access to the PDF's manager. Assume first element has it.
          first_element = self.elements[0]
          manager_source = None
-         if hasattr(first_element, 'page') and hasattr(first_element.page, 'pdf'):
-             manager_source = first_element.page.pdf
-         elif hasattr(first_element, 'pdf'):  # Maybe it's a PageCollection?
-             manager_source = first_element.pdf
-
-         if not manager_source or not hasattr(manager_source, 'get_manager'):
-             raise RuntimeError("Cannot access ClassificationManager via elements.")
+         if hasattr(first_element, "page") and hasattr(first_element.page, "pdf"):
+             manager_source = first_element.page.pdf
+         elif hasattr(first_element, "pdf"):  # Maybe it's a PageCollection?
+             manager_source = first_element.pdf
+
+         if not manager_source or not hasattr(manager_source, "get_manager"):
+             raise RuntimeError("Cannot access ClassificationManager via elements.")

          try:
-             manager = manager_source.get_manager('classification')
+             manager = manager_source.get_manager("classification")
          except Exception as e:
-             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e
+             raise RuntimeError(f"Failed to get ClassificationManager: {e}") from e

          if not manager or not manager.is_available():
-             raise RuntimeError("ClassificationManager is not available.")
+             raise RuntimeError("ClassificationManager is not available.")

          # Determine engine type early for content gathering
          inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
@@ -1314,60 +1295,187 @@ class ElementCollection(Generic[T], ApplyMixin):
          # Gather content from all elements
          items_to_classify: List[Tuple[Any, Union[str, Image.Image]]] = []
          original_elements: List[Any] = []
-         logger.info(f"Gathering content for {len(self.elements)} elements for batch classification...")
+         logger.info(
+             f"Gathering content for {len(self.elements)} elements for batch classification..."
+         )
          for element in self.elements:
-             if not isinstance(element, ClassificationMixin):
-                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
-                 continue
-             try:
-                 # Delegate content fetching to the element itself
-                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
-                 items_to_classify.append(content)
-                 original_elements.append(element)
-             except (ValueError, NotImplementedError) as e:
-                 logger.warning(f"Skipping element {element!r}: Cannot get content for classification - {e}")
-             except Exception as e:
-                 logger.warning(f"Skipping element {element!r}: Error getting classification content - {e}")
+             if not isinstance(element, ClassificationMixin):
+                 logger.warning(f"Skipping element (not ClassificationMixin): {element!r}")
+                 continue
+             try:
+                 # Delegate content fetching to the element itself
+                 content = element._get_classification_content(model_type=inferred_using, **kwargs)
+                 items_to_classify.append(content)
+                 original_elements.append(element)
+             except (ValueError, NotImplementedError) as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Cannot get content for classification - {e}"
+                 )
+             except Exception as e:
+                 logger.warning(
+                     f"Skipping element {element!r}: Error getting classification content - {e}"
+                 )

          if not items_to_classify:
-             logger.warning("No content could be gathered from elements for batch classification.")
-             return self
+             logger.warning("No content could be gathered from elements for batch classification.")
+             return self

-         logger.info(f"Collected content for {len(items_to_classify)} elements. Running batch classification...")
+         logger.info(
+             f"Collected content for {len(items_to_classify)} elements. Running batch classification..."
+         )

          # Call manager's batch classify
          batch_results: List[ClassificationResult] = manager.classify_batch(
              item_contents=items_to_classify,
-             categories=categories,
+             labels=labels,
              model_id=model,
              using=inferred_using,
              min_confidence=min_confidence,
              multi_label=multi_label,
              batch_size=batch_size,
              progress_bar=progress_bar,
-             **kwargs
+             **kwargs,
          )

          # Assign results back to elements
          if len(batch_results) != len(original_elements):
-             logger.error(
-                 f"Batch classification result count ({len(batch_results)}) mismatch "
-                 f"with elements processed ({len(original_elements)}). Cannot assign results."
-             )
-             # Decide how to handle mismatch - maybe store errors?
+             logger.error(
+                 f"Batch classification result count ({len(batch_results)}) mismatch "
+                 f"with elements processed ({len(original_elements)}). Cannot assign results."
+             )
+             # Decide how to handle mismatch - maybe store errors?
          else:
-             logger.info(f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'.")
-             for element, result_obj in zip(original_elements, batch_results):
-                 try:
-                     if not hasattr(element, 'analyses') or element.analyses is None:
-                         element.analyses = {}
-                     element.analyses[analysis_key] = result_obj
-                 except Exception as e:
-                     logger.warning(f"Failed to store classification result for {element!r}: {e}")
+             logger.info(
+                 f"Assigning {len(batch_results)} results to elements under key '{analysis_key}'."
+             )
+             for element, result_obj in zip(original_elements, batch_results):
+                 try:
+                     if not hasattr(element, "analyses") or element.analyses is None:
+                         element.analyses = {}
+                     element.analyses[analysis_key] = result_obj
+                 except Exception as e:
+                     logger.warning(f"Failed to store classification result for {element!r}: {e}")

          return self
+
      # --- End Classification Method --- #

+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all elements in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No elements found in collection")
+             return []
+
+         all_data = []
+
+         for i, element in enumerate(self.elements):
+             # Base element information
+             element_data = {
+                 "element_index": i,
+                 "element_type": getattr(element, "type", type(element).__name__),
+             }
+
+             # Add geometry if available
+             for attr in ["x0", "top", "x1", "bottom", "width", "height"]:
+                 if hasattr(element, attr):
+                     element_data[attr] = getattr(element, attr)
+
+             # Add page information if available
+             if hasattr(element, "page"):
+                 page = element.page
+                 if page:
+                     element_data["page_number"] = getattr(page, "number", None)
+                     element_data["pdf_path"] = (
+                         getattr(page.pdf, "path", None) if hasattr(page, "pdf") else None
+                     )
+
+             # Include extracted text if requested
+             if include_content and hasattr(element, "extract_text"):
+                 try:
+                     element_data["content"] = element.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from element {i}: {e}")
+                     element_data["content"] = ""
+
+             # Save image if requested
+             if include_images and hasattr(element, "to_image"):
+                 try:
+                     # Create identifier for the element
+                     pdf_name = "unknown"
+                     page_num = "unknown"
+
+                     if hasattr(element, "page") and element.page:
+                         page_num = element.page.number
+                         if hasattr(element.page, "pdf") and element.page.pdf:
+                             pdf_name = Path(element.page.pdf.path).stem
+
+                     # Create image filename
+                     element_type = element_data.get("element_type", "element").lower()
+                     image_filename = f"{pdf_name}_page{page_num}_{element_type}_{i}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     element.to_image(
+                         path=str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for element {i}: {e}")
+                     element_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(element, "analyses"):
+                 for key in analysis_keys:
+                     if key not in element.analyses:
+                         # Skip this key if it doesn't exist - elements might have different analyses
+                         logger.warning(f"Analysis key '{key}' not found in element {i}")
+                         continue
+
+                     # Get the analysis result
+                     analysis_result = element.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to element data with the key as prefix
+                     for k, v in analysis_data.items():
+                         element_data[f"{key}.{k}"] = v
+
+             all_data.append(element_data)
+
+         return all_data
+

  class PageCollection(Generic[P], ApplyMixin):
      """
@@ -1500,39 +1608,127 @@ class PageCollection(Generic[P], ApplyMixin):

          return self  # Return self for chaining

-     def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
+     @overload
+     def find(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     @overload
+     def find(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]: ...
+
+     def find(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> Optional[T]:
          """
-         Find the first element matching the selector across all pages.
+         Find the first element matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             First matching element or None
+             First matching element or None.
          """
+         # Input validation happens within page.find
          for page in self.pages:
-             element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
+             element = page.find(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if element:
                  return element
          return None

-     def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
+     @overload
+     def find_all(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     @overload
+     def find_all(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     def find_all(
+         self,
+         selector: Optional[str] = None,
+         *,
+         text: Optional[str] = None,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection":
          """
-         Find all elements matching the selector across all pages.
+         Find all elements matching the selector OR text across all pages in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.

          Args:
-             selector: CSS-like selector string
-             apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
-             **kwargs: Additional filter parameters
+             selector: CSS-like selector string.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional filter parameters.

          Returns:
-             ElementCollection with matching elements from all pages
+             ElementCollection with matching elements from all pages.
          """
          all_elements = []
+         # Input validation happens within page.find_all
          for page in self.pages:
-             elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
+             elements = page.find_all(
+                 selector=selector,
+                 text=text,
+                 apply_exclusions=apply_exclusions,
+                 regex=regex,
+                 case=case,
+                 **kwargs,
+             )
              if elements:
                  all_elements.extend(elements.elements)

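The overloads above encode that `selector` and `text` are mutually exclusive entry points: `text=...` is sugar for a `text:contains(...)` selector, and `regex`/`case` apply to either path. Both call styles, sketched (file name and search strings illustrative):

from natural_pdf import PDF

pages = PDF("example.pdf").pages    # a PageCollection

# Selector path, as before:
totals = pages.find_all('text:contains("Total")', case=False)
# New text= shorthand:
first = pages.find(text="Total", regex=False, case=False)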
@@ -1571,10 +1767,14 @@ class PageCollection(Generic[P], ApplyMixin):

          # Assume all pages share the same parent PDF object
          parent_pdf = self.pages[0]._parent
-         if not parent_pdf or not hasattr(parent_pdf, 'correct_ocr') or not callable(parent_pdf.correct_ocr):
-             raise RuntimeError(
-                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
-             )
+         if (
+             not parent_pdf
+             or not hasattr(parent_pdf, "correct_ocr")
+             or not callable(parent_pdf.correct_ocr)
+         ):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
+             )

          page_indices = [p.index for p in self.pages]
          logger.info(
@@ -1586,7 +1786,7 @@ class PageCollection(Generic[P], ApplyMixin):
          parent_pdf.correct_ocr(
              correction_callback=correction_callback,
              pages=page_indices,
-             max_workers=max_workers  # Pass it here
+             max_workers=max_workers,  # Pass it here
          )

          return self
@@ -1891,3 +2091,279 @@ class PageCollection(Generic[P], ApplyMixin):
              sections.append(region)

          return sections
+
+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all pages in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self.elements:
+             logger.warning("No pages found in collection")
+             return []
+
+         all_data = []
+
+         for page in self.elements:
+             # Basic page information
+             page_data = {
+                 "page_number": page.number,
+                 "page_index": page.index,
+                 "width": page.width,
+                 "height": page.height,
+             }
+
+             # Add PDF information if available
+             if hasattr(page, "pdf") and page.pdf:
+                 page_data["pdf_path"] = page.pdf.path
+                 page_data["pdf_filename"] = Path(page.pdf.path).name
+
+             # Include extracted text if requested
+             if include_content:
+                 try:
+                     page_data["content"] = page.extract_text(preserve_whitespace=True)
+                 except Exception as e:
+                     logger.error(f"Error extracting text from page {page.number}: {e}")
+                     page_data["content"] = ""
+
+             # Save image if requested
+             if include_images:
+                 try:
+                     # Create image filename
+                     pdf_name = "unknown"
+                     if hasattr(page, "pdf") and page.pdf:
+                         pdf_name = Path(page.pdf.path).stem
+
+                     image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
+                     image_path = image_dir / image_filename
+
+                     # Save image
+                     page.save_image(
+                         str(image_path), resolution=image_resolution, include_highlights=True
+                     )
+
+                     # Add relative path to data
+                     page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
+                 except Exception as e:
+                     logger.error(f"Error saving image for page {page.number}: {e}")
+                     page_data["image_path"] = None
+
+             # Add analyses data
+             if hasattr(page, "analyses") and page.analyses:
+                 for key in analysis_keys:
+                     if key not in page.analyses:
+                         raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
+
+                     # Get the analysis result
+                     analysis_result = page.analyses[key]
+
+                     # If the result has a to_dict method, use it
+                     if hasattr(analysis_result, "to_dict"):
+                         analysis_data = analysis_result.to_dict()
+                     else:
+                         # Otherwise, use the result directly if it's dict-like
+                         try:
+                             analysis_data = dict(analysis_result)
+                         except (TypeError, ValueError):
+                             # Last resort: convert to string
+                             analysis_data = {"raw_result": str(analysis_result)}
+
+                     # Add analysis data to page data with the key as prefix
+                     for k, v in analysis_data.items():
+                         page_data[f"{key}.{k}"] = v
+
+             all_data.append(page_data)
+
+         return all_data
+
+     # --- Deskew Method --- #
+
+     def deskew(
+         self,
+         resolution: int = 300,
+         detection_resolution: int = 72,
+         force_overwrite: bool = False,
+         **deskew_kwargs,
+     ) -> "PDF":  # Changed return type
+         """
+         Creates a new, in-memory PDF object containing deskewed versions of the pages
+         in this collection.
+
+         This method delegates the actual processing to the parent PDF object's
+         `deskew` method.
+
+         Important: The returned PDF is image-based. Any existing text, OCR results,
+         annotations, or other elements from the original pages will *not* be carried over.
+
+         Args:
+             resolution: DPI resolution for rendering the output deskewed pages.
+             detection_resolution: DPI resolution used for skew detection if angles are not
+                 already cached on the page objects.
+             force_overwrite: If False (default), raises a ValueError if any target page
+                 already contains processed elements (text, OCR, regions) to
+                 prevent accidental data loss. Set to True to proceed anyway.
+             **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
+                 during automatic detection (e.g., `max_angle`, `num_peaks`).
+
+         Returns:
+             A new PDF object representing the deskewed document.
+
+         Raises:
+             ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
+             ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
+                 or if the collection is empty.
+             RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
+         """
+         if not self.pages:
+             logger.warning("Cannot deskew an empty PageCollection.")
+             raise ValueError("Cannot deskew an empty PageCollection.")
+
+         # Assume all pages share the same parent PDF object
+         # Need to hint the type of _parent for type checkers
+         if TYPE_CHECKING:
+             parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
+         else:
+             parent_pdf = self.pages[0]._parent
+
+         if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
+             raise RuntimeError(
+                 "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
+             )
+
+         # Get the 0-based indices of the pages in this collection
+         page_indices = [p.index for p in self.pages]
+         logger.info(
+             f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
+         )
+
+         # Delegate the call to the parent PDF object for the relevant pages
+         # Pass all relevant arguments through (no output_path anymore)
+         return parent_pdf.deskew(
+             pages=page_indices,
+             resolution=resolution,
+             detection_resolution=detection_resolution,
+             force_overwrite=force_overwrite,
+             **deskew_kwargs,
+         )
+
+     # --- End Deskew Method --- #
+
+     def to_image(
+         self,
+         page_width: int = 300,
+         cols: Optional[int] = 4,
+         rows: Optional[int] = None,
+         max_pages: Optional[int] = None,
+         spacing: int = 10,
+         add_labels: bool = True,
+         show_category: bool = False,  # Add new flag
+     ) -> Optional["Image.Image"]:
+         """
+         Generate a grid of page images for this collection.
+
+         Args:
+             page_width: Width in pixels for rendering individual pages
+             cols: Number of columns in grid (default: 4)
+             rows: Number of rows in grid (calculated automatically if None)
+             max_pages: Maximum number of pages to include (default: all)
+             spacing: Spacing between page thumbnails in pixels
+             add_labels: Whether to add page number labels
+             show_category: Whether to add category and confidence labels (if available)
+
+         Returns:
+             PIL Image of the page grid or None if no pages
+         """
+         if not self.pages:
+             logger.warning("Cannot generate image for empty PageCollection")
+             return None
+
+         # Limit pages if max_pages is specified
+         pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+         # Load font once outside the loop
+         font = ImageFont.load_default(16) if add_labels else None
+
+         # Render individual page images
+         page_images = []
+         for page in pages_to_render:
+             img = page.to_image(width=page_width)
+
+             # Add page number label
+             if add_labels and font:  # Check if font was loaded
+                 draw = ImageDraw.Draw(img)
+                 pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                 label_text = f"p{page.number} - {pdf_name}"
+
+                 # Add category if requested and available
+                 if show_category:
+                     category = getattr(page, "category", None)
+                     confidence = getattr(page, "category_confidence", None)
+                     if category is not None and confidence is not None:
+                         category_str = f"{category} {confidence:.3f}"
+                         label_text += f"\n{category_str}"
+
+                 # Calculate bounding box for multi-line text
+                 # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                 # Use multiline_textbbox for accurate bounds with newlines
+                 bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                 # Add padding to the calculated bbox for the white background
+                 bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                 # Draw white background rectangle
+                 draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                 # Draw the potentially multi-line text using multiline_text
+                 draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+             page_images.append(img)
+
+         # Calculate grid dimensions if not provided
+         if not rows and not cols:
+             # Default to a square-ish grid
+             cols = min(4, int(len(page_images) ** 0.5) + 1)
+             rows = (len(page_images) + cols - 1) // cols
+         elif rows and not cols:
+             cols = (len(page_images) + rows - 1) // rows
+         elif cols and not rows:
+             rows = (len(page_images) + cols - 1) // cols
+
+         # Get maximum dimensions for consistent grid cells
+         max_width = max(img.width for img in page_images)
+         max_height = max(img.height for img in page_images)
+
+         # Create grid image
+         grid_width = cols * max_width + (cols + 1) * spacing
+         grid_height = rows * max_height + (rows + 1) * spacing
+         grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+         # Place images in grid
+         for i, img in enumerate(page_images):
+             if i >= rows * cols:
+                 break
+
+             row = i // cols
+             col = i % cols
+
+             x = col * max_width + (col + 1) * spacing
+             y = row * max_height + (row + 1) * spacing
+
+             grid_img.paste(img, (x, y))
+
+         return grid_img
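To close out the new `PageCollection` surface: `deskew` delegates to `PDF.deskew` and returns an image-only PDF, while `to_image` composes a PIL thumbnail grid locally. A final sketch, assuming a skewed scan and the optional `deskew`/`img2pdf` dependencies (file names illustrative):

from natural_pdf import PDF

pdf = PDF("scanned.pdf")                        # hypothetical skewed scan
subset = pdf.pages[:4]                          # slicing yields a PageCollection

grid = subset.to_image(page_width=200, cols=2)  # PIL grid, or None if empty
if grid:
    grid.save("pages_grid.png")

straightened = subset.deskew(resolution=300)    # new in-memory, image-based PDF
# The result carries no text elements; re-run OCR before extracting text.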