PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +188 -82
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +132 -16
natural_pdf/core/pdf.py +486 -71
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +238 -111
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.32.dist-info/RECORD +0 -118
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -7,6 +7,7 @@ import tempfile
 import threading
 import time
 import urllib.request
+import weakref
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
@@ -23,9 +24,7 @@ from typing import (
 )
 import pdfplumber
-from PIL import Image
 from tqdm.auto import tqdm
-import weakref
 from natural_pdf.analyzers.layout.layout_manager import LayoutManager
 from natural_pdf.classification.manager import ClassificationError
@@ -73,11 +72,14 @@ except ImportError:
 logger = logging.getLogger("natural_pdf.core.pdf")
 def _get_classification_manager_class():
     """Lazy import for ClassificationManager."""
     from natural_pdf.classification.manager import ClassificationManager
     return ClassificationManager
 DEFAULT_MANAGERS = {
     "classification": _get_classification_manager_class,
     "structured_data": StructuredDataManager,
@@ -100,20 +102,51 @@ except ImportError:
 # --- Lazy Page List Helper --- #
 from collections.abc import Sequence
 class _LazyPageList(Sequence):
     """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
+    This class implements the Sequence protocol to provide list-like access to PDF pages
+    while minimizing memory usage. Pages are only created when accessed, and once created,
+    they are cached for subsequent access. This design allows efficient handling of large
+    PDF documents without loading all pages into memory immediately.
     The sequence holds `None` placeholders until an index is accessed, at which point
-    a real `Page` object is created, cached, and returned.  Slices and iteration are
-    also supported and will materialise pages on demand.
+    a real `Page` object is created, cached, and returned. Slices and iteration are
+    also supported and will materialize pages on demand.
+    Attributes:
+        _parent_pdf: Reference to the parent PDF object.
+        _plumber_pdf: Underlying pdfplumber PDF object.
+        _font_attrs: Font attributes to use when creating pages.
+        _cache: List of cached Page objects (None until accessed).
+        _load_text: Whether to load text layer when creating pages.
+    Example:
+        ```python
+        # Access is transparent - pages created on demand
+        pdf = npdf.PDF("document.pdf")
+        first_page = pdf.pages[0]  # Creates Page object here
+        last_page = pdf.pages[-1]  # Creates another Page object
+        # Slicing works too
+        first_three = pdf.pages[0:3]  # Creates 3 Page objects
+        # Iteration creates all pages
+        for page in pdf.pages:  # Each page created as needed
+            print(f"Page {page.index}")
+        ```
     """
-    def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
+    def __init__(
+        self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
+    ):
         self._parent_pdf = parent_pdf
         self._plumber_pdf = plumber_pdf
         self._font_attrs = font_attrs
         # One slot per pdfplumber page – initially all None
         self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
+        self._load_text = load_text
     # Internal helper -----------------------------------------------------
     def _create_page(self, index: int) -> "Page":
@@ -123,7 +156,13 @@ class _LazyPageList(Sequence):
             from natural_pdf.core.page import Page
             plumber_page = self._plumber_pdf.pages[index]
-            cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
+            cached = Page(
+                plumber_page,
+                parent=self._parent_pdf,
+                index=index,
+                font_attrs=self._font_attrs,
+                load_text=self._load_text,
+            )
             self._cache[index] = cached
         return cached
@@ -152,14 +191,44 @@ class _LazyPageList(Sequence):
     def __repr__(self) -> str:  # pragma: no cover
         return f"<_LazyPageList(len={len(self)})>"
 # --- End Lazy Page List Helper --- #
 class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
-    """
-    Enhanced PDF wrapper built on top of pdfplumber.
+    """Enhanced PDF wrapper built on top of pdfplumber.
     This class provides a fluent interface for working with PDF documents,
-    with improved selection, navigation, and extraction capabilities.
+    with improved selection, navigation, and extraction capabilities. It integrates
+    OCR, layout analysis, and AI-powered data extraction features while maintaining
+    compatibility with the underlying pdfplumber API.
+    The PDF class supports loading from files, URLs, or streams, and provides
+    spatial navigation, element selection with CSS-like selectors, and advanced
+    document processing workflows including multi-page content flows.
+    Attributes:
+        pages: Lazy-loaded list of Page objects for document pages.
+        path: Resolved path to the PDF file or source identifier.
+        source_path: Original path, URL, or stream identifier provided during initialization.
+        highlighter: Service for rendering highlighted visualizations of document content.
+    Example:
+        Basic usage:
+        ```python
+        import natural_pdf as npdf
+        pdf = npdf.PDF("document.pdf")
+        page = pdf.pages[0]
+        text_elements = page.find_all('text:contains("Summary")')
+        ```
+        Advanced usage with OCR:
+        ```python
+        pdf = npdf.PDF("scanned_document.pdf")
+        pdf.apply_ocr(engine="easyocr", resolution=144)
+        tables = pdf.pages[0].find_all('table')
+        ```
     """
     def __init__(
@@ -170,22 +239,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         keep_spaces: bool = True,
         text_tolerance: Optional[dict] = None,
         auto_text_tolerance: bool = True,
+        text_layer: bool = True,
     ):
-        """
-        Initialize the enhanced PDF object.
+        """Initialize the enhanced PDF object.
         Args:
-            path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
-            reading_order: Whether to use natural reading order
-            font_attrs: Font attributes for grouping characters into words
-            keep_spaces: Whether to include spaces in word elements
-            text_tolerance: PDFplumber-style tolerance settings
-            auto_text_tolerance: Whether to automatically scale text tolerance
+            path_or_url_or_stream: Path to the PDF file (str/Path), a URL (str),
+                or a file-like object (stream). URLs must start with 'http://' or 'https://'.
+            reading_order: If True, use natural reading order for text extraction.
+                Defaults to True.
+            font_attrs: List of font attributes for grouping characters into words.
+                Common attributes include ['fontname', 'size']. Defaults to None.
+            keep_spaces: If True, include spaces in word elements during text extraction.
+                Defaults to True.
+            text_tolerance: PDFplumber-style tolerance settings for text grouping.
+                Dictionary with keys like 'x_tolerance', 'y_tolerance'. Defaults to None.
+            auto_text_tolerance: If True, automatically scale text tolerance based on
+                font size and document characteristics. Defaults to True.
+            text_layer: If True, preserve existing text layer from the PDF. If False,
+                removes all existing text elements during initialization, useful for
+                OCR-only workflows. Defaults to True.
+        Raises:
+            TypeError: If path_or_url_or_stream is not a valid type.
+            IOError: If the PDF file cannot be opened or read.
+            ValueError: If URL download fails.
+        Example:
+            ```python
+            # From file path
+            pdf = npdf.PDF("document.pdf")
+            # From URL
+            pdf = npdf.PDF("https://example.com/document.pdf")
+            # From stream
+            with open("document.pdf", "rb") as f:
+                pdf = npdf.PDF(f)
+            # With custom settings
+            pdf = npdf.PDF("document.pdf",
+                          reading_order=False,
+                          text_layer=False,  # For OCR-only processing
+                          font_attrs=['fontname', 'size', 'flags'])
+            ```
         """
         self._original_path_or_stream = path_or_url_or_stream
         self._temp_file = None
         self._resolved_path = None
         self._is_stream = False
+        self._text_layer = text_layer
         stream_to_open = None
         if hasattr(path_or_url_or_stream, "read"):  # Check if it's file-like
@@ -257,7 +360,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._manager_registry = {}
         # Lazily instantiate pages only when accessed
-        self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
+        self._pages = _LazyPageList(
+            self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer
+        )
         self._element_cache = {}
         self._exclusions = []
@@ -267,6 +372,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._initialize_managers()
         self._initialize_highlighter()
+        # Remove text layer if requested
+        if not self._text_layer:
+            logger.info("Removing text layer as requested (text_layer=False)")
+            # Text layer is not loaded when text_layer=False, so no need to remove
+            pass
         # Analysis results accessed via self.analyses property (see below)
         # --- Automatic cleanup when object is garbage-collected ---
@@ -303,7 +415,30 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self._managers = {}  # Will hold instantiated managers
     def get_manager(self, key: str) -> Any:
-        """Retrieve a manager instance by its key, instantiating it lazily if needed."""
+        """Retrieve a manager instance by its key, instantiating it lazily if needed.
+        Managers are specialized components that handle specific functionality like
+        classification, structured data extraction, or OCR processing. They are
+        instantiated on-demand to minimize memory usage and startup time.
+        Args:
+            key: The manager key to retrieve. Common keys include 'classification'
+                and 'structured_data'.
+        Returns:
+            The manager instance for the specified key.
+        Raises:
+            KeyError: If no manager is registered for the given key.
+            RuntimeError: If the manager failed to initialize.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            classification_mgr = pdf.get_manager('classification')
+            structured_data_mgr = pdf.get_manager('structured_data')
+            ```
+        """
         # Check if already instantiated
         if key in self._managers:
             manager_instance = self._managers[key]
@@ -339,12 +474,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     @property
     def metadata(self) -> Dict[str, Any]:
-        """Access metadata as a dictionary."""
+        """Access PDF metadata as a dictionary.
+        Returns document metadata such as title, author, creation date, and other
+        properties embedded in the PDF file. The exact keys available depend on
+        what metadata was included when the PDF was created.
+        Returns:
+            Dictionary containing PDF metadata. Common keys include 'Title',
+            'Author', 'Subject', 'Creator', 'Producer', 'CreationDate', and
+            'ModDate'. May be empty if no metadata is available.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            print(pdf.metadata.get('Title', 'No title'))
+            print(f"Created: {pdf.metadata.get('CreationDate')}")
+            ```
+        """
         return self._pdf.metadata
     @property
     def pages(self) -> "PageCollection":
-        """Access pages as a PageCollection object."""
+        """Access pages as a PageCollection object.
+        Provides access to individual pages of the PDF document through a
+        collection interface that supports indexing, slicing, and iteration.
+        Pages are lazy-loaded to minimize memory usage.
+        Returns:
+            PageCollection object that provides list-like access to PDF pages.
+        Raises:
+            AttributeError: If PDF pages are not yet initialized.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            # Access individual pages
+            first_page = pdf.pages[0]
+            last_page = pdf.pages[-1]
+            # Slice pages
+            first_three = pdf.pages[0:3]
+            # Iterate over pages
+            for page in pdf.pages:
+                print(f"Page {page.index} has {len(page.chars)} characters")
+            ```
+        """
         from natural_pdf.elements.collections import PageCollection
         if not hasattr(self, "_pages"):
@@ -352,11 +531,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         return PageCollection(self._pages)
     def clear_exclusions(self) -> "PDF":
-        """
-        Clear all exclusion functions from the PDF.
+        """Clear all exclusion functions from the PDF.
+        Removes all previously added exclusion functions that were used to filter
+        out unwanted content (like headers, footers, or administrative text) from
+        text extraction and analysis operations.
         Returns:
-            Self for method chaining
+            Self for method chaining.
+        Raises:
+            AttributeError: If PDF pages are not yet initialized.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            pdf.add_exclusion(lambda page: page.find('text:contains("CONFIDENTIAL")').above())
+            # Later, remove all exclusions
+            pdf.clear_exclusions()
+            ```
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
@@ -369,16 +563,46 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     def add_exclusion(
         self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
     ) -> "PDF":
-        """
-        Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
+        """Add an exclusion function to the PDF.
+        Exclusion functions define regions of each page that should be ignored during
+        text extraction and analysis operations. This is useful for filtering out headers,
+        footers, watermarks, or other administrative content that shouldn't be included
+        in the main document processing.
         Args:
-            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
-            exclusion_func: A function that takes a Page and returns a Region to exclude, or None
-            label: Optional label for this exclusion
+            exclusion_func: A function that takes a Page object and returns a Region
+                to exclude from processing, or None if no exclusion should be applied
+                to that page. The function is called once per page.
+            label: Optional descriptive label for this exclusion rule, useful for
+                debugging and identification.
         Returns:
-            Self for method chaining
+            Self for method chaining.
+        Raises:
+            AttributeError: If PDF pages are not yet initialized.
+        Example:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+            # Exclude headers (top 50 points of each page)
+            pdf.add_exclusion(
+                lambda page: page.region(0, 0, page.width, 50),
+                label="header_exclusion"
+            )
+            # Exclude any text containing "CONFIDENTIAL"
+            pdf.add_exclusion(
+                lambda page: page.find('text:contains("CONFIDENTIAL")').above(include_source=True)
+                if page.find('text:contains("CONFIDENTIAL")') else None,
+                label="confidential_exclusion"
+            )
+            # Chain multiple exclusions
+            pdf.add_exclusion(header_func).add_exclusion(footer_func)
+            ```
         """
         if not hasattr(self, "_pages"):
             raise AttributeError("PDF pages not yet initialized.")
@@ -404,23 +628,74 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         options: Optional[Any] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
     ) -> "PDF":
-        """
-        Applies OCR to specified pages of the PDF using batch processing.
+        """Apply OCR to specified pages of the PDF using batch processing.
+        Performs optical character recognition on the specified pages, converting
+        image-based text into searchable and extractable text elements. This method
+        supports multiple OCR engines and provides batch processing for efficiency.
         Args:
-            engine: Name of the OCR engine
-            languages: List of language codes
-            min_confidence: Minimum confidence threshold
-            device: Device to run OCR on
-            resolution: DPI resolution for page images
-            apply_exclusions: Whether to mask excluded areas
-            detect_only: If True, only detect text boxes
-            replace: Whether to replace existing OCR elements
-            options: Engine-specific options
-            pages: Page indices to process or None for all pages
+            engine: Name of the OCR engine to use. Supported engines include
+                'easyocr' (default), 'surya', 'paddle', and 'doctr'. If None,
+                uses the global default from natural_pdf.options.ocr.engine.
+            languages: List of language codes for OCR recognition (e.g., ['en', 'es']).
+                If None, uses the global default from natural_pdf.options.ocr.languages.
+            min_confidence: Minimum confidence threshold (0.0-1.0) for accepting
+                OCR results. Text with lower confidence will be filtered out.
+                If None, uses the global default.
+            device: Device to run OCR on ('cpu', 'cuda', 'mps'). Engine-specific
+                availability varies. If None, uses engine defaults.
+            resolution: DPI resolution for rendering pages to images before OCR.
+                Higher values improve accuracy but increase processing time and memory.
+                Typical values: 150 (fast), 300 (balanced), 600 (high quality).
+            apply_exclusions: If True, mask excluded regions before OCR to prevent
+                processing of headers, footers, or other unwanted content.
+            detect_only: If True, only detect text bounding boxes without performing
+                character recognition. Useful for layout analysis workflows.
+            replace: If True, replace any existing OCR elements on the pages.
+                If False, append new OCR results to existing elements.
+            options: Engine-specific options object (e.g., EasyOCROptions, SuryaOptions).
+                Allows fine-tuning of engine behavior beyond common parameters.
+            pages: Page indices to process. Can be:
+                - None: Process all pages
+                - slice: Process a range of pages (e.g., slice(0, 10))
+                - Iterable[int]: Process specific page indices (e.g., [0, 2, 5])
         Returns:
-            Self for method chaining
+            Self for method chaining.
+        Raises:
+            ValueError: If invalid page index is provided.
+            TypeError: If pages parameter has invalid type.
+            RuntimeError: If OCR engine is not available or fails.
+        Example:
+            ```python
+            pdf = npdf.PDF("scanned_document.pdf")
+            # Basic OCR on all pages
+            pdf.apply_ocr()
+            # High-quality OCR with specific settings
+            pdf.apply_ocr(
+                engine='easyocr',
+                languages=['en', 'es'],
+                resolution=300,
+                min_confidence=0.8
+            )
+            # OCR specific pages only
+            pdf.apply_ocr(pages=[0, 1, 2])  # First 3 pages
+            pdf.apply_ocr(pages=slice(5, 10))  # Pages 5-9
+            # Detection-only workflow for layout analysis
+            pdf.apply_ocr(detect_only=True, resolution=150)
+            ```
+        Note:
+            OCR processing can be time and memory intensive, especially at high
+            resolutions. Consider using exclusions to mask unwanted regions and
+            processing pages in batches for large documents.
         """
         if not self._ocr_manager:
             logger.error("OCRManager not available. Cannot apply OCR.")
@@ -1013,10 +1288,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         **kwargs,
     ) -> Dict[str, Any]:
         """
-        Ask a question about the document content.
+        Ask a single question about the document content.
+        Args:
+            question: Question string to ask about the document
+            mode: "extractive" to extract answer from document, "generative" to generate
+            pages: Specific pages to query (default: all pages)
+            min_confidence: Minimum confidence threshold for answers
+            model: Optional model name for question answering
+            **kwargs: Additional parameters passed to the QA engine
+        Returns:
+            Dict containing: answer, confidence, found, page_num, source_elements, etc.
+        """
+        # Delegate to ask_batch and return the first result
+        results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
+        return results[0] if results else {
+            "answer": None,
+            "confidence": 0.0,
+            "found": False,
+            "page_num": None,
+            "source_elements": [],
+        }
+    def ask_batch(
+        self,
+        questions: List[str],
+        mode: str = "extractive",
+        pages: Union[int, List[int], range] = None,
+        min_confidence: float = 0.1,
+        model: str = None,
+        **kwargs,
+    ) -> List[Dict[str, Any]]:
+        """
+        Ask multiple questions about the document content using batch processing.
+        This method processes multiple questions efficiently in a single batch,
+        avoiding the multiprocessing resource accumulation that can occur with
+        sequential individual question calls.
         Args:
-            question: Question to ask about the document
+            questions: List of question strings to ask about the document
             mode: "extractive" to extract answer from document, "generative" to generate
             pages: Specific pages to query (default: all pages)
             min_confidence: Minimum confidence threshold for answers
@@ -1024,45 +1336,147 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
             **kwargs: Additional parameters passed to the QA engine
         Returns:
-            A dictionary containing the answer, confidence, and other metadata
-            A dictionary containing the answer, confidence, and other metadata
+            List of Dicts, each containing: answer, confidence, found, page_num, source_elements, etc.
         """
         from natural_pdf.qa import get_qa_engine
+        if not questions:
+            return []
+        if not isinstance(questions, list) or not all(isinstance(q, str) for q in questions):
+            raise TypeError("'questions' must be a list of strings")
         qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
+        # Resolve target pages
         if pages is None:
-            target_pages = list(range(len(self.pages)))
+            target_pages = self.pages
         elif isinstance(pages, int):
-            target_pages = [pages]
+            if 0 <= pages < len(self.pages):
+                target_pages = [self.pages[pages]]
+            else:
+                raise IndexError(f"Page index {pages} out of range (0-{len(self.pages)-1})")
         elif isinstance(pages, (list, range)):
-            target_pages = pages
+            target_pages = []
+            for page_idx in pages:
+                if 0 <= page_idx < len(self.pages):
+                    target_pages.append(self.pages[page_idx])
+                else:
+                    logger.warning(f"Page index {page_idx} out of range, skipping")
         else:
             raise ValueError(f"Invalid pages parameter: {pages}")
-        results = []
-        for page_idx in target_pages:
-            if 0 <= page_idx < len(self.pages):
-                page = self.pages[page_idx]
-                page_result = qa_engine.ask_pdf_page(
-                    page=page, question=question, min_confidence=min_confidence, **kwargs
-                )
+        if not target_pages:
+            logger.warning("No valid pages found for QA processing.")
+            return [
+                {
+                    "answer": None,
+                    "confidence": 0.0,
+                    "found": False,
+                    "page_num": None,
+                    "source_elements": [],
+                }
+                for _ in questions
+            ]
-                if page_result and page_result.get("found", False):
-                    results.append(page_result)
+        logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
-        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+        # Collect all page images and metadata for batch processing
+        page_images = []
+        page_word_boxes = []
+        page_metadata = []
-        if results:
-            return results[0]
-        else:
-            return {
-                "answer": None,
-                "confidence": 0.0,
-                "found": False,
-                "page_num": None,
-                "source_elements": [],
-            }
+        for page in target_pages:
+            # Get page image
+            try:
+                page_image = page.to_image(resolution=150, include_highlights=False)
+                if page_image is None:
+                    logger.warning(f"Failed to render image for page {page.number}, skipping")
+                    continue
+                # Get text elements for word boxes
+                elements = page.find_all("text")
+                if not elements:
+                    logger.warning(f"No text elements found on page {page.number}")
+                    word_boxes = []
+                else:
+                    word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
+                page_images.append(page_image)
+                page_word_boxes.append(word_boxes)
+                page_metadata.append({
+                    "page_number": page.number,
+                    "page_object": page
+                })
+            except Exception as e:
+                logger.warning(f"Error processing page {page.number}: {e}")
+                continue
+        if not page_images:
+            logger.warning("No page images could be processed for QA.")
+            return [
+                {
+                    "answer": None,
+                    "confidence": 0.0,
+                    "found": False,
+                    "page_num": None,
+                    "source_elements": [],
+                }
+                for _ in questions
+            ]
+        # Process all questions against all pages in batch
+        all_results = []
+        for question_text in questions:
+            question_results = []
+            # Ask this question against each page (but in batch per page)
+            for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
+                try:
+                    # Use the DocumentQA batch interface
+                    page_result = qa_engine.ask(
+                        image=page_image,
+                        question=question_text,
+                        word_boxes=word_boxes,
+                        min_confidence=min_confidence,
+                        **kwargs
+                    )
+                    if page_result and page_result.found:
+                        # Add page metadata to result
+                        page_result_dict = {
+                            "answer": page_result.answer,
+                            "confidence": page_result.confidence,
+                            "found": page_result.found,
+                            "page_num": page_meta["page_number"],
+                            "source_elements": getattr(page_result, 'source_elements', []),
+                            "start": getattr(page_result, 'start', -1),
+                            "end": getattr(page_result, 'end', -1),
+                        }
+                        question_results.append(page_result_dict)
+                except Exception as e:
+                    logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
+                    continue
+            # Sort results by confidence and take the best one for this question
+            question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+            if question_results:
+                all_results.append(question_results[0])
+            else:
+                # No results found for this question
+                all_results.append({
+                    "answer": None,
+                    "confidence": 0.0,
+                    "found": False,
+                    "page_num": None,
+                    "source_elements": [],
+                })
+        return all_results
     def search_within_index(
         self,
@@ -1463,6 +1877,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
                 reading_order=self._reading_order,
                 font_attrs=self._font_attrs,
                 keep_spaces=self._config.get("keep_spaces", True),
+                text_layer=self._text_layer,
             )
             return new_pdf
         except Exception as e:
@@ -1506,7 +1921,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         if not manager or not manager.is_available():
             from natural_pdf.classification.manager import is_classification_available
             if not is_classification_available():
                 raise ImportError(
                     "Classification dependencies missing. "

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl