PyPI - natural-pdf - Versions diffs - 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl - Mend

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

examples/direct_qa_example.py +17 -111
examples/docling_comprehensive_test.py +325 -0
examples/docling_example.py +192 -0
examples/docling_hierarchy_example.py +230 -0
examples/docling_text_sources.py +241 -0
examples/improved_qa_example.py +66 -0
examples/url_pdf_example.py +45 -0
natural_pdf/analyzers/document_layout.py +276 -0
natural_pdf/core/page.py +72 -21
natural_pdf/core/pdf.py +102 -71
natural_pdf/elements/region.py +174 -19
natural_pdf/qa/document_qa.py +29 -38
natural_pdf/selectors/parser.py +6 -2
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +25 -3
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +18 -12
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0

natural_pdf/core/pdf.py CHANGED Viewed

@@ -1,5 +1,9 @@
 import pdfplumber
 import logging
+import tempfile
+import os
+import re
+import urllib.request
 from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
 from natural_pdf.core.page import Page
@@ -28,7 +32,7 @@ class PDF:
     with improved selection, navigation, and extraction capabilities.
     """
-    def __init__(self, path: str, reading_order: bool = True,
+    def __init__(self, path_or_url: str, reading_order: bool = True,
                  ocr: Optional[Union[bool, str, List, Dict]] = None,
                  ocr_engine: Optional[Union[str, Any]] = None,
                  font_attrs: Optional[List[str]] = None,
@@ -37,7 +41,7 @@ class PDF:
         Initialize the enhanced PDF object.
         Args:
-            path: Path to the PDF file
+            path_or_url: Path to the PDF file or a URL to a PDF
             reading_order: Whether to use natural reading order
             ocr: OCR configuration:
                  - None or False: OCR disabled
@@ -58,6 +62,40 @@ class PDF:
                        True: Spaces are part of words, better for multi-word searching
                        False: Break text at spaces, each word is separate (legacy behavior)
         """
+        # Check if the input is a URL
+        is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
+        # Initialize path-related attributes
+        self._original_path = path_or_url
+        self._temp_file = None
+        if is_url:
+            logger.info(f"Downloading PDF from URL: {path_or_url}")
+            try:
+                # Create a temporary file to store the downloaded PDF
+                self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+                # Download the PDF
+                with urllib.request.urlopen(path_or_url) as response:
+                    self._temp_file.write(response.read())
+                    self._temp_file.flush()
+                    self._temp_file.close()
+                # Use the temporary file path
+                path = self._temp_file.name
+                logger.info(f"PDF downloaded to temporary file: {path}")
+            except Exception as e:
+                if self._temp_file and hasattr(self._temp_file, 'name'):
+                    try:
+                        os.unlink(self._temp_file.name)
+                    except:
+                        pass
+                logger.error(f"Failed to download PDF from URL: {e}")
+                raise ValueError(f"Failed to download PDF from URL: {e}")
+        else:
+            # Use the provided path directly
+            path = path_or_url
         logger.info(f"Initializing PDF from {path}")
         logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
@@ -72,8 +110,13 @@ class PDF:
         if HAS_OCR_ENGINES:
             # Handle OCR engine selection
             if ocr_engine is None:
-                # Use default engine (EasyOCR)
-                self._ocr_engine = EasyOCREngine()
+                # Use default engine (PaddleOCR)
+                try:
+                    self._ocr_engine = PaddleOCREngine()
+                except (ImportError, ValueError) as e:
+                    logger.warning(f"PaddleOCR engine could not be loaded: {e}")
+                    logger.warning("Falling back to EasyOCR engine.")
+                    self._ocr_engine = EasyOCREngine()
             elif isinstance(ocr_engine, str):
                 # String-based engine selection
                 try:
@@ -481,74 +524,51 @@ class PDF:
             **kwargs: Additional parameters passed to the QA engine
         Returns:
-            Dictionary with answer and confidence
+            A dictionary containing the answer, confidence, and other metadata.
+            Result will have an 'answer' key containing the answer text.
         """
-        try:
-            from natural_pdf.qa import get_qa_engine
-            # Initialize or get QA engine
-            qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
-            # Determine which pages to query
-            if pages is None:
-                # Query all pages by default, prioritizing first few pages
-                target_pages = list(range(min(10, len(self.pages))))
-            elif isinstance(pages, int):
-                # Single page
-                target_pages = [pages]
-            elif isinstance(pages, (list, range)):
-                # List or range of pages
-                target_pages = pages
-            else:
-                raise ValueError(f"Invalid pages parameter: {pages}")
-            # Actually query each page and gather results
-            results = []
-            for page_idx in target_pages:
-                if 0 <= page_idx < len(self.pages):
-                    page = self.pages[page_idx]
-                    page_result = qa_engine.ask_pdf_page(
-                        page=page,
-                        question=question,
-                        min_confidence=min_confidence,
-                        **kwargs
-                    )
-                    # Add to results if it found an answer
-                    if page_result.get("found", False):
-                        results.append(page_result)
-            # Sort results by confidence
-            results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
-            # Return the best result, or an empty result if none found
-            if results:
-                return results[0]
-            else:
-                return {
-                    "answer": "",
-                    "confidence": 0.0,
-                    "found": False,
-                    "message": "No answer found in document"
-                }
+        from natural_pdf.qa import get_qa_engine
+        # Initialize or get QA engine
+        qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
+        # Determine which pages to query
+        if pages is None:
+            target_pages = list(range(len(self.pages)))
+        elif isinstance(pages, int):
+            # Single page
+            target_pages = [pages]
+        elif isinstance(pages, (list, range)):
+            # List or range of pages
+            target_pages = pages
+        else:
+            raise ValueError(f"Invalid pages parameter: {pages}")
+        # Actually query each page and gather results
+        results = []
+        for page_idx in target_pages:
+            if 0 <= page_idx < len(self.pages):
+                page = self.pages[page_idx]
+                page_result = qa_engine.ask_pdf_page(
+                    page=page,
+                    question=question,
+                    min_confidence=min_confidence,
+                    **kwargs
+                )
+                # Add to results if it found an answer
+                if page_result.get("found", False):
+                    results.append(page_result)
+        # Sort results by confidence
+        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+        # Return the best result, or a default result if none found
+        if results:
+            return results[0]
+        else:
+            return None
-        except ImportError as e:
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
-        except Exception as e:
-            logger.error(f"Error in document QA: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": str(e),
-                "found": False
-            }
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
         return len(self.pages)
@@ -558,10 +578,21 @@ class PDF:
         return self.pages[key]
     def close(self):
-        """Close the underlying PDF file."""
+        """Close the underlying PDF file and clean up any temporary files."""
         if hasattr(self, '_pdf') and self._pdf is not None:
             self._pdf.close()
             self._pdf = None
+        # Clean up temporary file if it exists
+        if hasattr(self, '_temp_file') and self._temp_file is not None:
+            try:
+                if os.path.exists(self._temp_file.name):
+                    os.unlink(self._temp_file.name)
+                    logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary PDF file: {e}")
+            finally:
+                self._temp_file = None
     def __enter__(self):
         """Context manager entry."""

natural_pdf/elements/region.py CHANGED Viewed

@@ -18,7 +18,7 @@ class Region:
     Represents a rectangular region on a page.
     """
-    def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
+    def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
         """
         Initialize a region.
@@ -26,6 +26,7 @@ class Region:
             page: Parent page
             bbox: Bounding box as (x0, top, x1, bottom)
             polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
+            parent: Optional parent region (for hierarchical document structure)
         """
         self._page = page
         self._bbox = bbox
@@ -48,6 +49,12 @@ class Region:
         # Region management attributes
         self.name = None
         self.source = None  # Will be set by creation methods
+        # Hierarchy support for nested document structure
+        self.parent_region = parent
+        self.child_regions = []
+        self.text_content = None  # Direct text content (e.g., from Docling)
+        self.associated_text_elements = []  # Native text elements that overlap with this region
     @property
     def page(self) -> 'Page':
@@ -387,6 +394,11 @@ class Region:
         """
         Extract text from this region using pdfplumber's native functionality.
+        For regions created by Docling, this will first try to use:
+        1. Associated text elements from the PDF (if available)
+        2. Direct text content from Docling (if available)
+        3. Fall back to standard pdfplumber extraction
         Args:
             keep_blank_chars: Whether to keep blank characters (legacy parameter)
             apply_exclusions: Whether to apply exclusion regions
@@ -398,6 +410,28 @@ class Region:
         Returns:
             Extracted text as string
         """
+        import logging
+        logger = logging.getLogger("natural_pdf.elements.region")
+        # Check for Docling model or if we have direct text content
+        if self.model == 'docling' or hasattr(self, 'text_content'):
+            # First priority: check if we have associated native text elements
+            if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
+                source_count = len(self.associated_text_elements)
+                logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
+                # Sort elements in reading order
+                sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
+                # Extract and join their text
+                text_result = " ".join(elem.text for elem in sorted_elements)
+                return text_result
+            # Second priority: use direct text content from Docling
+            elif self.text_content:
+                logger.info(f"Region {self.region_type}: Using Docling OCR text content")
+                return self.text_content
+            logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
         # Handle preserve_whitespace parameter for consistency with Page.extract_text
         if preserve_whitespace is not None:
             keep_blank_chars = preserve_whitespace
@@ -1346,21 +1380,142 @@ class Region:
                 "source_elements": list of elements that contain the answer (if found)
             }
         """
-        try:
-            from natural_pdf.qa.document_qa import get_qa_engine
-            # Get or initialize QA engine with specified model
-            qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
-            # Ask the question using the QA engine
-            return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
-        except ImportError as e:
-            import logging
-            logger = logging.getLogger("natural_pdf.elements.region")
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
+        from natural_pdf.qa.document_qa import get_qa_engine
+        # Get or initialize QA engine with specified model
+        qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
+        # Ask the question using the QA engine
+    def add_child(self, child):
+        """
+        Add a child region to this region.
+        Used for hierarchical document structure when using models like Docling
+        that understand document hierarchy.
+        Args:
+            child: Region object to add as a child
+        Returns:
+            Self for method chaining
+        """
+        self.child_regions.append(child)
+        child.parent_region = self
+        return self
+    def get_children(self, selector=None):
+        """
+        Get immediate child regions, optionally filtered by selector.
+        Args:
+            selector: Optional selector to filter children
+        Returns:
+            List of child regions matching the selector
+        """
+        import logging
+        logger = logging.getLogger("natural_pdf.elements.region")
+        if selector is None:
+            return self.child_regions
+        # Use existing selector parser to filter
+        from natural_pdf.selectors.parser import match_elements_with_selector
+        matched = match_elements_with_selector(self.child_regions, selector)
+        logger.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
+        return matched
+    def get_descendants(self, selector=None):
+        """
+        Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
+        Args:
+            selector: Optional selector to filter descendants
+        Returns:
+            List of descendant regions matching the selector
+        """
+        import logging
+        logger = logging.getLogger("natural_pdf.elements.region")
+        all_descendants = []
+        # First add direct children
+        all_descendants.extend(self.child_regions)
+        # Then recursively add their descendants
+        for child in self.child_regions:
+            all_descendants.extend(child.get_descendants())
+        logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
+        # Filter by selector if provided
+        if selector is not None:
+            from natural_pdf.selectors.parser import match_elements_with_selector
+            matched = match_elements_with_selector(all_descendants, selector)
+            logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
+            return matched
+        return all_descendants
+    def find_all(self, selector, recursive=True, **kwargs):
+        """
+        Find all matching elements within this region, with optional recursion through child regions.
+        Args:
+            selector: The selector to find elements with
+            recursive: Whether to search recursively through child regions
+            **kwargs: Additional parameters to pass to the selector parser
+        Returns:
+            Collection of matching elements
+        """
+        # Get direct matches
+        direct_matches = self.page.find_all(selector, region=self, **kwargs)
+        if not recursive or not self.child_regions:
+            return direct_matches
+        # Get recursive matches from children
+        from natural_pdf.elements.collections import ElementCollection
+        all_matches = list(direct_matches)
+        for child in self.child_regions:
+            child_matches = child.find_all(selector, recursive=True, **kwargs)
+            for match in child_matches:
+                if match not in all_matches:
+                    all_matches.append(match)
+        return ElementCollection(all_matches)
+    def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
+        """
+        Ask a question about the region content using document QA.
+        This method uses a document question answering model to extract answers from the region content.
+        It leverages both textual content and layout information for better understanding.
+        Args:
+            question: The question to ask about the region content
+            min_confidence: Minimum confidence threshold for answers (0.0-1.0)
+            model: Optional model name to use for QA (if None, uses default model)
+            **kwargs: Additional parameters to pass to the QA engine
+        Returns:
+            Dictionary with answer details: {
+                "answer": extracted text,
+                "confidence": confidence score,
+                "found": whether an answer was found,
+                "page_num": page number,
+                "region": reference to this region,
+                "source_elements": list of elements that contain the answer (if found)
+            }
+        """
+        from natural_pdf.qa.document_qa import get_qa_engine
+        # Get or initialize QA engine with specified model
+        qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
+        # Ask the question using the QA engine
+        return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 from typing import List, Dict, Any, Optional, Union, Tuple
 import numpy as np
-from PIL import Image
+from PIL import Image, ImageDraw
 import os
 import tempfile
 import json
@@ -207,47 +207,38 @@ class DocumentQA:
                 logger.info(f"Visualization: {vis_path}")
         # Run the query through the pipeline
-        try:
-            logger.info(f"Running document QA pipeline with question: {question}")
-            result = self.pipe(query)[0]
-            logger.info(f"Raw result: {result}")
-            # Save the result if debugging
-            if debug:
-                result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
-                with open(result_path, 'w') as f:
-                    # Convert any non-serializable data
-                    serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
-                                        for k, v in result.items()}
-                    json.dump(serializable_result, f, indent=2)
-            # Check confidence against threshold
-            if result["score"] < min_confidence:
-                logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
-                return {
-                    "answer": "",
-                    "confidence": result["score"],
-                    "start": result.get("start", -1),
-                    "end": result.get("end", -1),
-                    "found": False
-                }
-            return {
-                "answer": result["answer"],
-                "confidence": result["score"],
-                "start": result.get("start", 0),
-                "end": result.get("end", 0),
-                "found": True
-            }
-        except Exception as e:
-            logger.error(f"Error in document QA: {e}")
+        logger.info(f"Running document QA pipeline with question: {question}")
+        result = self.pipe(query)[0]
+        logger.info(f"Raw result: {result}")
+        # Save the result if debugging
+        if debug:
+            result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
+            with open(result_path, 'w') as f:
+                # Convert any non-serializable data
+                serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
+                                    for k, v in result.items()}
+                json.dump(serializable_result, f, indent=2)
+        # Check confidence against threshold
+        if result["score"] < min_confidence:
+            logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
             return {
                 "answer": "",
-                "confidence": 0.0,
-                "error": str(e),
+                "confidence": result["score"],
+                "start": result.get("start", -1),
+                "end": result.get("end", -1),
                 "found": False
             }
+        return {
+            "answer": result["answer"],
+            "confidence": result["score"],
+            "start": result.get("start", 0),
+            "end": result.get("end", 0),
+            "found": True
+        }
     def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
         """

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -162,8 +162,12 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
                     return False
                 # If 'type' attribute specified, it will be checked in the attributes section
-            # Otherwise, require exact match
-            elif element.type != selector['type']:
+            # Check for Docling-specific types (section-header, etc.)
+            elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
+                # This is a direct match with a Docling region type
+                pass
+            # Otherwise, require exact match with the element's type attribute
+            elif not hasattr(element, 'type') or element.type != selector['type']:
                 return False
         # Check attributes

{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: natural-pdf
-Version: 25.3.16
+Version: 25.3.17.2
 Summary: A more intuitive interface for working with PDFs
 Home-page: https://github.com/jsoma/natural-pdf
 Author: Jonathan Soma
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
 Requires-Dist: Pillow>=8.0.0
 Requires-Dist: colour>=0.1.5
 Requires-Dist: numpy>=1.20.0
+Requires-Dist: urllib3>=1.26.0
 Requires-Dist: doclayout_yolo>=0.0.3
 Requires-Dist: torch>=2.0.0
 Requires-Dist: torchvision>=0.15.0
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
-[Complete documentation here](https://jsoma.github.io/natural-pdf)
+- [Complete documentation here](https://jsoma.github.io/natural-pdf)
+- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
 ## Features
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
 ```python
 from natural_pdf import PDF
-# Open a PDF
+# Open a local PDF
 pdf = PDF('document.pdf')
+# Or open a PDF from a URL
+pdf = PDF('https://example.com/document.pdf')
 # Get the first page
 page = pdf.pages[0]
@@ -263,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
 - `natural_pdf.analyzers` - Layout analysis operations
 - `natural_pdf.ocr` - OCR engine operations
+## Document QA
+Ask questions directly to your documents:
+```python
+# Ask questions about the document content
+result = pdf.ask("What was the company's revenue in 2022?")
+print(f"Answer: {result['answer']}")
+print(f"Confidence: {result['confidence']:.2f}")
+# Access more details in the result dictionary
+result = pdf.ask("Who is the CEO?")
+print(f"Answer: {result['answer']}")
+print(f"Found on page: {result['page_num']}")
+print(f"Source text: {result.get('source_text', 'N/A')}")
+```
 ## More details
 [Complete documentation here](https://jsoma.github.io/natural-pdf)

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl