PyPI - natural-pdf - Versions diffs - 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl - Mend

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

examples/direct_qa_example.py +17 -111
examples/docling_comprehensive_test.py +325 -0
examples/docling_example.py +192 -0
examples/docling_hierarchy_example.py +230 -0
examples/docling_text_sources.py +241 -0
examples/improved_qa_example.py +66 -0
examples/url_pdf_example.py +45 -0
natural_pdf/analyzers/document_layout.py +276 -0
natural_pdf/core/page.py +72 -21
natural_pdf/core/pdf.py +102 -71
natural_pdf/elements/region.py +174 -19
natural_pdf/qa/document_qa.py +29 -38
natural_pdf/selectors/parser.py +6 -2
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +25 -3
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +18 -12
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
{natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0

examples/docling_hierarchy_example.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""
+Example script demonstrating hierarchical document navigation with Docling.
+This script shows how to use Docling's hierarchical document structure to:
+1. Navigate parent-child relationships
+2. Extract structured content from nested document elements
+3. Visualize the document hierarchy
+Usage:
+    python examples/docling_hierarchy_example.py [pdf_path]
+Dependencies:
+    - docling (imported below as docling.document_converter; install with `pip install docling`)
+    - torch
+    - transformers
+    - docling_core
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+# Import the library
+from natural_pdf import PDF, configure_logging
+# Configure logging
+configure_logging(level=logging.INFO)
+logger = logging.getLogger("docling_hierarchy")
+logger.setLevel(logging.INFO)
+# Pick the input PDF: an explicit command-line argument wins; otherwise use
+# the sample document under the repository's pdfs/ directory.
+if len(sys.argv) <= 1:
+    # No argument given: locate <repo_root>/pdfs relative to this script
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(script_dir)
+    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
+else:
+    pdf_path = sys.argv[1]
+# Check if required packages are installed.
+# DocumentConverter is not used further in this script; the import only
+# verifies that the docling package is available before any work starts.
+try:
+    from docling.document_converter import DocumentConverter
+except ImportError:
+    print("Missing required packages. Please install:")
+    print("pip install docling")
+    sys.exit(1)
+# Create output directory (<this script's directory>/../output);
+# exist_ok=True keeps reruns from failing when it already exists
+output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
+os.makedirs(output_dir, exist_ok=True)
+# Load the PDF; only its first page is analyzed below
+print(f"Loading PDF: {pdf_path}")
+pdf = PDF(pdf_path)
+page = pdf.pages[0]
+# Run Docling analysis
+print("Running Docling analysis...")
+page.analyze_layout(
+    model="docling",
+    confidence=0.2,  # This parameter isn't used by Docling but kept for API consistency
+    model_params={
+        "verbose": True
+        # Any other parameters would be passed directly to DocumentConverter
+    }
+)
+# Verify Docling document is created; everything below works on the Docling
+# results, so stop with a non-zero exit status if the attribute is missing
+if not hasattr(page, 'docling_document'):
+    print("Error: Docling document not created")
+    sys.exit(1)
+# Get all Docling regions (selector: regions whose model attribute is "docling")
+docling_regions = page.find_all('region[model=docling]')
+print(f"Found {len(docling_regions)} Docling regions")
+# Top-level elements are the Docling regions without a parent region
+top_level = [region for region in docling_regions if not region.parent_region]
+print(f"Document has {len(top_level)} top-level elements")
+# Preview the hierarchy: first 5 top-level elements, up to 3 children each
+print("\n--- Top-Level Hierarchy ---")
+for position, elem in enumerate(top_level[:5], start=1):
+    print(f"Element {position}: {elem.region_type}")
+    # Elements without a child_regions attribute, or with none set, are leaves
+    elem_children = getattr(elem, 'child_regions', None)
+    if not elem_children:
+        continue
+    print(f"  - Children: {len(elem_children)}")
+    for child_position, child in enumerate(elem_children[:3], start=1):
+        print(f"    Child {child_position}: {child.region_type}")
+        # Report grandchildren as a count only
+        grandchildren = getattr(child, 'child_regions', None)
+        if grandchildren:
+            print(f"      - Grandchildren: {len(grandchildren)}")
+    # Summarize the children that were not listed
+    hidden_children = len(elem_children) - 3
+    if hidden_children > 0:
+        print(f"    ... and {hidden_children} more children")
+# Look up section headers directly by their region type
+section_headers = page.find_all('section-header')
+print(f"\nFound {len(section_headers)} section headers")
+# With at least one section header available, walk its hierarchy
+if section_headers:
+    # The first section header is the root of the demonstration
+    header = section_headers[0]
+    print(f"\n--- Analyzing Section: {header.extract_text()[:50]}... ---")
+    # Immediate children, unfiltered
+    print(f"Direct children: {len(header.get_children())}")
+    # Immediate children restricted to the 'text' type
+    print(f"Direct text children: {len(header.get_children('text'))}")
+    # Every descendant, at any depth
+    print(f"All descendants: {len(header.get_descendants())}")
+    # Descendants restricted to the 'text' type
+    print(f"All text descendants: {len(header.get_descendants('text'))}")
+    # The same kind of lookup through find_all with recursive=True
+    print(f"Text elements found recursively: {len(header.find_all('text', recursive=True))}")
+    # Text of the whole section, shown as a 100-character preview
+    section_text = header.extract_text()
+    print(f"Full section text ({len(section_text)} chars): {section_text[:100]}...")
+    # Print an indented outline of the section rooted at the chosen header
+    print("\n--- Section Outline ---")
+    def print_outline(element, level=0):
+        """Print one outline line for `element`, then recurse into its children.
+
+        Each nesting level adds two spaces of indentation; text longer than
+        50 characters is cut to its first 47 characters plus "...".
+        """
+        full_text = element.extract_text()
+        shown = full_text if len(full_text) <= 50 else full_text[:47] + "..."
+        padding = "  " * level
+        print(f"{padding}- {element.region_type}: {shown}")
+        # Elements that expose no get_children() are treated as leaves
+        if not hasattr(element, 'get_children'):
+            return
+        for kid in element.get_children():
+            print_outline(kid, level + 1)
+    print_outline(header)
+    # Render the section hierarchy as colored highlights on the page
+    print("\nVisualizing section hierarchy...")
+    page.clear_highlights()
+    # One color tuple per depth; levels beyond the last entry reuse it
+    colors = [
+        (1, 0, 0, 0.3),  # Red - Top level
+        (0, 0.7, 0, 0.3),  # Green - Level 1
+        (0, 0, 1, 0.3),  # Blue - Level 2
+        (1, 0.7, 0, 0.3),  # Orange - Level 3
+        (0.7, 0, 1, 0.3),  # Purple - Level 4
+    ]
+    def highlight_hierarchy(element, level=0):
+        """Highlight `element` in the color of its depth, then recurse into children."""
+        # Clamp the depth so deep nesting never indexes past the palette
+        palette_index = min(level, len(colors) - 1)
+        element.highlight(
+            color=colors[palette_index],
+            label=f"Level {level}: {element.region_type}",
+            include_attrs=['region_type'],
+        )
+        if not hasattr(element, 'get_children'):
+            return
+        for kid in element.get_children():
+            highlight_hierarchy(kid, level + 1)
+    highlight_hierarchy(header)
+    # Write the highlighted page, with labels, into the output directory
+    hierarchy_path = os.path.join(output_dir, "docling_hierarchy.png")
+    page.save_image(hierarchy_path, labels=True)
+    print(f"Saved hierarchy visualization to {hierarchy_path}")
+    # BONUS: turn the region hierarchy into plain nested dictionaries
+    print("\n--- Structured Content Extraction ---")
+    def extract_structured_content(element):
+        """Return `element` and its descendants as a nested dictionary.
+
+        The result has the keys "type" (the element's region_type), "text"
+        (its extracted text) and "children" (a list of dictionaries of the
+        same shape, empty when the element exposes no get_children()).
+        """
+        # Read the element's own fields before descending, as the original did
+        region_kind = element.region_type
+        region_text = element.extract_text()
+        if hasattr(element, 'get_children'):
+            nested = [extract_structured_content(kid) for kid in element.get_children()]
+        else:
+            nested = []
+        return {"type": region_kind, "text": region_text, "children": nested}
+    structured_content = extract_structured_content(header)
+    # Show a shortened view of the nested dictionary
+    def print_structure(structure, level=0):
+        """Print a node of the structured content and at most two of its children.
+
+        Text longer than 50 characters is cut to 47 characters plus "...";
+        children beyond the first two are summarized as a count.
+        """
+        padding = "  " * level
+        shown = structure["text"]
+        if len(shown) > 50:
+            shown = shown[:47] + "..."
+        print(f"{padding}{structure['type']}: {shown}")
+        kids = structure["children"]
+        if not kids:
+            return
+        print(f"{padding}Children: {len(kids)}")
+        for kid in kids[:2]:  # Only the first 2 children are expanded
+            print_structure(kid, level + 1)
+        if len(kids) > 2:
+            print(f"{padding}... and {len(kids) - 2} more children")
+    print_structure(structured_content)
+    # Advanced: persist the nested dictionary as indented JSON
+    import json
+    structured_path = os.path.join(output_dir, "docling_structured_content.json")
+    with open(structured_path, 'w') as json_file:
+        json.dump(structured_content, json_file, indent=2)
+    print(f"Saved structured content to {structured_path}")
+else:
+    # Reached when the page yielded no 'section-header' elements
+    print("No section headers found for hierarchy demonstration")
+print("\nHierarchy analysis complete!")

examples/docling_text_sources.py ADDED Viewed

@@ -0,0 +1,241 @@
+"""
+Example script demonstrating how Docling handles text from different sources.
+This script shows how Docling integrates with natural-pdf's text extraction system,
+handling both native PDF text and OCR text intelligently.
+Usage:
+    python examples/docling_text_sources.py [pdf_path]
+Dependencies:
+    - docling (imported below as docling.document_converter; install with `pip install docling`)
+    - torch
+    - transformers
+    - docling_core
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+# Import the library
+from natural_pdf import PDF, configure_logging
+# Configure detailed logging to see text source decision messages
+configure_logging(level=logging.INFO)
+logger = logging.getLogger("natural_pdf")
+logger.setLevel(logging.INFO)
+# Resolve the PDFs used by the two parts of this script.
+# The sample paths are always defined, so the later parts never hit an
+# unbound name when a PDF is supplied on the command line.
+script_dir = os.path.dirname(os.path.abspath(__file__))
+repo_root = os.path.dirname(script_dir)
+# Two different PDFs are used for testing:
+# 1. One with native text (analyzed in Part 1)
+native_pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
+# 2. One that needs OCR (analyzed in Part 2 when the file exists)
+ocr_pdf_path = os.path.join(repo_root, "pdfs", "needs-ocr.pdf")
+if len(sys.argv) > 1:
+    # A user-supplied PDF replaces the native-text sample analyzed in Part 1
+    native_pdf_path = sys.argv[1]
+# pdf_path mirrors the document analyzed in Part 1
+pdf_path = native_pdf_path
+# Check if required packages are installed.
+# DocumentConverter is not used further in this script; the import only
+# verifies that the docling package is available before any work starts.
+try:
+    from docling.document_converter import DocumentConverter
+except ImportError:
+    print("Missing required packages. Please install:")
+    print("pip install docling")
+    sys.exit(1)
+# Create output directory (<this script's directory>/../output);
+# exist_ok=True keeps reruns from failing when it already exists
+output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output")
+os.makedirs(output_dir, exist_ok=True)
+# Create a custom handler to also print log messages to console
+# NOTE(review): configure_logging() was already called above; if it installs
+# its own console handler, each message may be printed twice — confirm.
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+# Format: "<logger name> - <level> - <message>"
+formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+# Part 1: analyze a PDF that carries native (embedded) text
+print("\n=== PART 1: PDF WITH NATIVE TEXT ===")
+print(f"Loading PDF with native text: {native_pdf_path}")
+native_pdf = PDF(native_pdf_path)
+native_page = native_pdf.pages[0]
+# Baseline: how many words the page exposes before any layout analysis
+print(f"PDF has {len(native_page.words)} native text elements")
+# Run the Docling layout model on the page
+print("\nRunning Docling analysis...")
+native_page.analyze_layout(model="docling", confidence=0.2)
+# Regions produced by the Docling model
+docling_regions = native_page.find_all('region[model=docling]')
+print(f"Found {len(docling_regions)} Docling regions")
+# Run all three selector queries first, then report their sizes
+native_text = native_page.find_all('text[source=native]')
+ocr_text = native_page.find_all('text[source=ocr]')
+docling_text_regions = native_page.find_all('region[model=docling][type=text]')
+source_counts = (
+    ("Native PDF text", native_text),
+    ("OCR text", ocr_text),
+    ("Docling text regions", docling_text_regions),
+)
+print("\nText elements by source:")
+for source_label, found in source_counts:
+    print(f"  {source_label}: {len(found)} elements")
+# Check text sources for the first 5 Docling regions
+print("\nChecking text sources for regions:")
+for i, region in enumerate(docling_regions[:5]):
+    # True when the region carries its own non-empty text_content.
+    # bool() keeps the report a yes/no flag instead of echoing the raw
+    # attribute value (the full text) into the output.
+    has_text_content = bool(getattr(region, 'text_content', None))
+    # True when text elements have been associated with the region
+    has_associated_text = bool(getattr(region, 'associated_text_elements', None))
+    # Extract text using the enhanced method which logs source decision
+    text = region.extract_text()
+    print(f"\nRegion {i+1} ({region.region_type}):")
+    print(f"  Has direct text content: {has_text_content}")
+    print(f"  Has associated text elements: {has_associated_text}")
+    print(f"  Text length: {len(text)} characters")
+    print(f"  Text preview: '{text[:50]}...'")
+# Draw the different text sources onto the page
+print("\nVisualizing text sources...")
+native_page.clear_highlights()
+# Layer 1: the native text elements themselves
+native_text.highlight(
+    color=(0, 0, 0.7, 0.3),
+    label="Native PDF Text Elements",
+    include_attrs=['source'],
+)
+# Layer 2: regions that have associated text elements
+native_text_regions = [
+    candidate for candidate in docling_regions
+    if getattr(candidate, 'associated_text_elements', None)
+]
+if native_text_regions:
+    from natural_pdf.elements.collections import ElementCollection
+    ElementCollection(native_text_regions).highlight(
+        color=(0, 0.7, 0, 0.3),
+        label="Regions using Native Text",
+        include_attrs=['region_type'],
+    )
+# Layer 3: regions with their own text_content but no associated elements
+docling_text_regions = [
+    candidate for candidate in docling_regions
+    if getattr(candidate, 'text_content', None)
+    and not getattr(candidate, 'associated_text_elements', None)
+]
+if docling_text_regions:
+    from natural_pdf.elements.collections import ElementCollection
+    ElementCollection(docling_text_regions).highlight(
+        color=(0.7, 0, 0, 0.3),
+        label="Regions using Docling Text Only",
+        include_attrs=['region_type'],
+    )
+# Write the labeled page image into the output directory
+native_output_path = os.path.join(output_dir, "docling_native_text_sources.png")
+native_page.save_image(native_output_path, labels=True)
+print(f"Saved visualization to {native_output_path}")
+# Part 2: OCR PDF Example (if available)
+print("\n=== PART 2: PDF REQUIRING OCR ===")
+# Check if OCR PDF exists; Part 2 is optional, so a missing sample ends the
+# script with exit status 0 rather than an error
+if not os.path.exists(ocr_pdf_path):
+    print(f"OCR test PDF not found at {ocr_pdf_path}")
+    print("Skipping OCR text source test")
+    sys.exit(0)
+# Load the PDF requiring OCR; only its first page is analyzed
+print(f"Loading PDF requiring OCR: {ocr_pdf_path}")
+ocr_pdf = PDF(ocr_pdf_path, ocr="auto")  # Enable auto OCR
+ocr_page = ocr_pdf.pages[0]
+# First extract text with standard OCR, before Docling runs, so the two
+# text sources can be compared below
+print("\nExtracting text with standard OCR first...")
+ocr_elements = ocr_page.apply_ocr()
+print(f"Standard OCR found {len(ocr_elements)} text elements")
+# Now run Docling analysis
+print("\nRunning Docling analysis with integrated OCR...")
+ocr_page.analyze_layout(
+    model="docling",
+    confidence=0.2
+)
+# Find Docling regions (selector: regions whose model attribute is "docling")
+ocr_docling_regions = ocr_page.find_all('region[model=docling]')
+print(f"Found {len(ocr_docling_regions)} Docling regions")
+# Check text sources for the first 5 Docling regions of the OCR page
+print("\nChecking text sources for regions:")
+for i, region in enumerate(ocr_docling_regions[:5]):
+    # True when the region carries its own non-empty text_content.
+    # bool() keeps the report a yes/no flag instead of echoing the raw
+    # attribute value (the full text) into the output.
+    has_text_content = bool(getattr(region, 'text_content', None))
+    # True when text elements (from standard OCR) are associated with the region
+    has_associated_text = bool(getattr(region, 'associated_text_elements', None))
+    # Extract text using the enhanced method which logs source decision
+    text = region.extract_text()
+    print(f"\nRegion {i+1} ({region.region_type}):")
+    print(f"  Has Docling text content: {has_text_content}")
+    print(f"  Has associated OCR elements: {has_associated_text}")
+    print(f"  Text length: {len(text)} characters")
+    print(f"  Text preview: '{text[:50]}...'")
+# Draw the OCR-related text sources onto the page
+print("\nVisualizing OCR text sources...")
+ocr_page.clear_highlights()
+# Layer 1: text elements whose source is standard OCR
+standard_ocr_text = ocr_page.find_all('text[source=ocr]')
+standard_ocr_text.highlight(
+    color=(0, 0, 0.7, 0.3),
+    label="Standard OCR Text",
+    include_attrs=['confidence'],
+)
+# Layer 2: Docling regions that carry their own text_content
+docling_ocr_regions = [
+    candidate for candidate in ocr_docling_regions
+    if getattr(candidate, 'text_content', None)
+]
+if docling_ocr_regions:
+    from natural_pdf.elements.collections import ElementCollection
+    ElementCollection(docling_ocr_regions).highlight(
+        color=(0.7, 0, 0, 0.3),
+        label="Docling OCR Text",
+        include_attrs=['region_type'],
+    )
+# Write the labeled page image into the output directory
+ocr_output_path = os.path.join(output_dir, "docling_ocr_text_sources.png")
+ocr_page.save_image(ocr_output_path, labels=True)
+print(f"Saved visualization to {ocr_output_path}")
+print("\nText source analysis complete!")

examples/improved_qa_example.py ADDED Viewed

@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating the simplified document QA interface.
+"""
+import sys
+import os
+import argparse
+# Add the parent directory to the path so we can import the natural_pdf package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+def main():
+    """Ask a question, then a fixed follow-up, about a PDF and print the answers.
+
+    Command-line arguments:
+        pdf: path to the PDF (default "pdfs/2019 Statistics.pdf").
+        --question / -q: question to ask about the document.
+        --full / -f: print every entry of the result dictionary instead of
+            the short answer/confidence/page summary.
+
+    Exits with status 1 when the PDF path does not exist, or when loading or
+    querying the document raises an exception.
+    """
+    parser = argparse.ArgumentParser(description="Example of using the improved document QA interface")
+    parser.add_argument('pdf', nargs='?',
+                      default="pdfs/2019 Statistics.pdf",
+                      help="Path to a PDF document")
+    parser.add_argument('--question', '-q',
+                      default="What information does this document contain?",
+                      help="Question to ask about the document")
+    parser.add_argument('--full', '-f', action='store_true',
+                      help="Show the full result dictionary with confidence scores")
+    args = parser.parse_args()
+    # Fail fast on a bad path before anything is loaded
+    if not os.path.exists(args.pdf):
+        print(f"Error: PDF file '{args.pdf}' not found")
+        sys.exit(1)
+    print(f"Loading PDF: {args.pdf}")
+    print(f"Question: {args.question}")
+    try:
+        # Open the PDF
+        with PDF(args.pdf) as pdf:
+            # Get result dictionary
+            result = pdf.ask(args.question)
+            # Display result
+            if args.full:
+                print("\nFull result:")
+                for key, value in result.items():
+                    # Float confidences are rounded for readability
+                    if key == 'confidence' and isinstance(value, float):
+                        print(f"  {key}: {value:.2f}")
+                    else:
+                        print(f"  {key}: {value}")
+            else:
+                print("\nResult:")
+                print(f"  Answer: {result['answer']}")
+                # These two keys are only printed when present in the result
+                if 'confidence' in result:
+                    print(f"  Confidence: {result['confidence']:.2f}")
+                if 'page_num' in result:
+                    print(f"  Page: {result['page_num']}")
+            # Ask another related question
+            print("\nAsking follow-up question:")
+            follow_up = "What year does this data cover?"
+            print(f"Question: {follow_up}")
+            follow_result = pdf.ask(follow_up)
+            print(f"Answer: {follow_result['answer']}")
+    except Exception as e:
+        # Top-level boundary of the example: report the failure and signal it
+        # through the exit status, matching the missing-file path above
+        print(f"Error: {e}")
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

examples/url_pdf_example.py ADDED Viewed

@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+"""
+Example demonstrating loading a PDF from a URL.
+"""
+import sys
+import os
+import argparse
+# Add the parent directory to the path so we can import the natural_pdf package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+def main():
+    """Load a PDF from a URL and print a short summary of its first page.
+
+    The URL is an optional positional command-line argument (default: an
+    arXiv paper). Prints the page count, up to three text elements matching
+    'text[size>=12]' as title candidates, and the first 200 characters of
+    the first page's text.
+    """
+    arg_parser = argparse.ArgumentParser(description="Example of loading a PDF from a URL")
+    arg_parser.add_argument(
+        'url',
+        nargs='?',
+        default="https://arxiv.org/pdf/2103.14749.pdf",
+        help="URL to a PDF document (default: an arXiv paper)",
+    )
+    options = arg_parser.parse_args()
+    print(f"Loading PDF from URL: {options.url}")
+    # The context manager wraps all work on the document
+    with PDF(options.url) as pdf:
+        page_count = len(pdf)
+        print(f"Document loaded successfully: {page_count} pages")
+        # Nothing more to show for a document without pages
+        if page_count <= 0:
+            return
+        first_page = pdf.pages[0]
+        # Title candidates: text on the first page with size >= 12
+        candidates = first_page.find_all('text[size>=12]')
+        if candidates:
+            print("\nTitle candidates:")
+            for rank, candidate in enumerate(candidates[:3], 1):  # Top 3 only
+                print(f"{rank}. {candidate.text}")
+        # Preview: at most 200 characters, with "..." when the text was cut
+        page_text = first_page.extract_text()
+        if len(page_text) > 200:
+            preview = page_text[:200] + "..."
+        else:
+            preview = page_text
+        print(f"\nText preview:\n{preview}")
+if __name__ == "__main__":
+    main()

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl