PyPI - natural-pdf - Versions diffs - 25.3.16.2__tar.gz → 25.3.17.2__tar.gz - Mend

natural-pdf 25.3.16.2__tar.gz → 25.3.17.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

{natural_pdf-25.3.16.2/natural_pdf.egg-info → natural_pdf-25.3.17.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: natural-pdf
-Version: 25.3.16.2
+Version: 25.3.17.2
 Summary: A more intuitive interface for working with PDFs
 Home-page: https://github.com/jsoma/natural-pdf
 Author: Jonathan Soma
@@ -60,7 +60,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
 - [Complete documentation here](https://jsoma.github.io/natural-pdf)
-- [Live demo here](https://colab.research.google.com/github/jsoma/)
+- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
 ## Features
@@ -268,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
 - `natural_pdf.analyzers` - Layout analysis operations
 - `natural_pdf.ocr` - OCR engine operations
+## Document QA
+Ask questions directly to your documents:
+```python
+# Ask questions about the document content
+result = pdf.ask("What was the company's revenue in 2022?")
+print(f"Answer: {result['answer']}")
+print(f"Confidence: {result['confidence']:.2f}")
+# Access more details in the result dictionary
+result = pdf.ask("Who is the CEO?")
+print(f"Answer: {result['answer']}")
+print(f"Found on page: {result['page_num']}")
+print(f"Source text: {result.get('source_text', 'N/A')}")
+```
 ## More details
 [Complete documentation here](https://jsoma.github.io/natural-pdf)

{natural_pdf-25.3.16.2 → natural_pdf-25.3.17.2}/README.md RENAMED Viewed

@@ -5,7 +5,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
 - [Complete documentation here](https://jsoma.github.io/natural-pdf)
-- [Live demo here](https://colab.research.google.com/github/jsoma/)
+- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
 ## Features
@@ -213,6 +213,23 @@ Logs follow a hierarchical structure matching the library's module organization:
 - `natural_pdf.analyzers` - Layout analysis operations
 - `natural_pdf.ocr` - OCR engine operations
+## Document QA
+Ask questions directly to your documents:
+```python
+# Ask questions about the document content
+result = pdf.ask("What was the company's revenue in 2022?")
+print(f"Answer: {result['answer']}")
+print(f"Confidence: {result['confidence']:.2f}")
+# Access more details in the result dictionary
+result = pdf.ask("Who is the CEO?")
+print(f"Answer: {result['answer']}")
+print(f"Found on page: {result['page_num']}")
+print(f"Source text: {result.get('source_text', 'N/A')}")
+```
 ## More details
 [Complete documentation here](https://jsoma.github.io/natural-pdf)

{natural_pdf-25.3.16.2 → natural_pdf-25.3.17.2}/docs/index.md RENAMED Viewed

@@ -4,7 +4,7 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
 Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
-- [Live demo here](https://colab.research.google.com/github/jsoma/)
+- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
 ## Quick Example

natural_pdf-25.3.17.2/examples/direct_qa_example.py ADDED Viewed

@@ -0,0 +1,71 @@
+"""
+Direct Document QA example that closely mirrors the original pdfplumber implementation.
+This example shows how to:
+1. Use pdfplumber directly to extract words and images
+2. Use transformers pipelines for document QA
+3. Compare with the Natural PDF implementation
+It's intentionally similar to the original code provided by the user.
+"""
+import os
+import sys
+import argparse
+import pdfplumber
+from PIL import Image, ImageDraw
+import numpy as np
+# Add parent directory to path to run without installing
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# For comparison
+from natural_pdf import PDF, configure_logging
+import logging
+def main():
+    parser = argparse.ArgumentParser(description="Direct Document QA Example")
+    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
+                      help="Path to PDF document")
+    parser.add_argument("--question", default="How many votes for Harris and Walz?",
+                      help="Question to ask about the document")
+    parser.add_argument("--debug", action="store_true",
+                      help="Save debug information for troubleshooting")
+    args = parser.parse_args()
+    # Configure logging for Natural PDF
+    if args.debug:
+        configure_logging(level=logging.DEBUG)
+    else:
+        configure_logging(level=logging.INFO)
+    print(f"Document: {args.pdf_path}")
+    print(f"Question: {args.question}")
+    print("\n=== Natural PDF implementation ===")
+    # Use Natural PDF
+    pdf = PDF(args.pdf_path)
+    page = pdf.pages[0]
+    # Ask the question
+    result = page.ask(args.question, debug=args.debug)
+    if result.get("found", False):
+        print(f"Answer: {result['answer']}")
+        print(f"Confidence: {result['confidence']:.2f}")
+        # Highlight the answer
+        if result.get("source_elements"):
+            for element in result["source_elements"]:
+                element.highlight(color=(1, 0.5, 0, 0.5))
+            # Save the image
+            page.save_image("output/natural_pdf_answer.png")
+            print("Saved highlighted answer to output/natural_pdf_answer.png")
+    else:
+        print(f"No answer found: {result.get('error', '')}")
+if __name__ == "__main__":
+    main()

natural_pdf-25.3.17.2/examples/docling_comprehensive_test.py ADDED Viewed

@@ -0,0 +1,325 @@
+"""
+Comprehensive test of the Docling integration with Natural PDF.
+This script tests all aspects of the Docling integration:
+1. Basic document layout detection
+2. Hierarchical document navigation
+3. Text extraction from complex structures
+4. Integration with other layout models
+5. Performance and edge cases
+Usage:
+    python examples/docling_comprehensive_test.py [pdf_path]
+Dependencies:
+    - torch
+    - transformers
+    - docling_core
+"""
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+# Import the library
+from natural_pdf import PDF, configure_logging
+# Configure detailed logging for debugging
+configure_logging(level=logging.INFO)
+logger = logging.getLogger("docling_test")
+logger.setLevel(logging.INFO)
+# Get PDF path from command line or use demo file
+if len(sys.argv) > 1:
+    pdf_path = sys.argv[1]
+else:
+    # Default to a sample PDF in the pdfs directory
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(script_dir)
+    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
+# Check if required packages are installed
+try:
+    from docling.document_converter import DocumentConverter
+except ImportError:
+    logger.error("Missing required packages. Please install with:")
+    logger.error("pip install docling")
+    sys.exit(1)
+# Create output directory for test results
+output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output", "docling_tests")
+os.makedirs(output_dir, exist_ok=True)
+# Load the PDF
+logger.info(f"Loading PDF: {pdf_path}")
+pdf = PDF(pdf_path)
+logger.info(f"PDF has {len(pdf.pages)} pages")
+# Process only the first page for tests
+page = pdf.pages[0]
+# SECTION 1: Basic Docling Detection
+logger.info("\n*** SECTION 1: Basic Docling Detection ***")
+# Time the Docling analysis
+start_time = time.time()
+page.analyze_layout(
+    model="docling",
+    confidence=0.2,  # This parameter isn't used by Docling but kept for API consistency
+    model_params={
+        "verbose": True
+        # Any other parameters would be passed directly to DocumentConverter
+    }
+)
+docling_time = time.time() - start_time
+logger.info(f"Docling analysis completed in {docling_time:.2f} seconds")
+# Verify that docling_document was created
+if hasattr(page, 'docling_document'):
+    logger.info("✅ Docling document created successfully")
+else:
+    logger.error("❌ Docling document not created")
+# Count detected regions
+docling_regions = page.find_all('region[model=docling]')
+logger.info(f"Found {len(docling_regions)} total Docling regions")
+# Get regions by type
+section_headers = page.find_all('section-header')
+text_regions = page.find_all('region[model=docling][type=text]')
+figures = page.find_all('region[model=docling][type=figure]')
+logger.info(f"- Section headers: {len(section_headers)}")
+logger.info(f"- Text regions: {len(text_regions)}")
+logger.info(f"- Figures: {len(figures)}")
+# SECTION 2: Hierarchical Navigation
+logger.info("\n*** SECTION 2: Hierarchical Navigation ***")
+# Test if regions have child_regions attribute
+has_children_attr = all(hasattr(region, 'child_regions') for region in docling_regions)
+logger.info(f"All regions have child_regions attribute: {has_children_attr}")
+# Count top-level regions (no parent)
+top_level_regions = [r for r in docling_regions if not r.parent_region]
+logger.info(f"Top-level regions: {len(top_level_regions)}")
+# Test child traversal for section headers
+if section_headers:
+    header = section_headers[0]
+    logger.info(f"Testing section header: '{header.extract_text()[:30]}...'")
+    # Test get_children method
+    if hasattr(header, 'get_children'):
+        children = header.get_children()
+        logger.info(f"- Direct children: {len(children)}")
+        # Test filtered get_children
+        text_children = header.get_children('text')
+        logger.info(f"- Direct text children: {len(text_children)}")
+    else:
+        logger.error("❌ get_children method not found")
+    # Test get_descendants method
+    if hasattr(header, 'get_descendants'):
+        descendants = header.get_descendants()
+        logger.info(f"- All descendants: {len(descendants)}")
+        # Test filtered get_descendants
+        text_descendants = header.get_descendants('text')
+        logger.info(f"- Text descendants: {len(text_descendants)}")
+    else:
+        logger.error("❌ get_descendants method not found")
+    # Test find_all with recursive option
+    children_find = header.find_all('text', recursive=False)
+    logger.info(f"- Children via find_all(recursive=False): {len(children_find)}")
+    all_find = header.find_all('text', recursive=True)
+    logger.info(f"- All text via find_all(recursive=True): {len(all_find)}")
+# SECTION 3: Text Extraction
+logger.info("\n*** SECTION 3: Text Extraction ***")
+# Test basic text extraction
+if section_headers:
+    header = section_headers[0]
+    header_text = header.extract_text()
+    logger.info(f"Section header text: '{header_text[:50]}...'")
+    # Test extraction from hierarchy
+    if hasattr(header, 'get_children') and header.get_children():
+        child = header.get_children()[0]
+        child_text = child.extract_text()
+        logger.info(f"First child text: '{child_text[:50]}...'")
+        # Compare with standard extraction
+        # In a real document, the header's extract_text might include the child text too
+        combined_len = len(header_text) + len(child_text)
+        logger.info(f"Combined text length: {combined_len} characters")
+# Test text extraction with and without OCR
+# This is a simplified test - in a real scenario, we'd compare with known text
+extracted_text = page.extract_text()
+logger.info(f"Extracted page text: {len(extracted_text)} characters")
+# SECTION 4: Integration with Other Models
+logger.info("\n*** SECTION 4: Integration with Other Models ***")
+# Store current regions for comparison
+original_region_count = len(page._regions['detected'])
+# Add YOLO analysis
+page.analyze_layout(
+    model="yolo",
+    confidence=0.3,
+    existing="append"  # Important: don't replace Docling regions
+)
+# Count new regions
+all_regions = page._regions['detected']
+logger.info(f"Total regions after adding YOLO: {len(all_regions)}")
+logger.info(f"New regions added: {len(all_regions) - original_region_count}")
+# Test filtering by model
+yolo_regions = page.find_all('region[model=yolo]')
+docling_regions_after = page.find_all('region[model=docling]')
+logger.info(f"YOLO regions: {len(yolo_regions)}")
+logger.info(f"Docling regions after YOLO: {len(docling_regions_after)}")
+logger.info(f"Docling regions preserved: {len(docling_regions_after) == len(docling_regions)}")
+# SECTION 5: Visualization
+logger.info("\n*** SECTION 5: Visualization ***")
+# Clear previous highlights
+page.clear_highlights()
+# Highlight different models and region types
+if section_headers:
+    section_headers.highlight(
+        color=(1, 0, 0, 0.3),
+        label="Docling Headers",
+        include_attrs=['region_type']
+    )
+if text_regions:
+    text_regions.highlight(
+        color=(0, 0, 1, 0.3),
+        label="Docling Text",
+        include_attrs=['region_type']
+    )
+if yolo_regions:
+    yolo_regions.highlight(
+        color=(0, 1, 0, 0.3),
+        label="YOLO Regions",
+        include_attrs=['region_type']
+    )
+# Save highlighted image
+highlight_path = os.path.join(output_dir, "model_comparison.png")
+page.save_image(highlight_path, labels=True)
+logger.info(f"Saved visualization to {highlight_path}")
+# Test hierarchical highlighting
+if section_headers and len(section_headers) > 0:
+    # Clear previous highlights
+    page.clear_highlights()
+    # Select a section to visualize
+    header = section_headers[0]
+    # Highlight header
+    header.highlight(
+        color=(1, 0, 0, 0.3),
+        label="Section Header"
+    )
+    # Highlight direct children
+    if hasattr(header, 'get_children') and header.get_children():
+        children = header.get_children()
+        for child in children:
+            child.highlight(
+                color=(0, 1, 0, 0.3),
+                label="Direct Children",
+                include_attrs=['region_type']
+            )
+    # Save hierarchy visualization
+    hierarchy_path = os.path.join(output_dir, "hierarchy_visualization.png")
+    page.save_image(hierarchy_path, labels=True)
+    logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
+# SECTION 6: Text Source Testing (OCR vs Native)
+logger.info("\n*** SECTION 6: Text Source Testing ***")
+# Find text elements by source
+native_text = page.find_all('text[source=native]')
+ocr_text = page.find_all('text[source=ocr]')
+docling_text = page.find_all('region[model=docling][type=text]')
+logger.info(f"Text elements by source:")
+logger.info(f"- Native PDF text: {len(native_text)} elements")
+logger.info(f"- OCR text: {len(ocr_text)} elements")
+logger.info(f"- Docling text: {len(docling_text)} elements")
+# Test specific text element queries
+if native_text:
+    sample_native = native_text[0]
+    logger.info(f"Sample native text: '{sample_native.text[:30]}...'")
+    logger.info(f"Has source='native' attribute: {getattr(sample_native, 'source', None) == 'native'}")
+# Test if text_content attribute is set
+has_text_content = False
+for region in docling_regions:
+    if hasattr(region, 'text_content') and region.text_content:
+        has_text_content = True
+        logger.info(f"Found region with text_content: '{region.text_content[:30]}...'")
+        break
+logger.info(f"Regions have text_content attribute: {has_text_content}")
+# Test if associated_text_elements is used
+has_associated_text = False
+for region in docling_regions:
+    if hasattr(region, 'associated_text_elements') and region.associated_text_elements:
+        has_associated_text = True
+        logger.info(f"Found region with associated_text_elements: {len(region.associated_text_elements)} elements")
+        break
+logger.info(f"Regions have associated_text_elements: {has_associated_text}")
+# Highlight different text sources
+page.clear_highlights()
+if native_text:
+    native_text.highlight(
+        color=(0, 0, 0.7, 0.3),
+        label="Native Text Elements",
+        include_attrs=['source']
+    )
+if docling_text:
+    docling_text.highlight(
+        color=(0.7, 0, 0, 0.3),
+        label="Docling Text Elements",
+        include_attrs=['model']
+    )
+# Save source visualization
+source_path = os.path.join(output_dir, "text_sources.png")
+page.save_image(source_path, labels=True)
+logger.info(f"Saved text source visualization to {source_path}")
+# Log final summary
+print("\n*** TEST SUMMARY ***")
+print(f"Total Docling regions: {len(docling_regions)}")
+print(f"Hierarchical navigation: {'✅ Working' if has_children_attr else '❌ Not working'}")
+print(f"Text extraction: {'✅ Working' if len(extracted_text) > 0 else '❌ Not working'}")
+print(f"Multi-model integration: {'✅ Working' if len(yolo_regions) > 0 else '❌ Not working'}")
+print(f"Test artifacts saved to: {output_dir}")
+print("\nAll tests completed with no errors!")
+logger.info("\nAll tests completed.")

natural_pdf-25.3.17.2/examples/docling_example.py ADDED Viewed

@@ -0,0 +1,192 @@
+"""
+Example script demonstrating the Docling integration with Natural PDF.
+This script uses Docling to analyze document layout and text structure,
+with hierarchical relationships between document elements.
+Usage:
+    python examples/docling_example.py [pdf_path]
+Dependencies:
+    - torch
+    - transformers
+    - docling_core
+"""
+import os
+import sys
+import logging
+from PIL import Image
+# Import the library
+from natural_pdf import PDF, configure_logging
+# Get PDF path from command line or use demo file
+if len(sys.argv) > 1:
+    pdf_path = sys.argv[1]
+else:
+    # Default to a sample PDF in the pdfs directory
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(script_dir)
+    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
+# Configure logging to see what's happening
+configure_logging(level=logging.INFO)
+logger = logging.getLogger("docling_example")
+logger.setLevel(logging.INFO)
+# Check if we can import required packages
+try:
+    from docling.document_converter import DocumentConverter
+except ImportError:
+    logger.error("Missing required packages. Please install:")
+    logger.error("pip install docling")
+    sys.exit(1)
+# Docling will use the best available device automatically
+# Load the PDF
+pdf = PDF(pdf_path)
+logger.info(f"Loaded PDF with {len(pdf.pages)} pages")
+# Process the first page with Docling
+page = pdf.pages[0]
+# Run Docling analysis with the new docling model
+logger.info("Running Docling analysis...")
+page.analyze_layout(
+    model="docling",
+    confidence=0.2,  # This parameter isn't used by Docling but kept for API consistency
+    model_params={
+        "verbose": True,  # Enable detailed logging
+        # Any other parameters would be passed directly to DocumentConverter
+    }
+)
+# If we have a docling_document, we succeeded
+if hasattr(page, 'docling_document'):
+    logger.info("Docling analysis complete!")
+    # Find all detected regions by model
+    docling_regions = page.find_all('region[model=docling]')
+    logger.info(f"Found {len(docling_regions)} Docling regions")
+    # Get regions by type
+    section_headers = page.find_all('section-header')
+    plain_text = page.find_all('text[model=docling]')
+    figures = page.find_all('figure[model=docling]')
+    logger.info(f"Found {len(section_headers)} section headers")
+    logger.info(f"Found {len(plain_text)} text blocks")
+    logger.info(f"Found {len(figures)} figures")
+    # Print hierarchy information
+    root_regions = [r for r in docling_regions if not r.parent_region]
+    logger.info(f"Document has {len(root_regions)} top-level regions")
+    # Print text from each section header and its children
+    for i, header in enumerate(section_headers):
+        logger.info(f"\nSection {i+1}: {header.extract_text()}")
+        # Get direct children of this header
+        children = header.get_children()
+        if children:
+            logger.info(f"  - Has {len(children)} direct children")
+            for j, child in enumerate(children[:2]):  # Show first 2 children
+                child_text = child.extract_text()
+                if len(child_text) > 50:
+                    child_text = child_text[:50] + "..."
+                logger.info(f"  - Child {j+1}: {child.region_type} - {child_text}")
+            if len(children) > 2:
+                logger.info(f"  - And {len(children) - 2} more children...")
+    # Highlight different types of regions
+    page.clear_highlights()
+    # Highlight section headers in red
+    if section_headers:
+        section_headers.highlight(
+            color=(1, 0, 0, 0.3),
+            label="Section Headers",
+            include_attrs=['confidence']
+        )
+    # Highlight text blocks in blue
+    if plain_text:
+        plain_text.highlight(
+            color=(0, 0, 1, 0.3),
+            label="Text Blocks"
+        )
+    # Highlight figures in green
+    if figures:
+        figures.highlight(
+            color=(0, 1, 0, 0.3),
+            label="Figures"
+        )
+    # Demonstrate hierarchical extraction
+    if section_headers:
+        # Get the first section header
+        header = section_headers[0]
+        # Extract all text recursively from this section and its children
+        all_text = header.extract_text()
+        logger.info(f"\nExtracted text from first section: {all_text[:100]}...")
+        # Find all text elements recursively within this section
+        section_text_elems = header.find_all('text', recursive=True)
+        logger.info(f"Found {len(section_text_elems)} text elements in the section hierarchy")
+        # Test recursive searching
+        if hasattr(header, 'get_descendants'):
+            descendants = header.get_descendants()
+            logger.info(f"Section has {len(descendants)} total descendants")
+    # Save highlighted image
+    output_path = os.path.join("output", "docling_analysis.png")
+    os.makedirs("output", exist_ok=True)
+    logger.info(f"Saving visualization to {output_path}")
+    page.save_image(output_path, labels=True)
+    # Create a more detailed visualization showing the hierarchy
+    if section_headers and len(section_headers) > 0:
+        # Create a new visualization from scratch
+        page.clear_highlights()
+        # Get the first section to visualize its hierarchy
+        section = section_headers[0]
+        # Highlight the section header
+        section.highlight(
+            color=(1, 0, 0, 0.3),
+            label="Section Header"
+        )
+        # Highlight its immediate children
+        children = section.get_children()
+        for child in children:
+            child.highlight(
+                color=(0, 0.7, 0, 0.3),
+                label="Direct Children",
+                include_attrs=['region_type']
+            )
+            # Highlight grandchildren differently
+            grandchildren = child.get_children()
+            for grandchild in grandchildren:
+                grandchild.highlight(
+                    color=(0, 0, 0.7, 0.3),
+                    label="Grandchildren",
+                    include_attrs=['region_type']
+                )
+        # Save hierarchy visualization
+        hierarchy_path = os.path.join("output", "docling_hierarchy.png")
+        page.save_image(hierarchy_path, labels=True)
+        logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
+else:
+    logger.error("Docling analysis failed. Check that you have the required packages installed.")

natural-pdf 25.3.16.2__tar.gz → 25.3.17.2__tar.gz

natural-pdf 25.3.16.2__tar.gz → 25.3.17.2__tar.gz