PyPI - natural-pdf - Versions diffs - 25.3.16__py3-none-any.whl - Mend

natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

examples/__init__.py +3 -0
examples/another_exclusion_example.py +20 -0
examples/basic_usage.py +190 -0
examples/boundary_exclusion_test.py +137 -0
examples/boundary_inclusion_fix_test.py +157 -0
examples/chainable_layout_example.py +70 -0
examples/color_basic_test.py +49 -0
examples/color_name_example.py +71 -0
examples/color_test.py +62 -0
examples/debug_ocr.py +91 -0
examples/direct_ocr_test.py +148 -0
examples/direct_paddle_test.py +99 -0
examples/direct_qa_example.py +165 -0
examples/document_layout_analysis.py +123 -0
examples/document_qa_example.py +185 -0
examples/exclusion_count_debug.py +128 -0
examples/exclusion_debug.py +107 -0
examples/exclusion_example.py +150 -0
examples/exclusion_optimization_example.py +190 -0
examples/extract_text_test.py +128 -0
examples/font_aware_example.py +101 -0
examples/font_variant_example.py +124 -0
examples/footer_overlap_test.py +124 -0
examples/highlight_all_example.py +82 -0
examples/highlight_attributes_test.py +114 -0
examples/highlight_confidence_display.py +122 -0
examples/highlight_demo.py +110 -0
examples/highlight_float_test.py +71 -0
examples/highlight_test.py +147 -0
examples/highlighting_example.py +123 -0
examples/image_width_example.py +84 -0
examples/improved_api_example.py +128 -0
examples/layout_confidence_display_test.py +65 -0
examples/layout_confidence_test.py +82 -0
examples/layout_coordinate_debug.py +258 -0
examples/layout_highlight_test.py +77 -0
examples/logging_example.py +70 -0
examples/ocr_comprehensive.py +193 -0
examples/ocr_debug_example.py +87 -0
examples/ocr_default_test.py +97 -0
examples/ocr_engine_comparison.py +235 -0
examples/ocr_example.py +89 -0
examples/ocr_simplified_params.py +79 -0
examples/ocr_visualization.py +102 -0
examples/ocr_visualization_test.py +121 -0
examples/paddle_layout_example.py +315 -0
examples/paddle_layout_simple.py +74 -0
examples/paddleocr_example.py +224 -0
examples/page_collection_example.py +103 -0
examples/polygon_highlight_example.py +83 -0
examples/position_methods_example.py +134 -0
examples/region_boundary_test.py +73 -0
examples/region_exclusion_test.py +149 -0
examples/region_expand_example.py +109 -0
examples/region_image_example.py +116 -0
examples/region_ocr_test.py +119 -0
examples/region_sections_example.py +115 -0
examples/school_books.py +49 -0
examples/school_books_all.py +52 -0
examples/scouring.py +36 -0
examples/section_extraction_example.py +232 -0
examples/simple_document_qa.py +97 -0
examples/spatial_navigation_example.py +108 -0
examples/table_extraction_example.py +135 -0
examples/table_structure_detection.py +155 -0
examples/tatr_cells_test.py +56 -0
examples/tatr_ocr_table_test.py +94 -0
examples/text_search_example.py +122 -0
examples/text_style_example.py +110 -0
examples/tiny-text.py +61 -0
examples/until_boundaries_example.py +156 -0
examples/until_example.py +112 -0
examples/very_basics.py +15 -0
natural_pdf/__init__.py +55 -0
natural_pdf/analyzers/__init__.py +9 -0
natural_pdf/analyzers/document_layout.py +736 -0
natural_pdf/analyzers/text_structure.py +153 -0
natural_pdf/core/__init__.py +3 -0
natural_pdf/core/page.py +2376 -0
natural_pdf/core/pdf.py +572 -0
natural_pdf/elements/__init__.py +3 -0
natural_pdf/elements/base.py +553 -0
natural_pdf/elements/collections.py +770 -0
natural_pdf/elements/line.py +124 -0
natural_pdf/elements/rect.py +122 -0
natural_pdf/elements/region.py +1366 -0
natural_pdf/elements/text.py +304 -0
natural_pdf/ocr/__init__.py +62 -0
natural_pdf/ocr/easyocr_engine.py +254 -0
natural_pdf/ocr/engine.py +158 -0
natural_pdf/ocr/paddleocr_engine.py +263 -0
natural_pdf/qa/__init__.py +3 -0
natural_pdf/qa/document_qa.py +405 -0
natural_pdf/selectors/__init__.py +4 -0
natural_pdf/selectors/parser.py +360 -0
natural_pdf/templates/__init__.py +1 -0
natural_pdf/templates/ocr_debug.html +517 -0
natural_pdf/utils/__init__.py +4 -0
natural_pdf/utils/highlighting.py +605 -0
natural_pdf/utils/ocr.py +515 -0
natural_pdf/utils/reading_order.py +227 -0
natural_pdf/utils/visualization.py +151 -0
natural_pdf-25.3.16.dist-info/LICENSE +21 -0
natural_pdf-25.3.16.dist-info/METADATA +268 -0
natural_pdf-25.3.16.dist-info/RECORD +109 -0
natural_pdf-25.3.16.dist-info/WHEEL +5 -0
natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
tests/__init__.py +3 -0
tests/test_pdf.py +39 -0

examples/ocr_visualization_test.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""
+OCR Visualization Test
+This example demonstrates the OCR text visualization feature using PaddleOCR.
+"""
+import os
+import sys
+from pathlib import Path
+# Put the directory above this script's folder at the front of sys.path so the
+# natural_pdf import below is looked up there first
+script_dir = os.path.dirname(os.path.realpath(__file__))
+root_dir = os.path.dirname(script_dir)
+sys.path.insert(0, root_dir)
+# Import the library (placed after the sys.path change above, not at the top of the file)
+from natural_pdf import PDF
+# Generated images go to <root_dir>/output; create it up front so later saves have a target
+output_dir = os.path.join(root_dir, "output")
+os.makedirs(output_dir, exist_ok=True)
+# Preferred input: a PDF that typically needs OCR
+pdf_path = os.path.join(root_dir, "pdfs", "needs-ocr.pdf")
+if not os.path.exists(pdf_path):
+    # Fall back to other sample PDFs if needs-ocr.pdf doesn't exist
+    pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
+    if not os.path.exists(pdf_path):
+        # Last resort; the existence of this file is not checked here
+        pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
+print("OCR Visualization Test")
+print("=====================")
+print(f"Using PDF: {pdf_path}")
+# Open the PDF with the PaddleOCR engine, falling back to EasyOCR on any error.
+# Both attempts pass the same OCR options (enabled, English, min_confidence 0.3).
+# NOTE(review): the fallback only triggers if PDF(...) itself raises; whether an
+# unavailable engine is detected at construction time is not visible here - confirm.
+try:
+    # Try with PaddleOCR first
+    pdf = PDF(
+        pdf_path,
+        ocr_engine="paddleocr",
+        ocr={
+            "enabled": True,
+            "languages": ["en"],
+            "min_confidence": 0.3
+        }
+    )
+    print("Using PaddleOCR engine")
+except Exception as e:
+    # Report why PaddleOCR could not be used before switching engines
+    print(f"PaddleOCR initialization failed: {e}")
+    print("Falling back to EasyOCR")
+    # Fall back to EasyOCR; an exception raised here is not caught and ends the script
+    pdf = PDF(
+        pdf_path,
+        ocr_engine="easyocr",
+        ocr={
+            "enabled": True,
+            "languages": ["en"],
+            "min_confidence": 0.3
+        }
+    )
+# Everything below works on the first page only
+page = pdf.pages[0]
+# Request text extraction with ocr=True
+print("\nExtracting text with OCR...")
+text = page.extract_text(ocr=True)
+print(f"Extracted {len(text)} characters of text")
+# Show a preview (at most the first 100 characters) only when some text came back
+if text:
+    print(f"First 100 chars: {text[:100]}...")
+# Fetch the individual OCR elements; they are highlighted and counted further down
+print("\nExtracting OCR elements...")
+ocr_elements = page.extract_ocr_elements()
+print(f"Found {len(ocr_elements)} OCR elements")
+# Create highlight visualization
+print("\nCreating highlight visualization...")
+for elem in ocr_elements:
+    # Use color based on confidence - with full RGB values (0-255) and higher opacity
+    if elem.confidence >= 0.8:
+        color = (0, 255, 0, 180)  # Green for high confidence (more visible)
+    elif elem.confidence >= 0.5:
+        color = (255, 255, 0, 180)  # Yellow for medium confidence
+    else:
+        color = (255, 0, 0, 180)  # Red for low confidence
+    # Add highlight with confidence as label
+    elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
+# Save image with highlights only
+highlight_path = os.path.join(output_dir, "ocr_visualization_highlights.png")
+page.to_image(path=highlight_path, show_labels=True)
+print(f"Saved highlighted image to {highlight_path}")
+# Now use the OCR text rendering feature (only when OCR produced at least one element)
+if len(ocr_elements) > 0:
+    print("\nCreating rendered OCR text visualization...")
+    # First image: highlights are still present, and render_ocr=True is requested on top
+    ocr_text_path = os.path.join(output_dir, "ocr_visualization_text.png")
+    try:
+        page.to_image(path=ocr_text_path, show_labels=True, render_ocr=True)
+        print(f"Saved OCR text rendering to {ocr_text_path}")
+    except ValueError as e:
+        # Only ValueError is reported and skipped; any other exception propagates
+        print(f"Error rendering OCR text: {e}")
+    # Second image: drop the highlights first so only the OCR rendering is requested
+    print("\nCreating clean OCR text visualization...")
+    page.clear_highlights()
+    # Save clean image with only OCR text
+    clean_text_path = os.path.join(output_dir, "ocr_visualization_clean.png")
+    try:
+        page.to_image(path=clean_text_path, render_ocr=True)
+        print(f"Saved clean OCR text rendering to {clean_text_path}")
+    except ValueError as e:
+        # Same policy as above: report ValueError and carry on
+        print(f"Error rendering clean OCR text: {e}")
+else:
+    print("\nNo OCR elements found to render.")
+print("\nTest complete!")

examples/paddle_layout_example.py ADDED Viewed

@@ -0,0 +1,315 @@
+"""
+Document layout analysis example using PaddlePaddle's PP-Structure model.
+This example demonstrates how to use PaddlePaddle for document layout analysis
+to detect and extract content from different regions of a PDF document.
+Features:
+- Standard layout detection using PaddlePaddle's PP-Structure
+- Enhanced text detection by combining PP-Structure with direct OCR
+- Visualization of different region types and sources
+- Comparison mode to evaluate performance with and without text detection
+- Support for polygon-based text regions from OCR
+"""
+import os
+import sys
+import logging
+from pathlib import Path
+import argparse
+# Import the library with its logging utilities; the parent of this script's
+# directory is put first on sys.path so the import is looked up there first
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from natural_pdf import configure_logging, PDF
+# Get the current directory of this script
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Get the parent directory (project root)
+root_dir = os.path.dirname(script_dir)
+# Default PDF path, used when no positional pdf_path argument is given
+default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
+# Set up argument parser
+parser = argparse.ArgumentParser(description="PaddlePaddle layout analysis example")
+parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
+parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
+parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
+parser.add_argument("--lang", type=str, default="en", help="Language code (en, ch, etc.)")
+parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on ('cpu' or 'gpu')")
+parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
+# NOTE(review): --disable-table is parsed but never read in the part of the script shown here
+parser.add_argument("--disable-table", action="store_true", help="Disable table detection")
+parser.add_argument("--text-detection", action="store_true", help="Enable direct text detection")
+parser.add_argument("--compare", action="store_true", help="Compare with and without text detection")
+parser.add_argument("--verbose", action="store_true", help="Show detailed debug output")
+parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+                    default="INFO", help="Set logging level")
+args = parser.parse_args()
+# Configure logging based on command-line arguments; the choices above are all
+# attribute names on the logging module, so getattr resolves each of them
+log_level = getattr(logging, args.log_level)
+configure_logging(level=log_level)
+# --verbose takes precedence over --log-level: logging is configured a second time at DEBUG
+if args.verbose:
+    configure_logging(level=logging.DEBUG)
+print(f"Analyzing PDF: {args.pdf_path}")
+print(f"Page: {args.page}")
+print(f"Confidence threshold: {args.conf}")
+# Load the PDF; args.page indexes pdf.pages directly and is not range-checked here
+pdf = PDF(args.pdf_path)
+page = pdf.pages[args.page]
+print(f"Running PaddlePaddle layout analysis...")
+# Enable debugging output
+print("PDF page dimensions:", page.width, "x", page.height)
+# Check if we should run comparison
+if args.compare:
+    print("\n=== Comparing Layout Detection With and Without Text Detection ===")
+    # First run without text detection
+    print("\nRunning WITHOUT text detection...")
+    import time
+    start = time.time()
+    regions_without_text = page.analyze_layout(
+        model="paddle",
+        confidence=args.conf,
+        device=args.device,
+        model_params={
+            "lang": args.lang,
+            "show_log": args.verbose,
+            "detect_text": False,
+            "verbose": args.verbose
+        }
+    )
+    time_without = time.time() - start
+    # Highlight without text detection
+    page.highlight_layout()
+    # Save the highlighted image
+    output_without = os.path.join(
+        os.path.dirname(args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")),
+        "paddle_layout_without_text.png"
+    )
+    page.to_image(path=output_without, show_labels=True)
+    print(f"Found {len(regions_without_text)} regions WITHOUT text detection in {time_without:.2f} seconds")
+    print(f"Saved image to {output_without}")
+    # Clear highlights
+    page.clear_highlights()
+    # Then run with text detection
+    print("\nRunning WITH text detection...")
+    start = time.time()
+    regions_with_text = page.analyze_layout(
+        model="paddle",
+        confidence=args.conf,
+        device=args.device,
+        model_params={
+            "lang": args.lang,
+            "show_log": args.verbose,
+            "detect_text": True,
+            "verbose": args.verbose
+        }
+    )
+    time_with = time.time() - start
+    # Highlight with text detection
+    page.highlight_layout()
+    # Save the highlighted image
+    output_with = os.path.join(
+        os.path.dirname(args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")),
+        "paddle_layout_with_text.png"
+    )
+    page.to_image(path=output_with, show_labels=True)
+    print(f"Found {len(regions_with_text)} regions WITH text detection in {time_with:.2f} seconds")
+    print(f"Saved image to {output_with}")
+    # Comparison
+    print("\nComparison results:")
+    print(f"  - WITHOUT text detection: {len(regions_without_text)} regions in {time_without:.2f} seconds")
+    print(f"  - WITH text detection: {len(regions_with_text)} regions in {time_with:.2f} seconds")
+    print(f"  - Additional regions: {len(regions_with_text) - len(regions_without_text)}")
+    print(f"  - Speed difference: {time_with / time_without:.2f}x longer with text detection")
+    # Continue with the regions from the requested mode
+    regions = regions_with_text if args.text_detection else regions_without_text
+else:
+    # Run regular layout analysis
+    regions = page.analyze_layout(
+        model="paddle",
+        confidence=args.conf,
+        device=args.device,
+        model_params={
+            "lang": args.lang,
+            "show_log": args.verbose,
+            "detect_text": args.text_detection,
+            "verbose": args.verbose
+        }
+    )
+print(f"Found {len(regions)} regions with confidence >= {args.conf}")
+# Group regions by type and source
+regions_by_type = {}
+sources = {"layout": 0, "ocr": 0, "unknown": 0}
+for region in regions:
+    region_type = region.region_type
+    if region_type not in regions_by_type:
+        regions_by_type[region_type] = []
+    regions_by_type[region_type].append(region)
+    # Count sources
+    source = getattr(region, "source", "unknown")
+    sources[source] = sources.get(source, 0) + 1
+# Print a summary of detected regions by type
+for region_type, type_regions in regions_by_type.items():
+    print(f"  - {region_type}: {len(type_regions)} regions")
+# Print source information
+print("\nRegion sources:")
+for source, count in sources.items():
+    print(f"  - {source}: {count} regions")
+# If the user enabled text detection, show source-specific highlighting
+if args.text_detection:
+    print("\nHighlighting regions by source...")
+    # Clear any existing highlights
+    page.clear_highlights()
+    # Get text regions separately using normalized_type
+    text_regions = page.find_all('region[normalized_type=plain-text][model=paddle]')
+    figure_regions = page.find_all('region[normalized_type=figure][model=paddle]')
+    # Highlight figure regions in blue
+    for region in figure_regions:
+        region.highlight(color=(0, 0, 1, 0.3), label=f"Figure: {region.region_type}")
+    # Highlight text regions in green
+    for region in text_regions:
+        region.highlight(color=(0, 1, 0, 0.3), label=f"Text: {region.region_type}")
+    # Save the source-highlighted image
+    sources_output = os.path.join(
+        os.path.dirname(args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")),
+        "paddle_layout_sources.png"
+    )
+    page.to_image(path=sources_output, show_labels=True)
+    print(f"Saved source-highlighted layout to {sources_output}")
+    # Show polygon visualizations if any OCR regions have polygons
+    regions_with_polygons = [r for r in regions if hasattr(r, "polygon")]
+    if regions_with_polygons:
+        print(f"\nVisualizing {len(regions_with_polygons)} regions with polygon points...")
+        page.clear_highlights()
+        # Highlight regions with polygons in red
+        for region in regions_with_polygons:
+            region.highlight(color=(1, 0, 0, 0.3), label="Polygon Region")
+        # Save the polygon-highlighted image
+        polygon_output = os.path.join(
+            os.path.dirname(args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")),
+            "paddle_layout_polygons.png"
+        )
+        page.to_image(path=polygon_output, show_labels=True)
+        print(f"Saved polygon visualization to {polygon_output}")
+    # Clear highlights for standard view
+    page.clear_highlights()
+# Highlight all detected regions normally
+page.highlight_all(include_layout_regions=True, layout_confidence=args.conf)
+# Demonstrate using selectors to find regions by type and model
+print("\nSelecting regions by type and model:")
+for region_type in regions_by_type.keys():
+    # Convert spaces to hyphens for selector syntax
+    selector_type = region_type.lower().replace(' ', '-')
+    # Use model-specific selector
+    # Use either type or normalized_type in selector
+    if region_type.lower() == 'text':
+        selector = f"region[normalized_type=plain-text][model=paddle]"
+    else:
+        selector = f"region[normalized_type={selector_type}][model=paddle]"
+    found_regions = page.find_all(selector)
+    print(f"  - {selector}: {len(found_regions)} regions")
+    # Try different selectors to debug the issue
+    model_regions = page.find_all(f"region[type={selector_type}]")
+    paddle_regions = page.find_all(f"region[model=paddle]")
+    layout_regions = page.find_all(f"region[source=layout]")
+    ocr_regions = page.find_all(f"region[source=ocr]")
+    detected_regions = page.find_all(f"region[source=detected]")
+    print(f"    - With type only: {len(model_regions)} regions")
+    print(f"    - With model=paddle: {len(paddle_regions)} regions")
+    print(f"    - With source=layout: {len(layout_regions)} regions")
+    print(f"    - With source=ocr: {len(ocr_regions)} regions")
+    print(f"    - With source=detected: {len(detected_regions)} regions")
+    # Debug a sample region
+    if model_regions:
+        region = model_regions[0]
+        print(f"    - Sample region attributes: type={region.region_type}, normalized_type={getattr(region, 'normalized_type', 'N/A')}, " +
+              f"source={getattr(region, 'source', 'N/A')}, model={getattr(region, 'model', 'N/A')}")
+    # For text regions, find a sample to debug
+    if region_type.lower() == 'text' and detected_regions:
+        text_sample = None
+        for i, r in enumerate(detected_regions[:10]):
+            print(f"    - Detected region {i}: type={r.region_type}, normalized_type={getattr(r, 'normalized_type', 'N/A')}")
+    # Extract text from the first region if available
+    if found_regions:
+        text = found_regions[0].extract_text()
+        preview = text[:50] + "..." if len(text) > 50 else text
+        print(f"    First region text: {preview}")
+# Save the highlighted image
+output_path = args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+print(f"\nSaving highlighted layout to {output_path}")
+page.to_image(path=output_path, show_labels=True)
+print(f"Done!")
+# Show an example of working with a table region (only when at least one
+# region was grouped under the type "table")
+if "table" in regions_by_type and regions_by_type["table"]:
+    print("\nExample: Working with a detected table region")
+    # Use the first detected table
+    table_region = regions_by_type["table"][0]
+    # Extract table data
+    try:
+        # Try using the extract_table method on the region
+        table_data = table_region.extract_table()
+        print(f"  Extracted {len(table_data)} rows from table")
+        # Show some table data
+        for i, row in enumerate(table_data[:2]):  # Show first 2 rows
+            print(f"    Row {i}: {row}")
+        # Check for cells; this query covers the whole page, not just this table
+        cells = page.find_all('region[type=table_cell][model=paddle]')
+        if cells:
+            print(f"\n  Found {len(cells)} table cells")
+            cell = cells[0]
+            print(f"    First cell text: {cell.extract_text()}")
+            # Index attributes a cell lacks are printed as N/A
+            print(f"    Row index: {getattr(cell, 'row_idx', 'N/A')}, Column index: {getattr(cell, 'col_idx', 'N/A')}")
+    except Exception as e:
+        # Any failure above is reported and the script goes on to the highlight step
+        print(f"  Error extracting table data: {e}")
+    # Save the highlighted table next to the main output image; highlights
+    # added earlier have not been cleared at this point
+    table_output = os.path.join(os.path.dirname(output_path), "paddle_detected_table.png")
+    table_region.highlight(color=(0, 1, 0, 0.3), label="PaddlePaddle Table")
+    page.to_image(path=table_output, show_labels=True)
+    print(f"  Table highlighted image saved to {table_output}")

examples/paddle_layout_simple.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""
+Simple test of PaddlePaddle layout analysis using minimal parameters.
+"""
+import os
+import sys
+from pathlib import Path
+# Add parent directory to path for imports (inserted first, so it is searched first)
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from natural_pdf import PDF
+# Get the current directory of this script
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Get the parent directory (project root)
+root_dir = os.path.dirname(script_dir)
+# Get PDF path from the first command-line argument or use default
+if len(sys.argv) > 1:
+    pdf_path = sys.argv[1]
+else:
+    # Default PDF path
+    pdf_path = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")
+# Get page number from the second command-line argument or use 0;
+# a non-integer value makes int() raise ValueError
+page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 0
+print(f"Analyzing PDF: {pdf_path}")
+print(f"Page: {page_num}")
+# Load the PDF; page_num indexes pdf.pages directly and is not range-checked here
+pdf = PDF(pdf_path)
+page = pdf.pages[page_num]
+print("Running PaddlePaddle layout analysis...")
+# Run paddle layout analysis, passing only the model name, a confidence
+# threshold and one model parameter
+regions = page.analyze_layout(
+    model="paddle",
+    confidence=0.2,  # Lower confidence threshold to detect more regions
+    model_params={
+        "show_log": True
+    }
+)
+print(f"Found {len(regions)} regions")
+# Group regions by type and source
+region_groups = {}
+for region in regions:
+    region_type = region.region_type
+    source = getattr(region, 'source', 'unknown')
+    group_key = f"{region_type} ({source})"
+    if group_key not in region_groups:
+        region_groups[group_key] = []
+    region_groups[group_key].append(region)
+# Print regions by type and source
+for group_key, group_regions in region_groups.items():
+    print(f"{group_key}: {len(group_regions)} regions")
+# Highlight regions by type and source with different colors
+print("Highlighting regions...")
+for group_key, group_regions in region_groups.items():
+    for region in group_regions:
+        region.highlight(label=f"{group_key}")
+# Save highlighted image
+output_path = os.path.join(root_dir, "output", "paddle_layout_simple.png")
+print(f"Saving highlighted image to {output_path}")
+page.to_image(path=output_path, show_labels=True)
+print("Done!")