PyPI - natural-pdf - Versions diffs - 25.3.16__py3-none-any.whl - Mend

natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (109) hide show

examples/__init__.py +3 -0
examples/another_exclusion_example.py +20 -0
examples/basic_usage.py +190 -0
examples/boundary_exclusion_test.py +137 -0
examples/boundary_inclusion_fix_test.py +157 -0
examples/chainable_layout_example.py +70 -0
examples/color_basic_test.py +49 -0
examples/color_name_example.py +71 -0
examples/color_test.py +62 -0
examples/debug_ocr.py +91 -0
examples/direct_ocr_test.py +148 -0
examples/direct_paddle_test.py +99 -0
examples/direct_qa_example.py +165 -0
examples/document_layout_analysis.py +123 -0
examples/document_qa_example.py +185 -0
examples/exclusion_count_debug.py +128 -0
examples/exclusion_debug.py +107 -0
examples/exclusion_example.py +150 -0
examples/exclusion_optimization_example.py +190 -0
examples/extract_text_test.py +128 -0
examples/font_aware_example.py +101 -0
examples/font_variant_example.py +124 -0
examples/footer_overlap_test.py +124 -0
examples/highlight_all_example.py +82 -0
examples/highlight_attributes_test.py +114 -0
examples/highlight_confidence_display.py +122 -0
examples/highlight_demo.py +110 -0
examples/highlight_float_test.py +71 -0
examples/highlight_test.py +147 -0
examples/highlighting_example.py +123 -0
examples/image_width_example.py +84 -0
examples/improved_api_example.py +128 -0
examples/layout_confidence_display_test.py +65 -0
examples/layout_confidence_test.py +82 -0
examples/layout_coordinate_debug.py +258 -0
examples/layout_highlight_test.py +77 -0
examples/logging_example.py +70 -0
examples/ocr_comprehensive.py +193 -0
examples/ocr_debug_example.py +87 -0
examples/ocr_default_test.py +97 -0
examples/ocr_engine_comparison.py +235 -0
examples/ocr_example.py +89 -0
examples/ocr_simplified_params.py +79 -0
examples/ocr_visualization.py +102 -0
examples/ocr_visualization_test.py +121 -0
examples/paddle_layout_example.py +315 -0
examples/paddle_layout_simple.py +74 -0
examples/paddleocr_example.py +224 -0
examples/page_collection_example.py +103 -0
examples/polygon_highlight_example.py +83 -0
examples/position_methods_example.py +134 -0
examples/region_boundary_test.py +73 -0
examples/region_exclusion_test.py +149 -0
examples/region_expand_example.py +109 -0
examples/region_image_example.py +116 -0
examples/region_ocr_test.py +119 -0
examples/region_sections_example.py +115 -0
examples/school_books.py +49 -0
examples/school_books_all.py +52 -0
examples/scouring.py +36 -0
examples/section_extraction_example.py +232 -0
examples/simple_document_qa.py +97 -0
examples/spatial_navigation_example.py +108 -0
examples/table_extraction_example.py +135 -0
examples/table_structure_detection.py +155 -0
examples/tatr_cells_test.py +56 -0
examples/tatr_ocr_table_test.py +94 -0
examples/text_search_example.py +122 -0
examples/text_style_example.py +110 -0
examples/tiny-text.py +61 -0
examples/until_boundaries_example.py +156 -0
examples/until_example.py +112 -0
examples/very_basics.py +15 -0
natural_pdf/__init__.py +55 -0
natural_pdf/analyzers/__init__.py +9 -0
natural_pdf/analyzers/document_layout.py +736 -0
natural_pdf/analyzers/text_structure.py +153 -0
natural_pdf/core/__init__.py +3 -0
natural_pdf/core/page.py +2376 -0
natural_pdf/core/pdf.py +572 -0
natural_pdf/elements/__init__.py +3 -0
natural_pdf/elements/base.py +553 -0
natural_pdf/elements/collections.py +770 -0
natural_pdf/elements/line.py +124 -0
natural_pdf/elements/rect.py +122 -0
natural_pdf/elements/region.py +1366 -0
natural_pdf/elements/text.py +304 -0
natural_pdf/ocr/__init__.py +62 -0
natural_pdf/ocr/easyocr_engine.py +254 -0
natural_pdf/ocr/engine.py +158 -0
natural_pdf/ocr/paddleocr_engine.py +263 -0
natural_pdf/qa/__init__.py +3 -0
natural_pdf/qa/document_qa.py +405 -0
natural_pdf/selectors/__init__.py +4 -0
natural_pdf/selectors/parser.py +360 -0
natural_pdf/templates/__init__.py +1 -0
natural_pdf/templates/ocr_debug.html +517 -0
natural_pdf/utils/__init__.py +4 -0
natural_pdf/utils/highlighting.py +605 -0
natural_pdf/utils/ocr.py +515 -0
natural_pdf/utils/reading_order.py +227 -0
natural_pdf/utils/visualization.py +151 -0
natural_pdf-25.3.16.dist-info/LICENSE +21 -0
natural_pdf-25.3.16.dist-info/METADATA +268 -0
natural_pdf-25.3.16.dist-info/RECORD +109 -0
natural_pdf-25.3.16.dist-info/WHEEL +5 -0
natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
tests/__init__.py +3 -0
tests/test_pdf.py +39 -0

examples/highlighting_example.py ADDED Viewed

@@ -0,0 +1,123 @@
+"""
+Example demonstrating the highlighting feature of natural-pdf.
+"""
+import os
+import sys
+# Add the parent directory to the path to import the package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+# IMPORTANT: This example has been updated to use the new API
+# Changes:
+# - select_until() → until()
+# - full_width=False → width="element"
+# - labels=True → show_labels=True
+# - cycle_colors=True → use_color_cycling=True
+def highlighting_example(pdf_path):
+    """Demonstrate the highlighting features used for visual debugging.
+
+    Works on the first page of the PDF and runs three demonstrations:
+    a single labelled element, two element collections highlighted under
+    separate labels, and a region plus selected text inside it. After each
+    demonstration the page is rendered to a PNG in the ``output`` directory
+    located one level above this script.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+    """
+    # Open the PDF
+    with PDF(pdf_path) as pdf:
+        page = pdf.pages[0]
+        print(f"PDF loaded: {pdf_path}")
+        print(f"PDF has {len(pdf)} pages")
+        # Create an output directory for saving images
+        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
+        os.makedirs(output_dir, exist_ok=True)
+        # EXAMPLE 1: Highlight a single element
+        print("\nEXAMPLE 1: Highlighting a single element")
+        print("-" * 60)
+        # Find the "Summary:" text
+        summary = page.find('text:contains("Summary:")')
+        print(f"Found 'Summary' text at: {summary.bbox}")
+        # Highlight it and save the image
+        summary.highlight(label="Summary Heading")
+        output_file = os.path.join(output_dir, "highlight_single.png")
+        # The keyword is show_labels; "show_show_labels" was a doubled
+        # search-and-replace artifact from the labels -> show_labels rename
+        summary.page.to_image(path=output_file, show_labels=True)
+        print(f"Saved highlighted page to: {output_file}")
+        # Clear highlights for next example
+        page.clear_highlights()
+        # EXAMPLE 2: Highlight multiple elements with automatic color cycling
+        print("\nEXAMPLE 2: Highlighting multiple elements with color cycling")
+        print("-" * 60)
+        # Find different types of elements
+        thick_lines = page.find_all('line[width>=2]')
+        headings = page.find_all('text:bold')
+        # Highlight each group with a label
+        print(f"Found {len(thick_lines)} thick lines")
+        thick_lines.highlight(label="Thick Lines")
+        print(f"Found {len(headings)} bold headings")
+        # Let's examine some of the bold headings
+        for i, h in enumerate(headings[:5]):
+            print(f"  Bold heading {i+1}: '{h.text}' at {h.bbox}")
+        headings.highlight(label="Bold Headings")
+        # Save the image with a legend
+        output_file = os.path.join(output_dir, "highlight_multiple.png")
+        page.to_image(path=output_file, show_labels=True)
+        print(f"Saved page with multiple highlights to: {output_file}")
+        # Clear highlights for next example
+        page.clear_highlights()
+        # EXAMPLE 3: Highlighting regions
+        print("\nEXAMPLE 3: Highlighting regions")
+        print("-" * 60)
+        # Find the "Summary:" text and the thick line
+        summary = page.find('text:contains("Summary:")')
+        thick_line = page.find('line[width>=2]')
+        # Create a region from Summary until the thick line
+        summary_region = summary.until('line[width>=2]', width="full")
+        print(f"Created region from Summary to thick line: {summary_region.bbox}")
+        # Highlight the region
+        summary_region.highlight(label="Summary Section")
+        # Find text within the region and highlight with a different color
+        key_elements = summary_region.find_all('text')
+        print(f"Found {len(key_elements)} text elements in the region")
+        # Only highlight a subset to avoid cluttering the image
+        for element in key_elements[:10]:
+            if "fertilizer" in element.text.lower():
+                element.highlight(label="Key Terms")
+        # Save the image with a legend
+        output_file = os.path.join(output_dir, "highlight_region.png")
+        page.to_image(path=output_file, show_labels=True)
+        print(f"Saved page with highlighted region to: {output_file}")
+        print("\nEnd of highlighting demonstration.")
+if __name__ == "__main__":
+    # A path given on the command line is used as-is; otherwise default
+    # to the example PDF in the pdfs directory
+    if len(sys.argv) >= 2:
+        pdf_path = sys.argv[1]
+        # Stop right away when the requested file is missing
+        if not os.path.exists(pdf_path):
+            print(f"File not found: {pdf_path}")
+            sys.exit(1)
+    else:
+        pdf_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
+        )
+        # Without the example PDF there is nothing to run on, so show usage
+        if not os.path.exists(pdf_path):
+            print("Example PDF not found. Please provide a path to a PDF file.")
+            print("Usage: python highlighting_example.py [path/to/file.pdf]")
+            sys.exit(1)
+    highlighting_example(pdf_path)

examples/image_width_example.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""
+Example demonstrating image width customization in to_image method.
+"""
+import os
+import sys
+# Add the parent directory to the path to import the package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+def image_width_example(pdf_path):
+    """Render the first page at several widths via ``to_image``.
+
+    Highlights the page's rects and lines, then saves four PNGs to the
+    ``output`` directory one level above this script: default width,
+    800px, 1200px, and scale 3.0 combined with a 600px width. The pixel
+    size of each returned image is printed.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+    """
+    with PDF(pdf_path) as pdf:
+        page = pdf.pages[0]
+        print(f"PDF loaded: {pdf_path}")
+        print(f"PDF has {len(pdf)} pages")
+        # Images are written to ../output relative to this file
+        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
+        os.makedirs(output_dir, exist_ok=True)
+        # Start from a clean page and highlight shapes so the renders show something
+        page.clear_highlights()
+        page.highlight_all(include_types=['rect', 'line'])
+        # One row per example: banner, output file name, extra to_image
+        # keyword arguments, and the label printed in front of the size
+        variants = (
+            ("\nEXAMPLE 1: Image with default width", "width_default.png",
+             {}, "Original image size"),
+            ("\nEXAMPLE 2: Custom width of 800px", "width_800px.png",
+             {"width": 800}, "Custom image size"),
+            ("\nEXAMPLE 3: Custom width of 1200px", "width_1200px.png",
+             {"width": 1200}, "Custom image size"),
+            # Both scale and width given (width takes precedence for final output)
+            ("\nEXAMPLE 4: Using both scale and width", "width_with_scale.png",
+             {"scale": 3.0, "width": 600}, "Scale 3.0 with width 600px"),
+        )
+        for banner, file_name, render_options, size_label in variants:
+            print(banner)
+            print("-" * 60)
+            output_file = os.path.join(output_dir, file_name)
+            img = page.to_image(path=output_file, show_labels=True, **render_options)
+            print(f"{size_label}: {img.width} x {img.height} pixels")
+            print(f"Saved to: {output_file}")
+        print("\nEnd of image width demonstration.")
+if __name__ == "__main__":
+    # Prefer a path from the command line; fall back to the example PDF
+    # in the pdfs directory when none is given
+    if len(sys.argv) >= 2:
+        pdf_path = sys.argv[1]
+        # Refuse to continue with a path that does not exist
+        if not os.path.exists(pdf_path):
+            print(f"File not found: {pdf_path}")
+            sys.exit(1)
+    else:
+        pdf_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
+        )
+        # No default PDF available: print usage and exit
+        if not os.path.exists(pdf_path):
+            print("Example PDF not found. Please provide a path to a PDF file.")
+            print("Usage: python image_width_example.py [path/to/file.pdf]")
+            sys.exit(1)
+    image_width_example(pdf_path)

examples/improved_api_example.py ADDED Viewed

@@ -0,0 +1,128 @@
+"""
+Example demonstrating the improved API consistency in natural-pdf.
+"""
+import os
+import sys
+from pathlib import Path
+# Add the parent directory to the path to import the package
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+def consistency_example(pdf_path):
+    """Walk through the consistently named API on the first page of a PDF.
+
+    Five sections are run in order, each printing its results: region
+    creation, spatial navigation (above / below / until), text extraction,
+    highlighting with a chained save, and chained exclusion registration
+    on the PDF object.
+
+    Args:
+        pdf_path: Path of the PDF file to open.
+    """
+    # Open the PDF without OCR to avoid issues
+    with PDF(pdf_path) as pdf:
+        print(f"PDF has {len(pdf)} pages")
+        page = pdf.pages[0]
+        print("\n1. IMPROVED REGION CREATION:")
+        # Create a region with intuitive named parameters
+        header_region = page.region(top=0, bottom=100)
+        print(f"  Created header region with bounds {header_region.bbox}")
+        # Create a custom region with element width
+        custom_region = page.region(
+            left=100, right=300,
+            top=200, bottom=400,
+            width="element"
+        )
+        print(f"  Created custom region with bounds {custom_region.bbox}")
+        print("\n2. IMPROVED SPATIAL NAVIGATION:")
+        # Find a major element
+        heading = page.find('text[size>=12]')
+        # Everything in this section is skipped when no such text is found
+        if heading:
+            print(f"  Found heading: '{heading.text}'")
+            # Use above/below with improved parameters
+            above_region = heading.above(height=50, width="full")
+            print(f"  Region above: {above_region.bbox}")
+            # Below with element width
+            below_region = heading.below(height=100, width="element")
+            print(f"  Region below (element width): {below_region.bbox}")
+            # Using until with consistent parameter naming
+            # NOTE(review): skip=1 presumably selects the second match - confirm against find()
+            next_heading = page.find('text[size>=12]', skip=1)
+            if next_heading:
+                print(f"  Found next heading: '{next_heading.text}'")
+                # Using the until method
+                between_region = heading.until(
+                    'text[size>=12]',
+                    include_endpoint=False,
+                    width="full"
+                )
+                # Don't use OCR for text extraction
+                print(f"  Region between headings: {between_region.bbox}")
+        print("\n3. CONSISTENT EXTRACTION PARAMETERS:")
+        # Text extraction with consistent parameters
+        text = page.extract_text(
+            preserve_whitespace=True,
+            use_exclusions=True
+        )
+        print(f"  Extracted {len(text)} characters")
+        print("\n4. CONSISTENT VISUAL METHODS:")
+        # Find and highlight elements with consistent parameters
+        lines = page.find_all('line[width>=1]')
+        if lines:
+            print(f"  Found {len(lines)} thick lines")
+            # Highlight with label first, then color
+            lines.highlight(
+                label="Thick Lines",
+                color=(1, 0, 0, 0.5)
+            )
+            # Method chaining with save
+            # NOTE(review): this highlights the same lines a second time under the
+            # same label, and the PNG path is relative to the current working directory
+            lines.highlight(
+                label="Thick Lines"
+            ).save(
+                "improved_api_lines.png",
+                show_labels=True
+            )
+        print("\n5. BUILDER PATTERN:")
+        # Create regions for exclusion
+        # NOTE(review): header and footer are never read below; the exclusions
+        # are produced by the lambdas passed to add_exclusion instead
+        header = page.region(top=0, bottom=50)
+        footer = page.region(top=page.height-50, bottom=page.height)
+        # Add exclusions with method chaining
+        pdf.add_exclusion(
+            lambda p: p.region(top=0, bottom=50),
+            label="headers"
+        ).add_exclusion(
+            lambda p: p.region(top=p.height-50, bottom=p.height),
+            label="footers"
+        )
+        # Extract with exclusions
+        filtered_text = page.extract_text(use_exclusions=True)
+        print(f"  Extracted {len(filtered_text)} characters with exclusions")
+        # Method chaining with method return
+        # The identity check below reports whether add_exclusion returned the PDF object itself
+        pdf_same = pdf.add_exclusion(lambda p: None, label="test")
+        print(f"  Method chaining returns same object: {pdf is pdf_same}")
+if __name__ == "__main__":
+    # Use the command-line path when one is supplied, else the example
+    # PDF in the pdfs directory
+    if len(sys.argv) >= 2:
+        pdf_path = sys.argv[1]
+        # A missing file is reported and ends the script
+        if not os.path.exists(pdf_path):
+            print(f"File not found: {pdf_path}")
+            sys.exit(1)
+    else:
+        pdf_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
+        )
+        # Explain how to pass a path when the example PDF is absent
+        if not os.path.exists(pdf_path):
+            print("Example PDF not found. Please provide a path to a PDF file.")
+            print("Usage: python improved_api_example.py [path/to/file.pdf]")
+            sys.exit(1)
+    consistency_example(pdf_path)

examples/layout_confidence_display_test.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""
+Test displaying confidence scores in layout highlighting.
+This example demonstrates how confidence scores are displayed next to
+each layout region in both highlight_layout and highlight_all methods.
+"""
+import os
+import sys
+import argparse
+# Add the parent directory to the Python path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+# Get the current directory of this script
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Get the parent directory (project root)
+root_dir = os.path.dirname(script_dir)
+# Default PDF path
+default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")
+# Directory that receives the rendered PNGs
+output_dir = os.path.join(root_dir, "output")
+# Set up argument parser
+parser = argparse.ArgumentParser(description="Layout confidence display test")
+parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
+parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
+args = parser.parse_args()
+print(f"Testing confidence display on: {args.pdf_path}")
+print(f"Page: {args.page}")
+# Load the PDF
+pdf = PDF(args.pdf_path)
+page = pdf.pages[args.page]
+# Run layout analysis with different models
+print("Running layout analysis...")
+page.analyze_layout(model="yolo", confidence=0.1)  # Use low confidence to get more regions
+page.analyze_layout(model="tatr", confidence=0.1, existing="append")  # Low confidence for TATR too
+print(f"Found {len(page.detected_layout_regions)} total layout regions")
+# Create the output directory before the first image is written; without
+# this the script only works if the directory already exists
+os.makedirs(output_dir, exist_ok=True)
+# Test 1: highlight_layout with default format
+print("\nTest 1: Using highlight_layout with default format")
+page.clear_highlights()
+page.highlight_layout()
+output_path = os.path.join(output_dir, "conf_display_highlight_layout.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+# Test 2: highlight_all with include_layout_regions=True
+print("\nTest 2: Using highlight_all with include_layout_regions=True")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True, layout_confidence=0.1)
+output_path = os.path.join(output_dir, "conf_display_highlight_all.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+# Test 3: highlight_all with only layout regions
+# (an empty include_types list is passed together with include_layout_regions=True)
+print("\nTest 3: Using highlight_all with only layout regions")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True, include_types=[], layout_confidence=0.1)
+output_path = os.path.join(output_dir, "conf_display_layout_only.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+print("\nDone!")

examples/layout_confidence_test.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""
+Test the layout_confidence=True behavior in highlight_all method.
+This example demonstrates that when layout_confidence=True is passed,
+all layout regions are included regardless of their confidence score.
+"""
+import os
+import sys
+import argparse
+# Add the parent directory to the Python path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from natural_pdf import PDF
+# Get the current directory of this script
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Get the parent directory (project root)
+root_dir = os.path.dirname(script_dir)
+# Default PDF path
+default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")
+# Directory that receives the rendered PNGs
+output_dir = os.path.join(root_dir, "output")
+# Set up argument parser
+parser = argparse.ArgumentParser(description="Layout confidence test")
+parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
+parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
+args = parser.parse_args()
+print(f"Testing layout_confidence=True on: {args.pdf_path}")
+print(f"Page: {args.page}")
+# Load the PDF
+pdf = PDF(args.pdf_path)
+page = pdf.pages[args.page]
+# Run layout analysis with YOLO and TATR
+print("Running layout analysis...")
+page.analyze_layout(model="yolo", confidence=0.1)  # Use low confidence to get more regions
+page.analyze_layout(model="tatr", confidence=0.1, existing="append")  # Low confidence for TATR too
+print(f"Found {len(page.detected_layout_regions)} total layout regions")
+# Count regions by confidence thresholds
+# (three non-overlapping buckets: >= 0.5, [0.2, 0.5), and < 0.2)
+high_conf = [r for r in page.detected_layout_regions if r.confidence >= 0.5]
+med_conf = [r for r in page.detected_layout_regions if 0.2 <= r.confidence < 0.5]
+low_conf = [r for r in page.detected_layout_regions if r.confidence < 0.2]
+print(f"High confidence (>=0.5): {len(high_conf)} regions")
+print(f"Medium confidence (0.2-0.5): {len(med_conf)} regions")
+print(f"Low confidence (<0.2): {len(low_conf)} regions")
+# Create the output directory before the first image is written; without
+# this the script only works if the directory already exists
+os.makedirs(output_dir, exist_ok=True)
+# Test 1: highlight_all with default layout_confidence=0.2
+print("\nTest 1: Using default layout_confidence=0.2")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True)
+output_path = os.path.join(output_dir, "layout_conf_default.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+# Test 2: highlight_all with layout_confidence=0.5 (high threshold)
+print("\nTest 2: Using layout_confidence=0.5 (high threshold)")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True, layout_confidence=0.5)
+output_path = os.path.join(output_dir, "layout_conf_high.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+# Test 3: highlight_all with layout_confidence=True (include all)
+print("\nTest 3: Using layout_confidence=True (include all)")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True, layout_confidence=True)
+output_path = os.path.join(output_dir, "layout_conf_all.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+# Test 4: highlight_all with layout_confidence=0.0 (include all)
+print("\nTest 4: Using layout_confidence=0.0 (include all)")
+page.clear_highlights()
+page.highlight_all(include_layout_regions=True, layout_confidence=0.0)
+output_path = os.path.join(output_dir, "layout_conf_zero.png")
+page.to_image(path=output_path, show_labels=True)
+print(f"Saved to {output_path}")
+print("\nDone!")
+print("\nDone!")