natural-pdf 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
+ """Test that the fix for region exclusions works"""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Check initial text
+ print("Initial text:")
+ print(page.extract_text()[:200])
+
+ # Add exclusions
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)").expand())
+
+ # Test region extraction with exclusions
+ print("\n\nRegion in excluded area (0, 0, 200, 50):")
+ excluded_region = page.region(0, 0, 200, 50)
+ print("Without exclusions:", repr(excluded_region.extract_text(apply_exclusions=False)))
+ print("With exclusions:", repr(excluded_region.extract_text(apply_exclusions=True)))
+
+ # Now test the full table extraction
+ print("\n\nFull table extraction:")
+ headers = (
+     page
+     .find(text="NUMBER")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
+ df = result.to_df()
+
+ print(f"Shape: {df.shape}")
+ print("\nFirst row:")
+ for col, val in df.iloc[0].items():
+     print(f" {repr(col)}: {repr(val)}")
+
+ # Check if excluded content is in the table
+ table_str = df.to_string()
+ has_feb = "FEBRUARY" in table_str or ("FEBR" in table_str and "RUARY" in table_str)
+ has_alphabetic = "ALPHABETIC LISTING" in table_str
+
+ print(f"\nContains 'FEBRUARY': {has_feb}")
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+
+ if has_feb or has_alphabetic:
+     print("\n❌ FAILED: Exclusions not working properly")
+ else:
+     print("\n✅ SUCCESS: Exclusions working correctly!")
@@ -0,0 +1,67 @@
+ """Test PDF-level exclusions with proper Region returns."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+
+ # Add PDF-level exclusions that properly return Regions
+ print("Adding fixed PDF-level exclusions...")
+ pdf.add_exclusion(
+     lambda page: page.find("text:contains(PREMISE)").above()
+     if page.find("text:contains(PREMISE)") else None,
+     label="header_exclusion"
+ )
+ pdf.add_exclusion(
+     lambda page: page.find(r"text:regex(Page \d+ of)").expand(10)
+     if page.find(r"text:regex(Page \d+ of)") else None,
+     label="footer_exclusion"
+ )
+
+ page = pdf.pages[0]
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table with apply_exclusions=True (default)
+ table_result = guides.extract_table(include_outer_boundaries=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+     # Show what's in the first row/column that should be excluded
+     print("\nFirst column values (should not have header text):")
+     print(df.iloc[:5, 0])
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied")
+
+ # Debug: Check exclusion regions
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
+ for i, exc in enumerate(exclusions):
+     print(f" Exclusion {i}: {exc.bbox} (label: {exc.label})")
@@ -0,0 +1,53 @@
+ """Test horizontal guides with top/bottom alignment on real PDF."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Find some row headers to use as markers
+ # First, let's find what text elements exist
+ all_text = page.find_all('text')[:20]
+ print("First 20 text elements:")
+ for t in all_text:
+     print(f" '{t.text}' at y={t.top:.2f}")
+
+ # Look for rows containing numbers in the first column
+ rows = page.find_all('text[x0<100]')[:5]  # Get elements in left column
+
+ print("Found rows:")
+ for i, row in enumerate(rows):
+     print(f" {i}: '{row.text}' at y={row.top:.2f}-{row.bottom:.2f}")
+
+ # Test with align='top' (should use top edge of each row)
+ guides_top = Guides(page)
+ guides_top.horizontal.from_content(rows, align='top', outer=False)
+
+ print(f"\nHorizontal guides with align='top': {sorted(guides_top.horizontal)}")
+ print("Expected: top edges of each row")
+
+ # Test with align='bottom' (should use bottom edge of each row)
+ guides_bottom = Guides(page)
+ guides_bottom.horizontal.from_content(rows, align='bottom', outer=False)
+
+ print(f"\nHorizontal guides with align='bottom': {sorted(guides_bottom.horizontal)}")
+ print("Expected: bottom edges of each row")
+
+ # Verify they're different
+ if guides_top.horizontal != guides_bottom.horizontal:
+     print("\n✅ SUCCESS: top and bottom alignment produce different guides")
+ else:
+     print("\n❌ FAILED: top and bottom alignment produced the same guides")
+
+ # Test the class method too
+ guides_class_top = Guides.from_content(page, axis='horizontal', markers=rows, align='top', outer=False)
+ guides_class_bottom = Guides.from_content(page, axis='horizontal', markers=rows, align='bottom', outer=False)
+
+ print(f"\nClass method with top: {sorted(guides_class_top.horizontal)}")
+ print(f"Class method with bottom: {sorted(guides_class_bottom.horizontal)}")
+
+ if guides_class_top.horizontal == guides_top.horizontal and guides_class_bottom.horizontal == guides_bottom.horizontal:
+     print("\n✅ SUCCESS: Class method produces same results as instance method")
+ else:
+     print("\n❌ FAILED: Class method produces different results")
@@ -0,0 +1,45 @@
+ """Test marker ordering with a real PDF."""
+ from natural_pdf import PDF
+
+ # Load a test PDF
+ pdf = PDF("pdfs/01-practice.pdf")
+ page = pdf.pages[0]
+
+ # Find some text elements to use as markers
+ all_text = page.find_all("text")
+ print("Sample text elements:")
+ for i, elem in enumerate(all_text[:10]):
+     print(f"{i}: '{elem.text}' at x={elem.x0:.1f}")
+
+ # Create guides with markers in specific order
+ print("\n--- Testing vertical guides with markers ---")
+
+ # Let's find specific text elements at different positions
+ violations_elem = page.find('text:contains("Violations")')
+ date_elem = page.find('text:contains("Date")')
+ total_elem = page.find('text:contains("Total")')
+
+ if violations_elem and date_elem and total_elem:
+     print(f"\nElement positions:")
+     print(f"'Violations' at x={violations_elem.x0:.1f}")
+     print(f"'Date' at x={date_elem.x0:.1f}")
+     print(f"'Total' at x={total_elem.x0:.1f}")
+
+     # Create guides with markers in a specific order
+     guides = page.guides.from_content(
+         axis="vertical",
+         markers=["Violations", "Date", "Total"],
+         align="left",
+         outer=True
+     )
+
+     print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
+     print(f"Page bounds: {page.bbox}")
+
+     # Show visually
+     page.guides.vertical.from_content(
+         markers=["Violations", "Date", "Total"],
+         align="left",
+         outer=True
+     )
+     page.show(guides=True)
@@ -0,0 +1,56 @@
+ """Test that the original exclusion lambdas (returning Elements) now work."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+
+ # Add the ORIGINAL PDF-level exclusions that return Elements (not Regions)
+ print("Adding original PDF-level exclusions (returning Elements)...")
+ pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
+
+ page = pdf.pages[0]
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table
+ table_result = guides.extract_table(include_outer_boundaries=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied with Element returns!")
+
+ # Debug: Check exclusion regions with verbose output
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
+ for i, exc in enumerate(exclusions):
+     print(f" Exclusion {i}: {exc.bbox} (label: {getattr(exc, 'label', 'None')})")
@@ -0,0 +1,84 @@
+ """Test that PDF-level exclusions work with guides.extract_table()."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Add PDF-level exclusions
+ print("Adding PDF-level exclusions...")
+ pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table with apply_exclusions=True (default)
+ table_result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied")
+
+ # Now test with page-level exclusions for comparison
+ print("\n\nTesting with page-level exclusions...")
+ page2 = pdf.pages[0]
+ header = page2.find("text:contains(PREMISE)").above()
+ footer = page2.find(r"text:regex(Page \d+ of)")
+
+ if header:
+     page2.add_exclusion(header)
+ if footer:
+     page2.add_exclusion(footer)
+
+ guides2 = Guides(page2)
+ guides2.vertical.from_content(headers, align='left')
+ guides2.horizontal.from_stripes()
+
+ table_result2 = guides2.extract_table(include_outer_boundaries=True)
+ df2 = table_result2.to_df()
+
+ print(f"\nTable shape with page exclusions: {df2.shape}")
+ table_text2 = df2.to_string()
+ has_alphabetic2 = "ALPHABETIC LISTING" in table_text2
+ has_page_num2 = "Page 1 of" in table_text2 or ("Page" in table_text2 and "of" in table_text2)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic2}")
+ print(f"Contains 'Page X of Y': {has_page_num2}")
+
+ if has_alphabetic2 or has_page_num2:
+     print("\n❌ FAILED: Page exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Page exclusions properly applied")
+
+ # Debug: Check exclusion regions
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
@@ -0,0 +1,25 @@
+ """Test region exclusions with detailed debugging"""
+ from natural_pdf import PDF
+
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Add exclusion
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
+
+ # Get page exclusions
+ print("Page exclusions:")
+ print(f"page._exclusions: {page._exclusions}")
+ print(f"pdf._exclusions: {pdf._exclusions}")
+
+ # Create a region in the excluded area
+ test_region = page.region(0, 0, 200, 50)
+ print(f"\nTest region: {test_region.bbox}")
+ print(f"Region's page: {test_region.page}")
+ print(f"Region's _page: {test_region._page}")
+ print(f"Region's _page._exclusions: {test_region._page._exclusions}")
+
+ # Try extraction with debug
+ print("\nExtracting with debug=True:")
+ text = test_region.extract_text(apply_exclusions=True, debug=True)
+ print(f"Result: '{text}'")
@@ -0,0 +1,62 @@
+ """Test from_stripes with a real PDF that has striped rows."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF (assuming m27.pdf has the striped table)
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Test 1: Manual selection with specific color
+ print("Test 1: Manual selection with color='#00ffff'")
+ guides = Guides(page)
+ guides.horizontal.from_stripes(color='#00ffff')
+
+ print(f"Found {len(guides.horizontal)} horizontal guides")
+ if guides.horizontal:
+     print(f"Guide range: {min(guides.horizontal):.2f} to {max(guides.horizontal):.2f}")
+     print(f"First 5 guides: {sorted(guides.horizontal)[:5]}")
+
+ # Test 2: Auto-detect most common stripe color
+ print("\nTest 2: Auto-detect stripes")
+ guides2 = Guides(page)
+ guides2.horizontal.from_stripes()
+
+ print(f"Found {len(guides2.horizontal)} horizontal guides")
+ if guides2.horizontal:
+     print(f"Guide range: {min(guides2.horizontal):.2f} to {max(guides2.horizontal):.2f}")
+
+ # Test 3: Manual selection of stripes
+ print("\nTest 3: Manual selection of stripe elements")
+ cyan_stripes = page.find_all('rect[fill=#00ffff]')
+ print(f"Found {len(cyan_stripes)} cyan rectangles")
+
+ if cyan_stripes:
+     guides3 = Guides(page)
+     guides3.horizontal.from_stripes(cyan_stripes)
+     print(f"Created {len(guides3.horizontal)} guides from stripes")
+
+     # Show how this captures both edges of each stripe
+     print("\nFirst stripe edges:")
+     first_stripe = cyan_stripes[0]
+     print(f" Stripe at y={first_stripe.top:.2f} to {first_stripe.bottom:.2f}")
+     print(f" Guides include: {first_stripe.top:.2f} in guides? {first_stripe.top in guides3.horizontal}")
+     print(f" Guides include: {first_stripe.bottom:.2f} in guides? {first_stripe.bottom in guides3.horizontal}")
+
+ # Test 4: Compare with traditional approach
+ print("\nComparison with traditional from_content approach:")
+ # Traditional way would only get one edge per stripe
+ guides4 = Guides(page)
+ guides4.horizontal.from_content(cyan_stripes, align='top', outer=False)
+ print(f"from_content (top only): {len(guides4.horizontal)} guides")
+
+ guides5 = Guides(page)
+ guides5.horizontal.from_content(cyan_stripes, align='bottom', outer=False)
+ print(f"from_content (bottom only): {len(guides5.horizontal)} guides")
+
+ print(f"from_stripes (both edges): {len(guides3.horizontal)} guides")
+
+ # Verify we get approximately 2x guides with from_stripes
+ if len(cyan_stripes) > 0:
+     expected_guides = len(set([s.top for s in cyan_stripes] + [s.bottom for s in cyan_stripes]))
+     print(f"\nExpected unique edges: {expected_guides}")
+     print(f"Actual from_stripes: {len(guides3.horizontal)}")
@@ -0,0 +1,55 @@
+ """Test from_stripes with vertical stripes (column backgrounds)."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Create a mock page with vertical stripes (simulating column backgrounds)
+ class MockRect:
+     def __init__(self, fill, x0, x1, top, bottom):
+         self.fill = fill
+         self.x0 = x0
+         self.x1 = x1
+         self.top = top
+         self.bottom = bottom
+
+ class MockPage:
+     def __init__(self):
+         self.bbox = (0, 0, 800, 600)
+         # Create vertical stripes for alternating columns
+         self.stripes = [
+             MockRect('#f0f0f0', 100, 200, 0, 600),  # Column 1 background
+             MockRect('#f0f0f0', 300, 400, 0, 600),  # Column 3 background
+             MockRect('#f0f0f0', 500, 600, 0, 600),  # Column 5 background
+         ]
+
+     def find_all(self, selector):
+         if selector == 'rect[fill=#f0f0f0]':
+             return self.stripes
+         elif selector == 'rect[fill]':
+             # Include some other rects too
+             return self.stripes + [
+                 MockRect('#ffffff', 0, 100, 0, 600),
+                 MockRect('#ffffff', 200, 300, 0, 600),
+             ]
+         return []
+
+ # Test vertical stripes
+ page = MockPage()
+ guides = Guides(page)
+
+ print("Testing vertical stripes (column backgrounds)")
+ guides.vertical.from_stripes(color='#f0f0f0')
+
+ print(f"\nFound {len(guides.vertical)} vertical guides")
+ print(f"Guides at: {sorted(guides.vertical)}")
+
+ # Verify we got both edges of each stripe
+ expected = [100, 200, 300, 400, 500, 600]
+ print(f"\nExpected: {expected}")
+ print(f"Match: {sorted(guides.vertical) == expected}")
+
+ # Test auto-detection
+ guides2 = Guides(page)
+ guides2.vertical.from_stripes()  # Should auto-detect the gray stripes
+
+ print(f"\nAuto-detect found {len(guides2.vertical)} guides")
+ print(f"Same result: {sorted(guides2.vertical) == sorted(guides.vertical)}")