natural-pdf 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
+ """Test that the fix for region exclusions works"""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Check initial text
+ print("Initial text:")
+ print(page.extract_text()[:200])
+
+ # Add exclusions
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)").expand())
+
+ # Test region extraction with exclusions
+ print("\n\nRegion in excluded area (0, 0, 200, 50):")
+ excluded_region = page.region(0, 0, 200, 50)
+ print("Without exclusions:", repr(excluded_region.extract_text(apply_exclusions=False)))
+ print("With exclusions:", repr(excluded_region.extract_text(apply_exclusions=True)))
+
+ # Now test the full table extraction
+ print("\n\nFull table extraction:")
+ headers = (
+     page
+     .find(text="NUMBER")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
+ df = result.to_df()
+
+ print(f"Shape: {df.shape}")
+ print("\nFirst row:")
+ for col, val in df.iloc[0].items():
+     print(f" {repr(col)}: {repr(val)}")
+
+ # Check if excluded content is in the table
+ table_str = df.to_string()
+ has_feb = "FEBRUARY" in table_str or ("FEBR" in table_str and "RUARY" in table_str)
+ has_alphabetic = "ALPHABETIC LISTING" in table_str
+
+ print(f"\nContains 'FEBRUARY': {has_feb}")
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+
+ if has_feb or has_alphabetic:
+     print("\n❌ FAILED: Exclusions not working properly")
+ else:
+     print("\n✅ SUCCESS: Exclusions working correctly!")
@@ -0,0 +1,67 @@
+ """Test PDF-level exclusions with proper Region returns."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+
+ # Add PDF-level exclusions that properly return Regions
+ print("Adding fixed PDF-level exclusions...")
+ pdf.add_exclusion(
+     lambda page: page.find("text:contains(PREMISE)").above()
+     if page.find("text:contains(PREMISE)") else None,
+     label="header_exclusion"
+ )
+ pdf.add_exclusion(
+     lambda page: page.find(r"text:regex(Page \d+ of)").expand(10)
+     if page.find(r"text:regex(Page \d+ of)") else None,
+     label="footer_exclusion"
+ )
+
+ page = pdf.pages[0]
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table with apply_exclusions=True (default)
+ table_result = guides.extract_table(include_outer_boundaries=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+     # Show what's in the first row/column that should be excluded
+     print("\nFirst column values (should not have header text):")
+     print(df.iloc[:5, 0])
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied")
+
+ # Debug: Check exclusion regions
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
+ for i, exc in enumerate(exclusions):
+     print(f" Exclusion {i}: {exc.bbox} (label: {exc.label})")
@@ -0,0 +1,53 @@
+ """Test horizontal guides with top/bottom alignment on real PDF."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Find some row headers to use as markers
+ # First, let's find what text elements exist
+ all_text = page.find_all('text')[:20]
+ print("First 20 text elements:")
+ for t in all_text:
+     print(f" '{t.text}' at y={t.top:.2f}")
+
+ # Look for rows containing numbers in the first column
+ rows = page.find_all('text[x0<100]')[:5]  # Get elements in left column
+
+ print("Found rows:")
+ for i, row in enumerate(rows):
+     print(f" {i}: '{row.text}' at y={row.top:.2f}-{row.bottom:.2f}")
+
+ # Test with align='top' (should use top edge of each row)
+ guides_top = Guides(page)
+ guides_top.horizontal.from_content(rows, align='top', outer=False)
+
+ print(f"\nHorizontal guides with align='top': {sorted(guides_top.horizontal)}")
+ print("Expected: top edges of each row")
+
+ # Test with align='bottom' (should use bottom edge of each row)
+ guides_bottom = Guides(page)
+ guides_bottom.horizontal.from_content(rows, align='bottom', outer=False)
+
+ print(f"\nHorizontal guides with align='bottom': {sorted(guides_bottom.horizontal)}")
+ print("Expected: bottom edges of each row")
+
+ # Verify they're different
+ if guides_top.horizontal != guides_bottom.horizontal:
+     print("\n✅ SUCCESS: top and bottom alignment produce different guides")
+ else:
+     print("\n❌ FAILED: top and bottom alignment produced the same guides")
+
+ # Test the class method too
+ guides_class_top = Guides.from_content(page, axis='horizontal', markers=rows, align='top', outer=False)
+ guides_class_bottom = Guides.from_content(page, axis='horizontal', markers=rows, align='bottom', outer=False)
+
+ print(f"\nClass method with top: {sorted(guides_class_top.horizontal)}")
+ print(f"Class method with bottom: {sorted(guides_class_bottom.horizontal)}")
+
+ if guides_class_top.horizontal == guides_top.horizontal and guides_class_bottom.horizontal == guides_bottom.horizontal:
+     print("\n✅ SUCCESS: Class method produces same results as instance method")
+ else:
+     print("\n❌ FAILED: Class method produces different results")
@@ -0,0 +1,45 @@
+ """Test marker ordering with a real PDF."""
+ from natural_pdf import PDF
+
+ # Load a test PDF
+ pdf = PDF("pdfs/01-practice.pdf")
+ page = pdf.pages[0]
+
+ # Find some text elements to use as markers
+ all_text = page.find_all("text")
+ print("Sample text elements:")
+ for i, elem in enumerate(all_text[:10]):
+     print(f"{i}: '{elem.text}' at x={elem.x0:.1f}")
+
+ # Create guides with markers in specific order
+ print("\n--- Testing vertical guides with markers ---")
+
+ # Let's find specific text elements at different positions
+ violations_elem = page.find('text:contains("Violations")')
+ date_elem = page.find('text:contains("Date")')
+ total_elem = page.find('text:contains("Total")')
+
+ if violations_elem and date_elem and total_elem:
+     print(f"\nElement positions:")
+     print(f"'Violations' at x={violations_elem.x0:.1f}")
+     print(f"'Date' at x={date_elem.x0:.1f}")
+     print(f"'Total' at x={total_elem.x0:.1f}")
+
+     # Create guides with markers in a specific order
+     guides = page.guides.from_content(
+         axis="vertical",
+         markers=["Violations", "Date", "Total"],
+         align="left",
+         outer=True
+     )
+
+     print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
+     print(f"Page bounds: {page.bbox}")
+
+     # Show visually
+     page.guides.vertical.from_content(
+         markers=["Violations", "Date", "Total"],
+         align="left",
+         outer=True
+     )
+     page.show(guides=True)
@@ -0,0 +1,56 @@
+ """Test that the original exclusion lambdas (returning Elements) now work."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+
+ # Add the ORIGINAL PDF-level exclusions that return Elements (not Regions)
+ print("Adding original PDF-level exclusions (returning Elements)...")
+ pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
+
+ page = pdf.pages[0]
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table
+ table_result = guides.extract_table(include_outer_boundaries=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied with Element returns!")
+
+ # Debug: Check exclusion regions with verbose output
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
+ for i, exc in enumerate(exclusions):
+     print(f" Exclusion {i}: {exc.bbox} (label: {getattr(exc, 'label', 'None')})")
@@ -0,0 +1,84 @@
+ """Test that PDF-level exclusions work with guides.extract_table()."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Add PDF-level exclusions
+ print("Adding PDF-level exclusions...")
+ pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
+ pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
+
+ # Create guides and extract table
+ print("\nCreating guides and extracting table...")
+ headers = (
+     page
+     .find("text:contains(NUMBER)")
+     .right(include_source=True)
+     .expand(top=3, bottom=3)
+     .find_all('text')
+ )
+
+ guides = Guides(page)
+ guides.vertical.from_content(headers, align='left')
+ guides.horizontal.from_stripes()
+
+ # Extract table with apply_exclusions=True (default)
+ table_result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True)
+ df = table_result.to_df()
+
+ print(f"\nTable shape: {df.shape}")
+ print("\nFirst few rows:")
+ print(df.head())
+
+ # Check if excluded content is present
+ print("\nChecking for excluded content...")
+ table_text = df.to_string()
+ has_alphabetic = "ALPHABETIC LISTING" in table_text
+ has_page_num = "Page 1 of" in table_text or ("Page" in table_text and "of" in table_text)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
+ print(f"Contains 'Page X of Y': {has_page_num}")
+
+ if has_alphabetic or has_page_num:
+     print("\n❌ FAILED: Exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Exclusions properly applied")
+
+ # Now test with page-level exclusions for comparison
+ print("\n\nTesting with page-level exclusions...")
+ page2 = pdf.pages[0]
+ header = page2.find("text:contains(PREMISE)").above()
+ footer = page2.find(r"text:regex(Page \d+ of)")
+
+ if header:
+     page2.add_exclusion(header)
+ if footer:
+     page2.add_exclusion(footer)
+
+ guides2 = Guides(page2)
+ guides2.vertical.from_content(headers, align='left')
+ guides2.horizontal.from_stripes()
+
+ table_result2 = guides2.extract_table(include_outer_boundaries=True)
+ df2 = table_result2.to_df()
+
+ print(f"\nTable shape with page exclusions: {df2.shape}")
+ table_text2 = df2.to_string()
+ has_alphabetic2 = "ALPHABETIC LISTING" in table_text2
+ has_page_num2 = "Page 1 of" in table_text2 or ("Page" in table_text2 and "of" in table_text2)
+
+ print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic2}")
+ print(f"Contains 'Page X of Y': {has_page_num2}")
+
+ if has_alphabetic2 or has_page_num2:
+     print("\n❌ FAILED: Page exclusions not properly applied")
+ else:
+     print("\n✅ SUCCESS: Page exclusions properly applied")
+
+ # Debug: Check exclusion regions
+ print("\n\nDebug: Checking exclusion regions...")
+ exclusions = page._get_exclusion_regions(debug=True)
+ print(f"\nTotal exclusion regions: {len(exclusions)}")
@@ -0,0 +1,25 @@
+ """Test region exclusions with detailed debugging"""
+ from natural_pdf import PDF
+
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Add exclusion
+ pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
+
+ # Get page exclusions
+ print("Page exclusions:")
+ print(f"page._exclusions: {page._exclusions}")
+ print(f"pdf._exclusions: {pdf._exclusions}")
+
+ # Create a region in the excluded area
+ test_region = page.region(0, 0, 200, 50)
+ print(f"\nTest region: {test_region.bbox}")
+ print(f"Region's page: {test_region.page}")
+ print(f"Region's _page: {test_region._page}")
+ print(f"Region's _page._exclusions: {test_region._page._exclusions}")
+
+ # Try extraction with debug
+ print("\nExtracting with debug=True:")
+ text = test_region.extract_text(apply_exclusions=True, debug=True)
+ print(f"Result: '{text}'")
@@ -0,0 +1,62 @@
+ """Test from_stripes with a real PDF that has striped rows."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Load the PDF (assuming m27.pdf has the striped table)
+ pdf = PDF("pdfs/m27.pdf")
+ page = pdf.pages[0]
+
+ # Test 1: Manual selection with specific color
+ print("Test 1: Manual selection with color='#00ffff'")
+ guides = Guides(page)
+ guides.horizontal.from_stripes(color='#00ffff')
+
+ print(f"Found {len(guides.horizontal)} horizontal guides")
+ if guides.horizontal:
+     print(f"Guide range: {min(guides.horizontal):.2f} to {max(guides.horizontal):.2f}")
+     print(f"First 5 guides: {sorted(guides.horizontal)[:5]}")
+
+ # Test 2: Auto-detect most common stripe color
+ print("\nTest 2: Auto-detect stripes")
+ guides2 = Guides(page)
+ guides2.horizontal.from_stripes()
+
+ print(f"Found {len(guides2.horizontal)} horizontal guides")
+ if guides2.horizontal:
+     print(f"Guide range: {min(guides2.horizontal):.2f} to {max(guides2.horizontal):.2f}")
+
+ # Test 3: Manual selection of stripes
+ print("\nTest 3: Manual selection of stripe elements")
+ cyan_stripes = page.find_all('rect[fill=#00ffff]')
+ print(f"Found {len(cyan_stripes)} cyan rectangles")
+
+ if cyan_stripes:
+     guides3 = Guides(page)
+     guides3.horizontal.from_stripes(cyan_stripes)
+     print(f"Created {len(guides3.horizontal)} guides from stripes")
+
+     # Show how this captures both edges of each stripe
+     print("\nFirst stripe edges:")
+     first_stripe = cyan_stripes[0]
+     print(f" Stripe at y={first_stripe.top:.2f} to {first_stripe.bottom:.2f}")
+     print(f" Guides include: {first_stripe.top:.2f} in guides? {first_stripe.top in guides3.horizontal}")
+     print(f" Guides include: {first_stripe.bottom:.2f} in guides? {first_stripe.bottom in guides3.horizontal}")
+
+ # Test 4: Compare with traditional approach
+ print("\nComparison with traditional from_content approach:")
+ # Traditional way would only get one edge per stripe
+ guides4 = Guides(page)
+ guides4.horizontal.from_content(cyan_stripes, align='top', outer=False)
+ print(f"from_content (top only): {len(guides4.horizontal)} guides")
+
+ guides5 = Guides(page)
+ guides5.horizontal.from_content(cyan_stripes, align='bottom', outer=False)
+ print(f"from_content (bottom only): {len(guides5.horizontal)} guides")
+
+ print(f"from_stripes (both edges): {len(guides3.horizontal)} guides")
+
+ # Verify we get approximately 2x guides with from_stripes
+ if len(cyan_stripes) > 0:
+     expected_guides = len(set([s.top for s in cyan_stripes] + [s.bottom for s in cyan_stripes]))
+     print(f"\nExpected unique edges: {expected_guides}")
+     print(f"Actual from_stripes: {len(guides3.horizontal)}")
@@ -0,0 +1,55 @@
+ """Test from_stripes with vertical stripes (column backgrounds)."""
+ from natural_pdf import PDF
+ from natural_pdf.analyzers.guides import Guides
+
+ # Create a mock page with vertical stripes (simulating column backgrounds)
+ class MockRect:
+     def __init__(self, fill, x0, x1, top, bottom):
+         self.fill = fill
+         self.x0 = x0
+         self.x1 = x1
+         self.top = top
+         self.bottom = bottom
+
+ class MockPage:
+     def __init__(self):
+         self.bbox = (0, 0, 800, 600)
+         # Create vertical stripes for alternating columns
+         self.stripes = [
+             MockRect('#f0f0f0', 100, 200, 0, 600),  # Column 1 background
+             MockRect('#f0f0f0', 300, 400, 0, 600),  # Column 3 background
+             MockRect('#f0f0f0', 500, 600, 0, 600),  # Column 5 background
+         ]
+
+     def find_all(self, selector):
+         if selector == 'rect[fill=#f0f0f0]':
+             return self.stripes
+         elif selector == 'rect[fill]':
+             # Include some other rects too
+             return self.stripes + [
+                 MockRect('#ffffff', 0, 100, 0, 600),
+                 MockRect('#ffffff', 200, 300, 0, 600),
+             ]
+         return []
+
+ # Test vertical stripes
+ page = MockPage()
+ guides = Guides(page)
+
+ print("Testing vertical stripes (column backgrounds)")
+ guides.vertical.from_stripes(color='#f0f0f0')
+
+ print(f"\nFound {len(guides.vertical)} vertical guides")
+ print(f"Guides at: {sorted(guides.vertical)}")
+
+ # Verify we got both edges of each stripe
+ expected = [100, 200, 300, 400, 500, 600]
+ print(f"\nExpected: {expected}")
+ print(f"Match: {sorted(guides.vertical) == expected}")
+
+ # Test auto-detection
+ guides2 = Guides(page)
+ guides2.vertical.from_stripes()  # Should auto-detect the gray stripes
+
+ print(f"\nAuto-detect found {len(guides2.vertical)} guides")
+ print(f"Same result: {sorted(guides2.vertical) == sorted(guides.vertical)}")