natural-pdf 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. natural_pdf/analyzers/guides.py +196 -43
  2. natural_pdf/core/highlighting_service.py +40 -10
  3. natural_pdf/core/page.py +56 -8
  4. natural_pdf/elements/base.py +15 -1
  5. natural_pdf/elements/region.py +37 -5
  6. natural_pdf/vision/__init__.py +1 -2
  7. natural_pdf/vision/mixin.py +67 -27
  8. natural_pdf/vision/results.py +49 -5
  9. natural_pdf/vision/similarity.py +195 -23
  10. natural_pdf/vision/template_matching.py +209 -0
  11. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
  12. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +36 -15
  13. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +1 -0
  14. temp/fix_page_exclusions.py +42 -0
  15. temp/test_draw_guides.py +25 -0
  16. temp/test_draw_guides_interactive.py +30 -0
  17. temp/test_exclusion_with_debug.py +30 -0
  18. temp/test_find_exclusions_fix.py +53 -0
  19. temp/test_find_exclusions_fix_no_recursion.py +97 -0
  20. temp/test_fix_real_pdf.py +48 -0
  21. temp/test_fix_working.py +55 -0
  22. temp/test_fixed_pdf_exclusions.py +67 -0
  23. temp/test_guide_draw_notebook.py +47 -0
  24. temp/test_horizontal_top_bottom.py +53 -0
  25. temp/test_inline_js.py +22 -0
  26. temp/test_marker_order.py +45 -0
  27. temp/test_original_exclusions_now_work.py +56 -0
  28. temp/test_pdf_exclusions_with_guides.py +84 -0
  29. temp/test_region_exclusions_detailed.py +25 -0
  30. temp/test_stripes_real_pdf.py +62 -0
  31. temp/test_vertical_stripes.py +55 -0
  32. temp/test_widget_functionality.py +68 -0
  33. temp/test_widget_simple.py +41 -0
  34. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
  35. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
  36. {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,84 @@
"""Test that PDF-level exclusions work with guides.extract_table()."""
import re

from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

# Same pattern as the footer exclusion selector below, so the check looks for
# exactly the text the exclusion is meant to remove (instead of any table that
# merely contains the words "Page" and "of" somewhere).
PAGE_MARKER = re.compile(r"Page \d+ of")


def has_page_marker(text):
    """Return True when *text* contains a 'Page <n> of' footer marker."""
    return PAGE_MARKER.search(text) is not None


# Load the PDF
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Add PDF-level exclusions
print("Adding PDF-level exclusions...")
pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
# Raw string: "\d" in a normal literal is an invalid escape sequence.
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))

# Create guides and extract table
print("\nCreating guides and extracting table...")
headers = (
    page
    .find("text:contains(NUMBER)")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')
)

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

# Extract table with apply_exclusions=True (default)
table_result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True)
df = table_result.to_df()

print(f"\nTable shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())

# Check if excluded content is present
print("\nChecking for excluded content...")
table_text = df.to_string()
has_alphabetic = "ALPHABETIC LISTING" in table_text
has_page_num = has_page_marker(table_text)

print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
print(f"Contains 'Page X of Y': {has_page_num}")

if has_alphabetic or has_page_num:
    print("\n❌ FAILED: Exclusions not properly applied")
else:
    print("\n✅ SUCCESS: Exclusions properly applied")

# Now test with page-level exclusions for comparison
# NOTE(review): page2 comes from the same `pdf` object that already carries
# the PDF-level exclusions added above, so this section may not exercise
# page-level exclusions in isolation - confirm.
print("\n\nTesting with page-level exclusions...")
page2 = pdf.pages[0]
header = page2.find("text:contains(PREMISE)").above()
footer = page2.find(r"text:regex(Page \d+ of)")

if header:
    page2.add_exclusion(header)
if footer:
    page2.add_exclusion(footer)

guides2 = Guides(page2)
guides2.vertical.from_content(headers, align='left')
guides2.horizontal.from_stripes()

table_result2 = guides2.extract_table(include_outer_boundaries=True)
df2 = table_result2.to_df()

print(f"\nTable shape with page exclusions: {df2.shape}")
table_text2 = df2.to_string()
has_alphabetic2 = "ALPHABETIC LISTING" in table_text2
has_page_num2 = has_page_marker(table_text2)

print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic2}")
print(f"Contains 'Page X of Y': {has_page_num2}")

if has_alphabetic2 or has_page_num2:
    print("\n❌ FAILED: Page exclusions not properly applied")
else:
    print("\n✅ SUCCESS: Page exclusions properly applied")

# Debug: Check exclusion regions
print("\n\nDebug: Checking exclusion regions...")
exclusions = page._get_exclusion_regions(debug=True)
print(f"\nTotal exclusion regions: {len(exclusions)}")
@@ -0,0 +1,25 @@
"""Test region exclusions with detailed debugging"""
from natural_pdf import PDF

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Add exclusion (registered on the PDF, not on the page)
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")

# Get page exclusions: dump both the page-level and the PDF-level lists so
# it is visible where the exclusion registered above ended up.
print("Page exclusions:")
print(f"page._exclusions: {page._exclusions}")
print(f"pdf._exclusions: {pdf._exclusions}")

# Create a region in the excluded area
test_region = page.region(0, 0, 200, 50)
print(f"\nTest region: {test_region.bbox}")
print(f"Region's page: {test_region.page}")
print(f"Region's _page: {test_region._page}")
print(f"Region's _page._exclusions: {test_region._page._exclusions}")

# Try extraction with debug
print("\nExtracting with debug=True:")
text = test_region.extract_text(apply_exclusions=True, debug=True)
print(f"Result: '{text}'")
@@ -0,0 +1,62 @@
"""Test from_stripes with a real PDF that has striped rows."""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

# Load the PDF (assuming m27.pdf has the striped table)
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Test 1: Manual selection with specific color
print("Test 1: Manual selection with color='#00ffff'")
guides = Guides(page)
guides.horizontal.from_stripes(color='#00ffff')

print(f"Found {len(guides.horizontal)} horizontal guides")
if guides.horizontal:
    print(f"Guide range: {min(guides.horizontal):.2f} to {max(guides.horizontal):.2f}")
    print(f"First 5 guides: {sorted(guides.horizontal)[:5]}")

# Test 2: Auto-detect most common stripe color
print("\nTest 2: Auto-detect stripes")
guides2 = Guides(page)
guides2.horizontal.from_stripes()

print(f"Found {len(guides2.horizontal)} horizontal guides")
if guides2.horizontal:
    print(f"Guide range: {min(guides2.horizontal):.2f} to {max(guides2.horizontal):.2f}")

# Test 3: Manual selection of stripes
print("\nTest 3: Manual selection of stripe elements")
cyan_stripes = page.find_all('rect[fill=#00ffff]')
print(f"Found {len(cyan_stripes)} cyan rectangles")

# NOTE(review): the original nesting was lost in the diff rendering. Everything
# that reads `guides3` is kept under this guard so the script cannot hit a
# NameError when no cyan rectangles are found - confirm against the source.
if cyan_stripes:
    guides3 = Guides(page)
    guides3.horizontal.from_stripes(cyan_stripes)
    print(f"Created {len(guides3.horizontal)} guides from stripes")

    # Show how this captures both edges of each stripe
    print("\nFirst stripe edges:")
    first_stripe = cyan_stripes[0]
    print(f" Stripe at y={first_stripe.top:.2f} to {first_stripe.bottom:.2f}")
    print(f" Guides include: {first_stripe.top:.2f} in guides? {first_stripe.top in guides3.horizontal}")
    print(f" Guides include: {first_stripe.bottom:.2f} in guides? {first_stripe.bottom in guides3.horizontal}")

    # Test 4: Compare with traditional approach
    print("\nComparison with traditional from_content approach:")
    # Traditional way would only get one edge per stripe
    guides4 = Guides(page)
    guides4.horizontal.from_content(cyan_stripes, align='top', outer=False)
    print(f"from_content (top only): {len(guides4.horizontal)} guides")

    guides5 = Guides(page)
    guides5.horizontal.from_content(cyan_stripes, align='bottom', outer=False)
    print(f"from_content (bottom only): {len(guides5.horizontal)} guides")

    print(f"from_stripes (both edges): {len(guides3.horizontal)} guides")

    # Verify we get approximately 2x guides with from_stripes: count the
    # distinct top/bottom coordinates across all stripes.
    expected_guides = len({edge for s in cyan_stripes for edge in (s.top, s.bottom)})
    print(f"\nExpected unique edges: {expected_guides}")
    print(f"Actual from_stripes: {len(guides3.horizontal)}")
@@ -0,0 +1,55 @@
1
+ """Test from_stripes with vertical stripes (column backgrounds)."""
2
+ from natural_pdf import PDF
3
+ from natural_pdf.analyzers.guides import Guides
4
+
5
+ # Create a mock page with vertical stripes (simulating column backgrounds)
class MockRect:
    """Minimal stand-in for a filled rectangle element.

    Stores the fill colour and the four edge coordinates exactly as passed in;
    it performs no validation and has no behaviour of its own.
    """

    def __init__(self, fill, x0, x1, top, bottom):
        self.fill = fill
        self.x0 = x0
        self.x1 = x1
        self.top = top
        self.bottom = bottom
13
+
class MockPage:
    """Fake page exposing only ``bbox`` and ``find_all``.

    The page is 800x600 and holds three '#f0f0f0' rectangles that simulate
    the backgrounds of alternating columns.
    """

    def __init__(self):
        self.bbox = (0, 0, 800, 600)
        # Create vertical stripes for alternating columns
        self.stripes = [
            MockRect('#f0f0f0', 100, 200, 0, 600),  # Column 1 background
            MockRect('#f0f0f0', 300, 400, 0, 600),  # Column 3 background
            MockRect('#f0f0f0', 500, 600, 0, 600),  # Column 5 background
        ]

    def find_all(self, selector):
        """Return the mock rectangles for the two selectors this test uses.

        'rect[fill=#f0f0f0]' yields the stripe list itself, 'rect[fill]'
        yields the stripes plus two white rectangles, and any other selector
        yields an empty list.
        """
        if selector == 'rect[fill=#f0f0f0]':
            return self.stripes
        if selector == 'rect[fill]':
            # Include some other rects too
            return self.stripes + [
                MockRect('#ffffff', 0, 100, 0, 600),
                MockRect('#ffffff', 200, 300, 0, 600),
            ]
        return []
34
+
# Test vertical stripes
page = MockPage()
guides = Guides(page)

print("Testing vertical stripes (column backgrounds)")
guides.vertical.from_stripes(color='#f0f0f0')

print(f"\nFound {len(guides.vertical)} vertical guides")
print(f"Guides at: {sorted(guides.vertical)}")

# Verify we got both edges of each stripe: these are the x0/x1 values of the
# three '#f0f0f0' rectangles defined on MockPage.
expected = [100, 200, 300, 400, 500, 600]
print(f"\nExpected: {expected}")
print(f"Match: {sorted(guides.vertical) == expected}")

# Test auto-detection
guides2 = Guides(page)
guides2.vertical.from_stripes()  # Should auto-detect the gray stripes

print(f"\nAuto-detect found {len(guides2.vertical)} guides")
print(f"Same result: {sorted(guides2.vertical) == sorted(guides.vertical)}")
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""Test the guide widget functionality"""

import sys
import os

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Test importing and basic functionality
try:
    # GuidesList is imported only to confirm that the name is importable.
    from natural_pdf.analyzers.guides import (
        InteractiveGuideWidget,
        GuidesList,
        _GUIDE_WIDGET_AVAILABLE,
    )
    print("✓ Successfully imported InteractiveGuideWidget")

    if _GUIDE_WIDGET_AVAILABLE:
        print("✓ ipywidgets is available")

        # Create a mock GuidesList for testing
        class MockPage:
            """Fake page with a fixed bbox and a blank-image render()."""

            def __init__(self):
                self.bbox = (0, 0, 595, 842)  # A4 page size in points

            def render(self, resolution=150):
                # Mock render method: a white image scaled from 72-dpi points
                from PIL import Image
                width = int(595 * resolution / 72)
                height = int(842 * resolution / 72)
                return Image.new('RGB', (width, height), color='white')

        class MockGuides:
            """Fake parent object exposing only ``context``."""

            def __init__(self):
                self.context = MockPage()

        class MockGuidesList:
            """Fake guide list with three vertical guides and a mock parent."""

            def __init__(self):
                self.data = [100, 200, 300]
                self._axis = 'vertical'
                self._parent = MockGuides()

        # Test creating the widget
        mock_guides = MockGuidesList()
        try:
            widget = InteractiveGuideWidget(mock_guides)
            print("✓ Successfully created InteractiveGuideWidget instance")
            print(f" - Widget ID: {widget.widget_id}")
            print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")

            # Check if the widget has the expected methods
            expected_methods = ['_generate_content', 'update_guides']
            for method in expected_methods:
                if hasattr(widget, method):
                    print(f" - Has method: {method}")
                else:
                    print(f" - Missing method: {method}")

        except Exception as e:
            # Best-effort smoke test: report the failure and keep going.
            print(f"✗ Error creating widget: {e}")

    else:
        print("⚠ ipywidgets not available - widget functionality disabled")

except ImportError as e:
    print(f"✗ Import error: {e}")

except Exception as e:
    print(f"✗ Unexpected error: {e}")

print("\nWidget implementation test complete!")
@@ -0,0 +1,41 @@
#!/usr/bin/env python
"""Simple test for the guide widget"""

import sys
import os

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Test importing the module
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
    print("✓ Module imported successfully")
    print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")

    if _GUIDE_WIDGET_AVAILABLE:
        print("✓ ipywidgets is installed and InteractiveGuideWidget is available")
    else:
        print("✗ ipywidgets is not installed")

except ImportError as e:
    print(f"✗ Import error: {e}")
    sys.exit(1)

# Tracks whether the class inspection below raised, so the closing message
# does not claim success after a reported error.
checks_failed = False

# Check if we can create the widget class
if _GUIDE_WIDGET_AVAILABLE:
    try:
        # We can't actually instantiate it without a GuidesList, but we can check the class exists
        print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
        print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")

        # Check methods
        methods = [m for m in dir(InteractiveGuideWidget) if not m.startswith('_')]
        print(f"✓ Public methods: {methods}")

    except Exception as e:
        print(f"✗ Error checking widget class: {e}")
        checks_failed = True
else:
    print("⚠ Skipping widget checks as ipywidgets is not available")

if checks_failed:
    print("\nSome checks failed!")
    sys.exit(1)

print("\nAll checks passed!")