natural-pdf 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +196 -43
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/core/page.py +56 -8
- natural_pdf/elements/base.py +15 -1
- natural_pdf/elements/region.py +37 -5
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +36 -15
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +1 -0
- temp/fix_page_exclusions.py +42 -0
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_exclusion_with_debug.py +30 -0
- temp/test_find_exclusions_fix.py +53 -0
- temp/test_find_exclusions_fix_no_recursion.py +97 -0
- temp/test_fix_real_pdf.py +48 -0
- temp/test_fix_working.py +55 -0
- temp/test_fixed_pdf_exclusions.py +67 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_horizontal_top_bottom.py +53 -0
- temp/test_inline_js.py +22 -0
- temp/test_marker_order.py +45 -0
- temp/test_original_exclusions_now_work.py +56 -0
- temp/test_pdf_exclusions_with_guides.py +84 -0
- temp/test_region_exclusions_detailed.py +25 -0
- temp/test_stripes_real_pdf.py +62 -0
- temp/test_vertical_stripes.py +55 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,84 @@
|
|
1
|
+
"""Test that PDF-level exclusions work with guides.extract_table()."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
def contains_page_footer(text):
    """Return True if *text* contains a "Page <digits> of" footer fragment.

    The pattern is the same one used by the footer exclusion selector in
    ``main``.  Unrelated occurrences of the words "Page" and "of" elsewhere
    in the text are not reported as leaked footer content.
    """
    import re  # local import: nothing else in this script needs it

    return re.search(r"Page \d+ of", text) is not None


def report_excluded_content(df, failure_message, success_message):
    """Print whether header/footer text is present in an extracted table.

    Args:
        df: Object with a ``to_string()`` method (the result of ``to_df()``).
        failure_message: Printed when header or footer text is found.
        success_message: Printed when neither is found.
    """
    table_text = df.to_string()
    has_alphabetic = "ALPHABETIC LISTING" in table_text
    has_page_num = contains_page_footer(table_text)

    print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
    print(f"Contains 'Page X of Y': {has_page_num}")

    if has_alphabetic or has_page_num:
        print(failure_message)
    else:
        print(success_message)


def main():
    """Run the PDF-level and page-level exclusion checks against m27.pdf."""
    # Load the PDF
    pdf = PDF("pdfs/m27.pdf")
    page = pdf.pages[0]

    # Add PDF-level exclusions
    print("Adding PDF-level exclusions...")
    pdf.add_exclusion(lambda p: p.find("text:contains(PREMISE)").above())
    # Raw string: "\d" is not a valid escape sequence in a plain literal.
    pdf.add_exclusion(lambda p: p.find(r"text:regex(Page \d+ of)"))

    # Create guides and extract table
    print("\nCreating guides and extracting table...")
    headers = (
        page
        .find("text:contains(NUMBER)")
        .right(include_source=True)
        .expand(top=3, bottom=3)
        .find_all('text')
    )

    guides = Guides(page)
    guides.vertical.from_content(headers, align='left')
    guides.horizontal.from_stripes()

    # Extract table, passing apply_exclusions=True explicitly
    table_result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True)
    df = table_result.to_df()

    print(f"\nTable shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())

    # Check if excluded content is present
    print("\nChecking for excluded content...")
    report_excluded_content(
        df,
        "\n❌ FAILED: Exclusions not properly applied",
        "\n✅ SUCCESS: Exclusions properly applied",
    )

    # Now test with page-level exclusions for comparison
    # NOTE(review): page2 comes from the same pdf object, so the PDF-level
    # exclusions registered above may still be in effect here -- confirm.
    print("\n\nTesting with page-level exclusions...")
    page2 = pdf.pages[0]
    # Look the anchor up first so a missing match cannot raise on `.above()`.
    premise = page2.find("text:contains(PREMISE)")
    header = premise.above() if premise is not None else None
    footer = page2.find(r"text:regex(Page \d+ of)")

    if header:
        page2.add_exclusion(header)
    if footer:
        page2.add_exclusion(footer)

    guides2 = Guides(page2)
    guides2.vertical.from_content(headers, align='left')
    guides2.horizontal.from_stripes()

    table_result2 = guides2.extract_table(include_outer_boundaries=True)
    df2 = table_result2.to_df()

    print(f"\nTable shape with page exclusions: {df2.shape}")
    report_excluded_content(
        df2,
        "\n❌ FAILED: Page exclusions not properly applied",
        "\n✅ SUCCESS: Page exclusions properly applied",
    )

    # Debug: Check exclusion regions
    print("\n\nDebug: Checking exclusion regions...")
    exclusions = page._get_exclusion_regions(debug=True)
    print(f"\nTotal exclusion regions: {len(exclusions)}")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Test region exclusions with detailed debugging"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
|
4
|
+
# Open the sample document and work with its first page.
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Register a PDF-level exclusion labelled "header": everything above the
# element found for text="PREMISE".
pdf.add_exclusion(lambda pg: pg.find(text="PREMISE").above(), label="header")

# Dump the exclusion lists stored on the page and on the PDF.
print("Page exclusions:")
for _label, _value in (
    ("page._exclusions", page._exclusions),
    ("pdf._exclusions", pdf._exclusions),
):
    print(f"{_label}: {_value}")

# Build a region at (0, 0, 200, 50) and show which page object it refers to.
test_region = page.region(0, 0, 200, 50)
print(f"\nTest region: {test_region.bbox}")
print(f"Region's page: {test_region.page}")
_region_page = test_region._page
print(f"Region's _page: {_region_page}")
print(f"Region's _page._exclusions: {_region_page._exclusions}")

# Extract text from the region with exclusions applied and debug enabled.
print("\nExtracting with debug=True:")
text = test_region.extract_text(apply_exclusions=True, debug=True)
print("Result: '{}'".format(text))
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""Test from_stripes with a real PDF that has striped rows."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF (assuming m27.pdf has the striped table)
pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Test 1: Manual selection with specific color
print("Test 1: Manual selection with color='#00ffff'")
guides = Guides(page)
guides.horizontal.from_stripes(color='#00ffff')

print(f"Found {len(guides.horizontal)} horizontal guides")
if guides.horizontal:
    print(f"Guide range: {min(guides.horizontal):.2f} to {max(guides.horizontal):.2f}")
    print(f"First 5 guides: {sorted(guides.horizontal)[:5]}")

# Test 2: Auto-detect most common stripe color
print("\nTest 2: Auto-detect stripes")
guides2 = Guides(page)
guides2.horizontal.from_stripes()

print(f"Found {len(guides2.horizontal)} horizontal guides")
if guides2.horizontal:
    print(f"Guide range: {min(guides2.horizontal):.2f} to {max(guides2.horizontal):.2f}")

# Test 3: Manual selection of stripes
print("\nTest 3: Manual selection of stripe elements")
cyan_stripes = page.find_all('rect[fill=#00ffff]')
print(f"Found {len(cyan_stripes)} cyan rectangles")

# Everything below reads `guides3` or indexes into `cyan_stripes`, so it all
# lives under one guard; with no cyan rectangles there is nothing to compare.
if cyan_stripes:
    guides3 = Guides(page)
    guides3.horizontal.from_stripes(cyan_stripes)
    print(f"Created {len(guides3.horizontal)} guides from stripes")

    # Show how this captures both edges of each stripe
    print("\nFirst stripe edges:")
    first_stripe = cyan_stripes[0]
    print(f" Stripe at y={first_stripe.top:.2f} to {first_stripe.bottom:.2f}")
    print(f" Guides include: {first_stripe.top:.2f} in guides? {first_stripe.top in guides3.horizontal}")
    print(f" Guides include: {first_stripe.bottom:.2f} in guides? {first_stripe.bottom in guides3.horizontal}")

    # Test 4: Compare with traditional approach
    print("\nComparison with traditional from_content approach:")
    # from_content is called once per alignment (top, then bottom)
    guides4 = Guides(page)
    guides4.horizontal.from_content(cyan_stripes, align='top', outer=False)
    print(f"from_content (top only): {len(guides4.horizontal)} guides")

    guides5 = Guides(page)
    guides5.horizontal.from_content(cyan_stripes, align='bottom', outer=False)
    print(f"from_content (bottom only): {len(guides5.horizontal)} guides")

    print(f"from_stripes (both edges): {len(guides3.horizontal)} guides")

    # Compare the guide count with the number of distinct top/bottom edges
    expected_guides = len(
        {edge for stripe in cyan_stripes for edge in (stripe.top, stripe.bottom)}
    )
    print(f"\nExpected unique edges: {expected_guides}")
    print(f"Actual from_stripes: {len(guides3.horizontal)}")
else:
    print("\nNo cyan stripes found; skipping stripe comparison")
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Test from_stripes with vertical stripes (column backgrounds)."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Create a mock page with vertical stripes (simulating column backgrounds)
|
6
|
+
class MockRect:
    """Bare-bones rectangle stand-in used by the mock page.

    Holds a fill value plus the x0/x1/top/bottom edges exactly as passed in.
    """

    def __init__(self, fill, x0, x1, top, bottom):
        self.fill = fill
        # Horizontal extent first, then vertical extent.
        self.x0, self.x1 = x0, x1
        self.top, self.bottom = top, bottom
|
13
|
+
|
14
|
+
class MockPage:
    """Mock page exposing a bbox and a selector-driven ``find_all``.

    Three '#f0f0f0' rectangles are created up front as alternating column
    backgrounds (columns 1, 3 and 5).
    """

    def __init__(self):
        self.bbox = (0, 0, 800, 600)
        # Each stripe is 100 wide and spans the full 0-600 height.
        self.stripes = [
            MockRect('#f0f0f0', left, left + 100, 0, 600)
            for left in (100, 300, 500)
        ]

    def find_all(self, selector):
        """Return the mock rectangles for the two selectors understood here.

        'rect[fill=#f0f0f0]' yields the stored stripes; 'rect[fill]' yields
        the stripes followed by two '#ffffff' rectangles; any other selector
        yields an empty list.
        """
        if selector == 'rect[fill=#f0f0f0]':
            return self.stripes
        if selector != 'rect[fill]':
            return []
        # Include some other rects too
        white_rects = [
            MockRect('#ffffff', 0, 100, 0, 600),
            MockRect('#ffffff', 200, 300, 0, 600),
        ]
        return self.stripes + white_rects
|
34
|
+
|
35
|
+
# Build guides on the mock page from the '#f0f0f0' stripes.
page = MockPage()
guides = Guides(page)

print("Testing vertical stripes (column backgrounds)")
guides.vertical.from_stripes(color='#f0f0f0')

_found = sorted(guides.vertical)
print(f"\nFound {len(guides.vertical)} vertical guides")
print(f"Guides at: {_found}")

# Both edges of each stripe: 100..600 in steps of 100.
expected = list(range(100, 700, 100))
print(f"\nExpected: {expected}")
print(f"Match: {_found == expected}")

# Same page again, this time without naming a colour.
guides2 = Guides(page)
guides2.vertical.from_stripes()  # Should auto-detect the gray stripes

_auto_found = sorted(guides2.vertical)
print(f"\nAuto-detect found {len(guides2.vertical)} guides")
print(f"Same result: {_auto_found == sorted(guides.vertical)}")
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""Test the guide widget functionality"""
|
3
|
+
|
4
|
+
import sys
|
5
|
+
import os
|
6
|
+
|
7
|
+
# Put the parent of this script's directory at the front of sys.path.
_parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, _parent_dir)

# Import the widget and, when ipywidgets is available, build one from mocks.
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, GuidesList, _GUIDE_WIDGET_AVAILABLE
    print("✓ Successfully imported InteractiveGuideWidget")

    if not _GUIDE_WIDGET_AVAILABLE:
        print("⚠ ipywidgets not available - widget functionality disabled")
    else:
        print("✓ ipywidgets is available")

        class MockPage:
            """Page stand-in with an A4-sized bbox and a blank render."""

            def __init__(self):
                self.bbox = (0, 0, 595, 842)  # A4 page size in points

            def render(self, resolution=150):
                """Return a white RGB image scaled from points by resolution/72."""
                from PIL import Image
                size = tuple(int(points * resolution / 72) for points in (595, 842))
                return Image.new('RGB', size, color='white')

        class MockGuides:
            """Parent object whose ``context`` is a MockPage."""

            def __init__(self):
                self.context = MockPage()

        class MockGuidesList:
            """Guides-list stand-in with three vertical guides."""

            def __init__(self):
                self.data = [100, 200, 300]
                self._axis = 'vertical'
                self._parent = MockGuides()

        # Build the widget from the mock list and report what it exposes.
        mock_guides = MockGuidesList()
        try:
            widget = InteractiveGuideWidget(mock_guides)
            print("✓ Successfully created InteractiveGuideWidget instance")
            print(f" - Widget ID: {widget.widget_id}")
            print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")

            # Report, per expected method name, whether the widget has it.
            expected_methods = ['_generate_content', 'update_guides']
            for method in expected_methods:
                status = "Has" if hasattr(widget, method) else "Missing"
                print(f" - {status} method: {method}")

        except Exception as e:
            print(f"✗ Error creating widget: {e}")

except ImportError as e:
    print(f"✗ Import error: {e}")

except Exception as e:
    print(f"✗ Unexpected error: {e}")

print("\nWidget implementation test complete!")
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""Simple test for the guide widget"""
|
3
|
+
|
4
|
+
import sys
|
5
|
+
import os
|
6
|
+
|
7
|
+
# Put the parent of this script's directory at the front of sys.path.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Test importing the module
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
    print("✓ Module imported successfully")
    print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")

    if _GUIDE_WIDGET_AVAILABLE:
        print("✓ ipywidgets is installed and InteractiveGuideWidget is available")
    else:
        print("✗ ipywidgets is not installed")

except ImportError as e:
    print(f"✗ Import error: {e}")
    sys.exit(1)

# Check if we can inspect the widget class
if _GUIDE_WIDGET_AVAILABLE:
    try:
        # We can't actually instantiate it without a GuidesList, but we can check the class exists
        print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
        print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")

        # Check methods
        methods = [m for m in dir(InteractiveGuideWidget) if not m.startswith('_')]
        print(f"✓ Public methods: {methods}")

    except Exception as e:
        print(f"✗ Error checking widget class: {e}")
        # A failed check must not fall through to the success message below;
        # exit non-zero, as the import-error path does.
        sys.exit(1)
else:
    print("⚠ Skipping widget checks as ipywidgets is not available")

print("\nAll checks passed!")
|
File without changes
|
File without changes
|
File without changes
|