natural-pdf 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +196 -43
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/core/page.py +56 -8
- natural_pdf/elements/base.py +15 -1
- natural_pdf/elements/region.py +37 -5
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +36 -15
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +1 -0
- temp/fix_page_exclusions.py +42 -0
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_exclusion_with_debug.py +30 -0
- temp/test_find_exclusions_fix.py +53 -0
- temp/test_find_exclusions_fix_no_recursion.py +97 -0
- temp/test_fix_real_pdf.py +48 -0
- temp/test_fix_working.py +55 -0
- temp/test_fixed_pdf_exclusions.py +67 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_horizontal_top_bottom.py +53 -0
- temp/test_inline_js.py +22 -0
- temp/test_marker_order.py +45 -0
- temp/test_original_exclusions_now_work.py +56 -0
- temp/test_pdf_exclusions_with_guides.py +84 -0
- temp/test_region_exclusions_detailed.py +25 -0
- temp/test_stripes_real_pdf.py +62 -0
- temp/test_vertical_stripes.py +55 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,42 @@
|
|
1
|
+
"""Script to fix the remaining exclusion bugs in page.py"""
|
2
|
+
|
3
|
+
import re
|
4
|
+
|
5
|
+
# Read the file
|
6
|
+
with open('/Users/soma/Development/natural-pdf/natural_pdf/core/page.py', 'r') as f:
|
7
|
+
content = f.read()
|
8
|
+
|
9
|
+
# Fix 1: Line 1132 in find() method
|
10
|
+
# Change: if apply_exclusions and self._exclusions and results_collection:
|
11
|
+
# To: if apply_exclusions and results_collection:
|
12
|
+
content = re.sub(
|
13
|
+
r'(\s+)if apply_exclusions and self\._exclusions and results_collection:',
|
14
|
+
r'\1if apply_exclusions and results_collection:',
|
15
|
+
content
|
16
|
+
)
|
17
|
+
|
18
|
+
# Fix 2: Line 1227 in find_all() method
|
19
|
+
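# Same change pattern. Note: the re.sub above has no count limit, so it already rewrote this occurrence too; this call matches nothing.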
|
20
|
+
content = re.sub(
|
21
|
+
r'(\s+)if apply_exclusions and self\._exclusions and results_collection:',
|
22
|
+
r'\1if apply_exclusions and results_collection:',
|
23
|
+
content
|
24
|
+
)
|
25
|
+
|
26
|
+
# Fix 3: Line 1599 in get_elements() method
|
27
|
+
# Change: if apply_exclusions and self._exclusions:
|
28
|
+
# To: if apply_exclusions:
|
29
|
+
content = re.sub(
|
30
|
+
r'(\s+)if apply_exclusions and self\._exclusions:',
|
31
|
+
r'\1if apply_exclusions:',
|
32
|
+
content
|
33
|
+
)
|
34
|
+
|
35
|
+
# Write the fixed content back
|
36
|
+
with open('/Users/soma/Development/natural-pdf/natural_pdf/core/page.py', 'w') as f:
|
37
|
+
f.write(content)
|
38
|
+
|
39
|
+
print("Fixed exclusion checks in page.py")
|
40
|
+
print("- find() method: removed self._exclusions check")
|
41
|
+
print("- find_all() method: removed self._exclusions check")
|
42
|
+
print("- get_elements() method: removed self._exclusions check")
|
temp/test_draw_guides.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Example usage of the interactive guide drawing feature"""
|
2
|
+
|
3
|
+
# In a Jupyter notebook:
|
4
|
+
from natural_pdf import NaturalPDF
|
5
|
+
|
6
|
+
# Load a PDF
|
7
|
+
pdf = NaturalPDF.from_file("your_pdf.pdf")
|
8
|
+
page = pdf[0]
|
9
|
+
|
10
|
+
# Create guides
|
11
|
+
guides = page.guides()
|
12
|
+
|
13
|
+
# Detect some initial guides (optional)
|
14
|
+
guides.vertical.from_lines(n=5)
|
15
|
+
guides.horizontal.from_lines(n=5)
|
16
|
+
|
17
|
+
# Open interactive editor for vertical guides
|
18
|
+
guides.vertical.draw()
|
19
|
+
|
20
|
+
# Open interactive editor for horizontal guides
|
21
|
+
guides.horizontal.draw(width=600) # Smaller widget
|
22
|
+
|
23
|
+
# After editing, the guides are automatically updated
|
24
|
+
# You can now use them to extract tables:
|
25
|
+
table = page.extract_table(guides)
|
@@ -0,0 +1,30 @@
|
|
1
|
+
"""Test the interactive guide drawing functionality"""
|
2
|
+
|
3
|
+
from natural_pdf.core.pdf import PDF
|
4
|
+
from natural_pdf.analyzers.guides import Guides
|
5
|
+
|
6
|
+
# Load a sample PDF
|
7
|
+
pdf = PDF("tests/sample_pdfs/simple_table.pdf")
|
8
|
+
page = pdf.pages[0]
|
9
|
+
|
10
|
+
# Create guides
|
11
|
+
guides = Guides(page)
|
12
|
+
|
13
|
+
# Add some initial guides for testing
|
14
|
+
guides.vertical.from_content()
|
15
|
+
guides.horizontal.from_lines(n=5)
|
16
|
+
|
17
|
+
print("Initial vertical guides:", list(guides.vertical))
|
18
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
19
|
+
|
20
|
+
# This would open the interactive widget in Jupyter
|
21
|
+
# guides.vertical.draw()
|
22
|
+
|
23
|
+
# For non-Jupyter testing, we can check the method exists
|
24
|
+
assert hasattr(guides.vertical, 'draw')
|
25
|
+
assert callable(guides.vertical.draw)
|
26
|
+
|
27
|
+
print("\nSuccess! The draw() method is available on GuidesList objects.")
|
28
|
+
print("To use it interactively, run this in a Jupyter notebook:")
|
29
|
+
print(" guides.vertical.draw()")
|
30
|
+
print(" guides.horizontal.draw(width=600)")
|
@@ -0,0 +1,30 @@
|
|
1
|
+
"""Test exclusion with detailed debugging"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
|
4
|
+
pdf = PDF("pdfs/m27.pdf")
|
5
|
+
page = pdf.pages[0]
|
6
|
+
|
7
|
+
# Add exclusion
|
8
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
9
|
+
|
10
|
+
# First, verify the exclusion works on the page itself
|
11
|
+
print("Page-level text extraction:")
|
12
|
+
print("Without exclusions:", page.extract_text(apply_exclusions=False)[:100])
|
13
|
+
print("With exclusions:", page.extract_text(apply_exclusions=True)[:100])
|
14
|
+
|
15
|
+
# Now test on a specific region that should be excluded
|
16
|
+
print("\n\nRegion in excluded area (0, 0, 200, 50):")
|
17
|
+
excluded_region = page.region(0, 0, 200, 50)
|
18
|
+
print("Without exclusions:", repr(excluded_region.extract_text(apply_exclusions=False)))
|
19
|
+
print("With exclusions:", repr(excluded_region.extract_text(apply_exclusions=True)))
|
20
|
+
|
21
|
+
# Test the actual problematic cell region
|
22
|
+
print("\n\nProblematic cell region (32.06, 0.5, 73.18, 79.54):")
|
23
|
+
cell_region = page.region(32.06, 0.5, 73.18288, 79.54)
|
24
|
+
print("Without exclusions:", repr(cell_region.extract_text(apply_exclusions=False)))
|
25
|
+
print("With exclusions:", repr(cell_region.extract_text(apply_exclusions=True)))
|
26
|
+
|
27
|
+
# Check if the region inherits the page
|
28
|
+
print(f"\nCell region's page: {cell_region.page}")
|
29
|
+
print(f"Cell region's _page: {getattr(cell_region, '_page', 'Not found')}")
|
30
|
+
print(f"Same as original page: {cell_region.page is page if hasattr(cell_region, 'page') else 'N/A'}")
|
@@ -0,0 +1,53 @@
|
|
1
|
+
"""Test that the find methods now work with PDF-level exclusions."""
|
2
|
+
|
3
|
+
from natural_pdf import PDF
|
4
|
+
|
5
|
+
# Load a test PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
page = pdf.pages[0]
|
8
|
+
|
9
|
+
# Add PDF-level exclusion for the header
|
10
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
|
11
|
+
|
12
|
+
# Test 1: find() should now exclude the header text
|
13
|
+
print("Test 1: find() with PDF-level exclusions")
|
14
|
+
result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
|
15
|
+
if result is None:
|
16
|
+
print("✅ SUCCESS: find() correctly excluded header text")
|
17
|
+
else:
|
18
|
+
print(f"❌ FAILED: find() returned {result.text}")
|
19
|
+
|
20
|
+
# Test 2: find_all() should exclude header elements
|
21
|
+
print("\nTest 2: find_all() with PDF-level exclusions")
|
22
|
+
all_text = page.find_all("text", apply_exclusions=False)
|
23
|
+
filtered_text = page.find_all("text", apply_exclusions=True)
|
24
|
+
print(f"Without exclusions: {len(all_text)} elements")
|
25
|
+
print(f"With exclusions: {len(filtered_text)} elements")
|
26
|
+
if len(filtered_text) < len(all_text):
|
27
|
+
print("✅ SUCCESS: find_all() excluded some elements")
|
28
|
+
else:
|
29
|
+
print("❌ FAILED: find_all() didn't exclude any elements")
|
30
|
+
|
31
|
+
# Test 3: get_elements() should exclude header elements
|
32
|
+
print("\nTest 3: get_elements() with PDF-level exclusions")
|
33
|
+
all_elements = page.get_elements(apply_exclusions=False)
|
34
|
+
filtered_elements = page.get_elements(apply_exclusions=True)
|
35
|
+
print(f"Without exclusions: {len(all_elements)} elements")
|
36
|
+
print(f"With exclusions: {len(filtered_elements)} elements")
|
37
|
+
if len(filtered_elements) < len(all_elements):
|
38
|
+
print("✅ SUCCESS: get_elements() excluded some elements")
|
39
|
+
else:
|
40
|
+
print("❌ FAILED: get_elements() didn't exclude any elements")
|
41
|
+
|
42
|
+
# Test that excluded text is not in the filtered results
|
43
|
+
print("\nChecking excluded text...")
|
44
|
+
excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
|
45
|
+
for text in excluded_texts:
|
46
|
+
found_in_filtered = any(
|
47
|
+
text in str(el.text) if hasattr(el, 'text') else False
|
48
|
+
for el in filtered_text
|
49
|
+
)
|
50
|
+
if not found_in_filtered:
|
51
|
+
print(f"✅ '{text}' correctly excluded")
|
52
|
+
else:
|
53
|
+
print(f"❌ '{text}' still present in filtered results")
|
@@ -0,0 +1,97 @@
|
|
1
|
+
"""Test that the find methods now work with PDF-level exclusions (without recursion)."""
|
2
|
+
|
3
|
+
from natural_pdf import PDF
|
4
|
+
|
5
|
+
# Load a test PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
page = pdf.pages[0]
|
8
|
+
|
9
|
+
# Add PDF-level exclusion using a Region directly to avoid recursion
|
10
|
+
# First, get the header region without exclusions
|
11
|
+
header_element = page.find(text="PREMISE", apply_exclusions=False)
|
12
|
+
if header_element:
|
13
|
+
header_region = header_element.above()
|
14
|
+
pdf.add_exclusion(header_region)
|
15
|
+
else:
|
16
|
+
print("WARNING: Could not find PREMISE text for exclusion")
|
17
|
+
|
18
|
+
# Test 1: find() should now exclude the header text
|
19
|
+
print("Test 1: find() with PDF-level exclusions")
|
20
|
+
result = page.find("text:contains(FEBRUARY)", apply_exclusions=True)
|
21
|
+
if result is None:
|
22
|
+
print("✅ SUCCESS: find() correctly excluded header text")
|
23
|
+
else:
|
24
|
+
print(f"❌ FAILED: find() returned {result.text}")
|
25
|
+
|
26
|
+
# Test 2: find_all() should exclude header elements
|
27
|
+
print("\nTest 2: find_all() with PDF-level exclusions")
|
28
|
+
all_text = page.find_all("text", apply_exclusions=False)
|
29
|
+
filtered_text = page.find_all("text", apply_exclusions=True)
|
30
|
+
print(f"Without exclusions: {len(all_text)} elements")
|
31
|
+
print(f"With exclusions: {len(filtered_text)} elements")
|
32
|
+
if len(filtered_text) < len(all_text):
|
33
|
+
print("✅ SUCCESS: find_all() excluded some elements")
|
34
|
+
else:
|
35
|
+
print("❌ FAILED: find_all() didn't exclude any elements")
|
36
|
+
|
37
|
+
# Test 3: get_elements() should exclude header elements
|
38
|
+
print("\nTest 3: get_elements() with PDF-level exclusions")
|
39
|
+
all_elements = page.get_elements(apply_exclusions=False)
|
40
|
+
filtered_elements = page.get_elements(apply_exclusions=True)
|
41
|
+
print(f"Without exclusions: {len(all_elements)} elements")
|
42
|
+
print(f"With exclusions: {len(filtered_elements)} elements")
|
43
|
+
if len(filtered_elements) < len(all_elements):
|
44
|
+
print("✅ SUCCESS: get_elements() excluded some elements")
|
45
|
+
else:
|
46
|
+
print("❌ FAILED: get_elements() didn't exclude any elements")
|
47
|
+
|
48
|
+
# Test that excluded text is not in the filtered results
|
49
|
+
print("\nChecking excluded text...")
|
50
|
+
excluded_texts = ["FEBRUARY", "ALPHABETIC LISTING", "M27"]
|
51
|
+
for text in excluded_texts:
|
52
|
+
found_in_filtered = any(
|
53
|
+
text in str(el.text) if hasattr(el, 'text') else False
|
54
|
+
for el in filtered_text
|
55
|
+
)
|
56
|
+
if not found_in_filtered:
|
57
|
+
print(f"✅ '{text}' correctly excluded")
|
58
|
+
else:
|
59
|
+
print(f"❌ '{text}' still present in filtered results")
|
60
|
+
|
61
|
+
# Also test that the original table extraction issue is fixed
|
62
|
+
print("\n\nTesting original table extraction issue...")
|
63
|
+
from natural_pdf.analyzers.guides import Guides
|
64
|
+
|
65
|
+
# Add exclusion for footer too
|
66
|
+
footer_element = page.find("text:regex(Page \\d+ of)", apply_exclusions=False)
|
67
|
+
if footer_element:
|
68
|
+
pdf.add_exclusion(footer_element.expand())
|
69
|
+
|
70
|
+
headers = (
|
71
|
+
page
|
72
|
+
.find(text="NUMBER", apply_exclusions=False)
|
73
|
+
.right(include_source=True)
|
74
|
+
.expand(top=3, bottom=3)
|
75
|
+
.find_all('text')
|
76
|
+
)
|
77
|
+
|
78
|
+
guides = Guides(page)
|
79
|
+
guides.vertical.from_content(headers, align='left')
|
80
|
+
guides.horizontal.from_stripes()
|
81
|
+
|
82
|
+
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
83
|
+
df = result.to_df()
|
84
|
+
|
85
|
+
# Check if excluded content is in the table
|
86
|
+
table_str = df.to_string()
|
87
|
+
has_feb = "FEBRUARY" in table_str
|
88
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_str
|
89
|
+
|
90
|
+
print("\nTable extraction with exclusions:")  # plain string: no placeholders to interpolate
|
91
|
+
print(f"Contains 'FEBRUARY': {has_feb}")
|
92
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
93
|
+
|
94
|
+
if not has_feb and not has_alphabetic:
|
95
|
+
print("✅ SUCCESS: Table extraction correctly excludes header/footer!")
|
96
|
+
else:
|
97
|
+
print("❌ FAILED: Exclusions not working in table extraction")
|
@@ -0,0 +1,48 @@
|
|
1
|
+
"""Test the fix with the actual PDF from the user's example."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
pdf = PDF("pdfs/m27.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Get headers using the user's exact code
|
9
|
+
headers = (
|
10
|
+
page
|
11
|
+
.find("text:contains(NUMBER)")
|
12
|
+
.right(include_source=True)
|
13
|
+
.expand(top=3, bottom=3)
|
14
|
+
.find_all('text')
|
15
|
+
)
|
16
|
+
|
17
|
+
print("Headers found:")
|
18
|
+
for i, h in enumerate(headers):
|
19
|
+
print(f" {i}: '{h.text}' at x={h.x0:.2f}")
|
20
|
+
|
21
|
+
# Create guides using ElementCollection
|
22
|
+
guides = Guides(page)
|
23
|
+
guides.vertical.from_content(headers, align='left', outer=False)
|
24
|
+
|
25
|
+
print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
|
26
|
+
|
27
|
+
# Check specific headers that were problematic
|
28
|
+
print("\nChecking headers 3-5:")
|
29
|
+
for i, h in enumerate(headers[3:5]):
|
30
|
+
print(f" Header {i+3}: '{h.text}' at x={h.x0:.5f}")
|
31
|
+
|
32
|
+
# Test with just headers 3-5
|
33
|
+
guides2 = Guides(page)
|
34
|
+
guides2.vertical.from_content(headers[3:5], align='left', outer=False)
|
35
|
+
|
36
|
+
print(f"\nGuides from headers[3:5]: {guides2.vertical}")
|
37
|
+
print("Expected: [328.32012, 539.63316]")  # plain string: no placeholders to interpolate
|
38
|
+
|
39
|
+
# Verify the fix
|
40
|
+
if 332.88095999999996 in guides2.vertical:
|
41
|
+
print("\n❌ FAILED: Extra guide at 332.88 is still present")
|
42
|
+
else:
|
43
|
+
print("\n✅ SUCCESS: Extra guide at 332.88 is not present")
|
44
|
+
|
45
|
+
# Test that outer guides work correctly too
|
46
|
+
guides3 = Guides(page)
|
47
|
+
guides3.vertical.from_content(headers[3:5], align='left', outer=True)
|
48
|
+
print(f"\nWith outer=True: {guides3.vertical}")
|
temp/test_fix_working.py
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Test that the fix for region exclusions works"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
pdf = PDF("pdfs/m27.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Check initial text
|
9
|
+
print("Initial text:")
|
10
|
+
print(page.extract_text()[:200])
|
11
|
+
|
12
|
+
# Add exclusions
|
13
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
|
14
|
+
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)").expand())  # raw string keeps \d a regex escape
|
15
|
+
|
16
|
+
# Test region extraction with exclusions
|
17
|
+
print("\n\nRegion in excluded area (0, 0, 200, 50):")
|
18
|
+
excluded_region = page.region(0, 0, 200, 50)
|
19
|
+
print("Without exclusions:", repr(excluded_region.extract_text(apply_exclusions=False)))
|
20
|
+
print("With exclusions:", repr(excluded_region.extract_text(apply_exclusions=True)))
|
21
|
+
|
22
|
+
# Now test the full table extraction
|
23
|
+
print("\n\nFull table extraction:")
|
24
|
+
headers = (
|
25
|
+
page
|
26
|
+
.find(text="NUMBER")
|
27
|
+
.right(include_source=True)
|
28
|
+
.expand(top=3, bottom=3)
|
29
|
+
.find_all('text')
|
30
|
+
)
|
31
|
+
|
32
|
+
guides = Guides(page)
|
33
|
+
guides.vertical.from_content(headers, align='left')
|
34
|
+
guides.horizontal.from_stripes()
|
35
|
+
|
36
|
+
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
37
|
+
df = result.to_df()
|
38
|
+
|
39
|
+
print(f"Shape: {df.shape}")
|
40
|
+
print("\nFirst row:")
|
41
|
+
for col, val in df.iloc[0].items():
|
42
|
+
print(f" {repr(col)}: {repr(val)}")
|
43
|
+
|
44
|
+
# Check if excluded content is in the table
|
45
|
+
table_str = df.to_string()
|
46
|
+
has_feb = "FEBRUARY" in table_str or all(part in table_str for part in ("FEBR", "RUARY"))  # whole word, or both halves split across cells
|
47
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_str
|
48
|
+
|
49
|
+
print(f"\nContains 'FEBRUARY': {has_feb}")
|
50
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
51
|
+
|
52
|
+
if has_feb or has_alphabetic:
|
53
|
+
print("\n❌ FAILED: Exclusions not working properly")
|
54
|
+
else:
|
55
|
+
print("\n✅ SUCCESS: Exclusions working correctly!")
|
@@ -0,0 +1,67 @@
|
|
1
|
+
"""Test PDF-level exclusions with proper Region returns."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
|
8
|
+
# Add PDF-level exclusions that properly return Regions
|
9
|
+
print("Adding fixed PDF-level exclusions...")
|
10
|
+
pdf.add_exclusion(
|
11
|
+
lambda page: page.find("text:contains(PREMISE)").above()
|
12
|
+
if page.find("text:contains(PREMISE)") else None,
|
13
|
+
label="header_exclusion"
|
14
|
+
)
|
15
|
+
pdf.add_exclusion(
|
16
|
+
lambda page: page.find(r"text:regex(Page \d+ of)").expand(10)  # raw string keeps \d a regex escape
|
17
|
+
if page.find("text:regex(Page \d+ of)") else None,
|
18
|
+
label="footer_exclusion"
|
19
|
+
)
|
20
|
+
|
21
|
+
page = pdf.pages[0]
|
22
|
+
|
23
|
+
# Create guides and extract table
|
24
|
+
print("\nCreating guides and extracting table...")
|
25
|
+
headers = (
|
26
|
+
page
|
27
|
+
.find("text:contains(NUMBER)")
|
28
|
+
.right(include_source=True)
|
29
|
+
.expand(top=3, bottom=3)
|
30
|
+
.find_all('text')
|
31
|
+
)
|
32
|
+
|
33
|
+
guides = Guides(page)
|
34
|
+
guides.vertical.from_content(headers, align='left')
|
35
|
+
guides.horizontal.from_stripes()
|
36
|
+
|
37
|
+
# Extract table with apply_exclusions=True (default)
|
38
|
+
table_result = guides.extract_table(include_outer_boundaries=True)
|
39
|
+
df = table_result.to_df()
|
40
|
+
|
41
|
+
print(f"\nTable shape: {df.shape}")
|
42
|
+
print("\nFirst few rows:")
|
43
|
+
print(df.head())
|
44
|
+
|
45
|
+
# Check if excluded content is present
|
46
|
+
print("\nChecking for excluded content...")
|
47
|
+
table_text = df.to_string()
|
48
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_text
|
49
|
+
has_page_num = ("Page 1 of" in table_text) or ("Page" in table_text and "of" in table_text)  # NOTE: second test is loose; any "Page" plus any "of" substring counts
|
50
|
+
|
51
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
52
|
+
print(f"Contains 'Page X of Y': {has_page_num}")
|
53
|
+
|
54
|
+
if has_alphabetic or has_page_num:
|
55
|
+
print("\n❌ FAILED: Exclusions not properly applied")
|
56
|
+
# Show what's in the first row/column that should be excluded
|
57
|
+
print("\nFirst column values (should not have header text):")
|
58
|
+
print(df.iloc[:5, 0])
|
59
|
+
else:
|
60
|
+
print("\n✅ SUCCESS: Exclusions properly applied")
|
61
|
+
|
62
|
+
# Debug: Check exclusion regions
|
63
|
+
print("\n\nDebug: Checking exclusion regions...")
|
64
|
+
exclusions = page._get_exclusion_regions(debug=True)
|
65
|
+
print(f"\nTotal exclusion regions: {len(exclusions)}")
|
66
|
+
for i, exc in enumerate(exclusions):
|
67
|
+
print(f" Exclusion {i}: {exc.bbox} (label: {exc.label})")
|
@@ -0,0 +1,47 @@
|
|
1
|
+
"""Test script to verify the draw() method works"""
|
2
|
+
|
3
|
+
import sys
|
4
|
+
sys.path.insert(0, '.')
|
5
|
+
|
6
|
+
from natural_pdf.analyzers.guides import GuidesList, Guides
|
7
|
+
|
8
|
+
# Create a mock context for testing
|
9
|
+
class MockContext:
|
10
|
+
def __init__(self):
|
11
|
+
self.width = 600
|
12
|
+
self.height = 800
|
13
|
+
|
14
|
+
def render(self, resolution=150):
|
15
|
+
# Create a simple test image
|
16
|
+
from PIL import Image
|
17
|
+
img = Image.new('RGB', (int(self.width * resolution/72), int(self.height * resolution/72)), 'white')
|
18
|
+
return img
|
19
|
+
|
20
|
+
# Test that the draw method exists
|
21
|
+
mock_context = MockContext()
|
22
|
+
guides = Guides(mock_context)
|
23
|
+
|
24
|
+
# Add some test guides
|
25
|
+
guides.vertical.data = [100, 200, 300, 400, 500]
|
26
|
+
guides.horizontal.data = [150, 350, 550, 750]
|
27
|
+
|
28
|
+
print("Initial vertical guides:", list(guides.vertical))
|
29
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
30
|
+
|
31
|
+
# Check that draw method exists
|
32
|
+
assert hasattr(guides.vertical, 'draw')
|
33
|
+
assert callable(guides.vertical.draw)
|
34
|
+
assert hasattr(guides.horizontal, 'draw')
|
35
|
+
assert callable(guides.horizontal.draw)
|
36
|
+
|
37
|
+
print("\nSuccess! The draw() method is available.")
|
38
|
+
print("\nIn a Jupyter notebook, you would use:")
|
39
|
+
print(" guides.vertical.draw() # Interactive vertical guide editor")
|
40
|
+
print(" guides.horizontal.draw() # Interactive horizontal guide editor")
|
41
|
+
print("\nFeatures:")
|
42
|
+
print(" - Click to add new guides")
|
43
|
+
print(" - Click existing guides to select them")
|
44
|
+
print(" - Drag to move guides")
|
45
|
+
print(" - Delete key to remove selected guide")
|
46
|
+
print(" - Arrow keys to fine-tune position")
|
47
|
+
print(" - Enter to apply, Escape to cancel")
|
@@ -0,0 +1,53 @@
|
|
1
|
+
"""Test horizontal guides with top/bottom alignment on real PDF."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
page = pdf.pages[0]
|
8
|
+
|
9
|
+
# Find some row headers to use as markers
|
10
|
+
# First, let's find what text elements exist
|
11
|
+
all_text = page.find_all('text')[:20]
|
12
|
+
print("First 20 text elements:")
|
13
|
+
for t in all_text:
|
14
|
+
print(f" '{t.text}' at y={t.top:.2f}")
|
15
|
+
|
16
|
+
# Look for rows containing numbers in the first column
|
17
|
+
rows = page.find_all('text[x0<100]')[:5] # Get elements in left column
|
18
|
+
|
19
|
+
print("Found rows:")
|
20
|
+
for i, row in enumerate(rows):
|
21
|
+
print(f" {i}: '{row.text}' at y={row.top:.2f}-{row.bottom:.2f}")
|
22
|
+
|
23
|
+
# Test with align='top' (should use top edge of each row)
|
24
|
+
guides_top = Guides(page)
|
25
|
+
guides_top.horizontal.from_content(rows, align='top', outer=False)
|
26
|
+
|
27
|
+
print(f"\nHorizontal guides with align='top': {sorted(guides_top.horizontal)}")
|
28
|
+
print("Expected: top edges of each row")
|
29
|
+
|
30
|
+
# Test with align='bottom' (should use bottom edge of each row)
|
31
|
+
guides_bottom = Guides(page)
|
32
|
+
guides_bottom.horizontal.from_content(rows, align='bottom', outer=False)
|
33
|
+
|
34
|
+
print(f"\nHorizontal guides with align='bottom': {sorted(guides_bottom.horizontal)}")
|
35
|
+
print("Expected: bottom edges of each row")
|
36
|
+
|
37
|
+
# Verify they're different
|
38
|
+
if guides_top.horizontal != guides_bottom.horizontal:
|
39
|
+
print("\n✅ SUCCESS: top and bottom alignment produce different guides")
|
40
|
+
else:
|
41
|
+
print("\n❌ FAILED: top and bottom alignment produced the same guides")
|
42
|
+
|
43
|
+
# Test the class method too
|
44
|
+
guides_class_top = Guides.from_content(page, axis='horizontal', markers=rows, align='top', outer=False)
|
45
|
+
guides_class_bottom = Guides.from_content(page, axis='horizontal', markers=rows, align='bottom', outer=False)
|
46
|
+
|
47
|
+
print(f"\nClass method with top: {sorted(guides_class_top.horizontal)}")
|
48
|
+
print(f"Class method with bottom: {sorted(guides_class_bottom.horizontal)}")
|
49
|
+
|
50
|
+
if guides_class_top.horizontal == guides_top.horizontal and guides_class_bottom.horizontal == guides_bottom.horizontal:
|
51
|
+
print("\n✅ SUCCESS: Class method produces same results as instance method")
|
52
|
+
else:
|
53
|
+
print("\n❌ FAILED: Class method produces different results")
|
temp/test_inline_js.py
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
"""Test inline JavaScript in HTML widget"""
|
2
|
+
|
3
|
+
import ipywidgets as widgets
|
4
|
+
from IPython.display import display
|
5
|
+
|
6
|
+
# Create an HTML widget with inline JavaScript
|
7
|
+
html_content = '''
|
8
|
+
<div id="test-div">Click me!</div>
|
9
|
+
<script type="text/javascript">
|
10
|
+
document.getElementById('test-div').addEventListener('click', function() {
|
11
|
+
alert('Clicked!');
|
12
|
+
this.innerHTML = 'Clicked at ' + new Date().toLocaleTimeString();
|
13
|
+
});
|
14
|
+
console.log('JavaScript is running!');
|
15
|
+
</script>
|
16
|
+
'''
|
17
|
+
|
18
|
+
# Display using widgets.HTML
|
19
|
+
html_widget = widgets.HTML(value=html_content)
|
20
|
+
display(html_widget)
|
21
|
+
|
22
|
+
print("If you see 'Click me!' above and can click it, JavaScript is working.")
|
@@ -0,0 +1,45 @@
|
|
1
|
+
"""Test marker ordering with a real PDF."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
|
4
|
+
# Load a test PDF
|
5
|
+
pdf = PDF("pdfs/01-practice.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Find some text elements to use as markers
|
9
|
+
all_text = page.find_all("text")
|
10
|
+
print("Sample text elements:")
|
11
|
+
for i, elem in enumerate(all_text[:10]):
|
12
|
+
print(f"{i}: '{elem.text}' at x={elem.x0:.1f}")
|
13
|
+
|
14
|
+
# Create guides with markers in specific order
|
15
|
+
print("\n--- Testing vertical guides with markers ---")
|
16
|
+
|
17
|
+
# Let's find specific text elements at different positions
|
18
|
+
violations_elem = page.find('text:contains("Violations")')
|
19
|
+
date_elem = page.find('text:contains("Date")')
|
20
|
+
total_elem = page.find('text:contains("Total")')
|
21
|
+
|
22
|
+
if violations_elem and date_elem and total_elem:
|
23
|
+
print("\nElement positions:")  # plain string: no placeholders to interpolate
|
24
|
+
print(f"'Violations' at x={violations_elem.x0:.1f}")
|
25
|
+
print(f"'Date' at x={date_elem.x0:.1f}")
|
26
|
+
print(f"'Total' at x={total_elem.x0:.1f}")
|
27
|
+
|
28
|
+
# Create guides with markers in a specific order
|
29
|
+
guides = page.guides.from_content(
|
30
|
+
axis="vertical",
|
31
|
+
markers=["Violations", "Date", "Total"],
|
32
|
+
align="left",
|
33
|
+
outer=True
|
34
|
+
)
|
35
|
+
|
36
|
+
print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
|
37
|
+
print(f"Page bounds: {page.bbox}")
|
38
|
+
|
39
|
+
# Show visually
|
40
|
+
page.guides.vertical.from_content(
|
41
|
+
markers=["Violations", "Date", "Total"],
|
42
|
+
align="left",
|
43
|
+
outer=True
|
44
|
+
)
|
45
|
+
page.show(guides=True)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
"""Test that the original exclusion lambdas (returning Elements) now work."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
|
8
|
+
# Add the ORIGINAL PDF-level exclusions that return Elements (not Regions)
|
9
|
+
print("Adding original PDF-level exclusions (returning Elements)...")
|
10
|
+
pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
|
11
|
+
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))  # raw string keeps \d a regex escape
|
12
|
+
|
13
|
+
page = pdf.pages[0]
|
14
|
+
|
15
|
+
# Create guides and extract table
|
16
|
+
print("\nCreating guides and extracting table...")
|
17
|
+
headers = (
|
18
|
+
page
|
19
|
+
.find("text:contains(NUMBER)")
|
20
|
+
.right(include_source=True)
|
21
|
+
.expand(top=3, bottom=3)
|
22
|
+
.find_all('text')
|
23
|
+
)
|
24
|
+
|
25
|
+
guides = Guides(page)
|
26
|
+
guides.vertical.from_content(headers, align='left')
|
27
|
+
guides.horizontal.from_stripes()
|
28
|
+
|
29
|
+
# Extract table
|
30
|
+
table_result = guides.extract_table(include_outer_boundaries=True)
|
31
|
+
df = table_result.to_df()
|
32
|
+
|
33
|
+
print(f"\nTable shape: {df.shape}")
|
34
|
+
print("\nFirst few rows:")
|
35
|
+
print(df.head())
|
36
|
+
|
37
|
+
# Check if excluded content is present
|
38
|
+
print("\nChecking for excluded content...")
|
39
|
+
table_text = df.to_string()
|
40
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_text
|
41
|
+
has_page_num = ("Page 1 of" in table_text) or ("Page" in table_text and "of" in table_text)  # NOTE: second test is loose; any "Page" plus any "of" substring counts
|
42
|
+
|
43
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
44
|
+
print(f"Contains 'Page X of Y': {has_page_num}")
|
45
|
+
|
46
|
+
if has_alphabetic or has_page_num:
|
47
|
+
print("\n❌ FAILED: Exclusions not properly applied")
|
48
|
+
else:
|
49
|
+
print("\n✅ SUCCESS: Exclusions properly applied with Element returns!")
|
50
|
+
|
51
|
+
# Debug: Check exclusion regions with verbose output
|
52
|
+
print("\n\nDebug: Checking exclusion regions...")
|
53
|
+
exclusions = page._get_exclusion_regions(debug=True)
|
54
|
+
print(f"\nTotal exclusion regions: {len(exclusions)}")
|
55
|
+
for i, exc in enumerate(exclusions):
|
56
|
+
print(f" Exclusion {i}: {exc.bbox} (label: {getattr(exc, 'label', 'None')})")
|