natural-pdf 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/guides.py +196 -43
- natural_pdf/core/page.py +56 -8
- natural_pdf/elements/region.py +5 -3
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/RECORD +29 -9
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/top_level.txt +1 -0
- temp/debug_cell_extraction.py +42 -0
- temp/debug_exclusion_overlap.py +43 -0
- temp/debug_exclusions_guides.py +67 -0
- temp/debug_extra_guide.py +41 -0
- temp/debug_outer_boundaries.py +46 -0
- temp/debug_st_search.py +33 -0
- temp/fix_page_exclusions.py +42 -0
- temp/test_exclusion_with_debug.py +30 -0
- temp/test_find_exclusions_fix.py +53 -0
- temp/test_find_exclusions_fix_no_recursion.py +97 -0
- temp/test_fix_real_pdf.py +48 -0
- temp/test_fix_working.py +55 -0
- temp/test_fixed_pdf_exclusions.py +67 -0
- temp/test_horizontal_top_bottom.py +53 -0
- temp/test_marker_order.py +45 -0
- temp/test_original_exclusions_now_work.py +56 -0
- temp/test_pdf_exclusions_with_guides.py +84 -0
- temp/test_region_exclusions_detailed.py +25 -0
- temp/test_stripes_real_pdf.py +62 -0
- temp/test_vertical_stripes.py +55 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.11.dist-info → natural_pdf-0.2.12.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
"""Test marker ordering with a real PDF."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
|
4
|
+
# Load a test PDF
|
5
|
+
pdf = PDF("pdfs/01-practice.pdf")
|
6
|
+
page = pdf.pages[0]
|
7
|
+
|
8
|
+
# Find some text elements to use as markers
|
9
|
+
all_text = page.find_all("text")
|
10
|
+
print("Sample text elements:")
|
11
|
+
for i, elem in enumerate(all_text[:10]):
|
12
|
+
print(f"{i}: '{elem.text}' at x={elem.x0:.1f}")
|
13
|
+
|
14
|
+
# Create guides with markers in specific order
|
15
|
+
print("\n--- Testing vertical guides with markers ---")
|
16
|
+
|
17
|
+
# Let's find specific text elements at different positions
|
18
|
+
violations_elem = page.find('text:contains("Violations")')
|
19
|
+
date_elem = page.find('text:contains("Date")')
|
20
|
+
total_elem = page.find('text:contains("Total")')
|
21
|
+
|
22
|
+
if violations_elem and date_elem and total_elem:
|
23
|
+
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("\nElement positions:")
|
24
|
+
print(f"'Violations' at x={violations_elem.x0:.1f}")
|
25
|
+
print(f"'Date' at x={date_elem.x0:.1f}")
|
26
|
+
print(f"'Total' at x={total_elem.x0:.1f}")
|
27
|
+
|
28
|
+
# Create guides with markers in a specific order
|
29
|
+
guides = page.guides.from_content(
|
30
|
+
axis="vertical",
|
31
|
+
markers=["Violations", "Date", "Total"],
|
32
|
+
align="left",
|
33
|
+
outer=True
|
34
|
+
)
|
35
|
+
|
36
|
+
print(f"\nResulting vertical guides: {sorted(guides.vertical)}")
|
37
|
+
print(f"Page bounds: {page.bbox}")
|
38
|
+
|
39
|
+
# Show visually
|
40
|
+
page.guides.vertical.from_content(
|
41
|
+
markers=["Violations", "Date", "Total"],
|
42
|
+
align="left",
|
43
|
+
outer=True
|
44
|
+
)
|
45
|
+
page.show(guides=True)
|
@@ -0,0 +1,56 @@
|
|
1
|
+
"""Test that the original exclusion lambdas (returning Elements) now work."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
|
8
|
+
# Add the ORIGINAL PDF-level exclusions that return Elements (not Regions)
|
9
|
+
print("Adding original PDF-level exclusions (returning Elements)...")
|
10
|
+
pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
|
11
|
+
# Raw string keeps the regex's `\d` literal without relying on Python's
# handling of unrecognised escapes (a SyntaxWarning on recent versions).
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
|
12
|
+
|
13
|
+
page = pdf.pages[0]
|
14
|
+
|
15
|
+
# Create guides and extract table
|
16
|
+
print("\nCreating guides and extracting table...")
|
17
|
+
headers = (
|
18
|
+
page
|
19
|
+
.find("text:contains(NUMBER)")
|
20
|
+
.right(include_source=True)
|
21
|
+
.expand(top=3, bottom=3)
|
22
|
+
.find_all('text')
|
23
|
+
)
|
24
|
+
|
25
|
+
guides = Guides(page)
|
26
|
+
guides.vertical.from_content(headers, align='left')
|
27
|
+
guides.horizontal.from_stripes()
|
28
|
+
|
29
|
+
# Extract table
|
30
|
+
table_result = guides.extract_table(include_outer_boundaries=True)
|
31
|
+
df = table_result.to_df()
|
32
|
+
|
33
|
+
print(f"\nTable shape: {df.shape}")
|
34
|
+
print("\nFirst few rows:")
|
35
|
+
print(df.head())
|
36
|
+
|
37
|
+
# Check if excluded content is present
|
38
|
+
print("\nChecking for excluded content...")
|
39
|
+
table_text = df.to_string()
|
40
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_text
|
41
|
+
# NOTE(review): `and` binds tighter than `or`, so this evaluates as
# A or (B and C). The first membership test is subsumed by the other two
# (any text containing "Page 1 of" also contains "Page" and "of"), so the
# result is True whenever both "Page" and "of" occur anywhere in the table.
has_page_num = "Page 1 of" in table_text or "Page" in table_text and "of" in table_text
|
42
|
+
|
43
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
44
|
+
print(f"Contains 'Page X of Y': {has_page_num}")
|
45
|
+
|
46
|
+
if has_alphabetic or has_page_num:
|
47
|
+
print("\n❌ FAILED: Exclusions not properly applied")
|
48
|
+
else:
|
49
|
+
print("\n✅ SUCCESS: Exclusions properly applied with Element returns!")
|
50
|
+
|
51
|
+
# Debug: Check exclusion regions with verbose output
|
52
|
+
print("\n\nDebug: Checking exclusion regions...")
|
53
|
+
exclusions = page._get_exclusion_regions(debug=True)
|
54
|
+
print(f"\nTotal exclusion regions: {len(exclusions)}")
|
55
|
+
for i, exc in enumerate(exclusions):
|
56
|
+
print(f" Exclusion {i}: {exc.bbox} (label: {getattr(exc, 'label', 'None')})")
|
@@ -0,0 +1,84 @@
|
|
1
|
+
"""Test that PDF-level exclusions work with guides.extract_table()."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
page = pdf.pages[0]
|
8
|
+
|
9
|
+
# Add PDF-level exclusions
|
10
|
+
print("Adding PDF-level exclusions...")
|
11
|
+
pdf.add_exclusion(lambda page: page.find("text:contains(PREMISE)").above())
|
12
|
+
# Raw string keeps the regex's `\d` literal without relying on Python's
# handling of unrecognised escapes (a SyntaxWarning on recent versions).
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)"))
|
13
|
+
|
14
|
+
# Create guides and extract table
|
15
|
+
print("\nCreating guides and extracting table...")
|
16
|
+
headers = (
|
17
|
+
page
|
18
|
+
.find("text:contains(NUMBER)")
|
19
|
+
.right(include_source=True)
|
20
|
+
.expand(top=3, bottom=3)
|
21
|
+
.find_all('text')
|
22
|
+
)
|
23
|
+
|
24
|
+
guides = Guides(page)
|
25
|
+
guides.vertical.from_content(headers, align='left')
|
26
|
+
guides.horizontal.from_stripes()
|
27
|
+
|
28
|
+
# Extract table with apply_exclusions=True (default)
|
29
|
+
table_result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True)
|
30
|
+
df = table_result.to_df()
|
31
|
+
|
32
|
+
print(f"\nTable shape: {df.shape}")
|
33
|
+
print("\nFirst few rows:")
|
34
|
+
print(df.head())
|
35
|
+
|
36
|
+
# Check if excluded content is present
|
37
|
+
print("\nChecking for excluded content...")
|
38
|
+
table_text = df.to_string()
|
39
|
+
has_alphabetic = "ALPHABETIC LISTING" in table_text
|
40
|
+
# NOTE(review): `and` binds tighter than `or`, so this evaluates as
# A or (B and C). The first membership test is subsumed by the other two
# (any text containing "Page 1 of" also contains "Page" and "of"), so the
# result is True whenever both "Page" and "of" occur anywhere in the table.
has_page_num = "Page 1 of" in table_text or "Page" in table_text and "of" in table_text
|
41
|
+
|
42
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
43
|
+
print(f"Contains 'Page X of Y': {has_page_num}")
|
44
|
+
|
45
|
+
if has_alphabetic or has_page_num:
|
46
|
+
print("\n❌ FAILED: Exclusions not properly applied")
|
47
|
+
else:
|
48
|
+
print("\n✅ SUCCESS: Exclusions properly applied")
|
49
|
+
|
50
|
+
# Now test with page-level exclusions for comparison
|
51
|
+
print("\n\nTesting with page-level exclusions...")
|
52
|
+
page2 = pdf.pages[0]
|
53
|
+
header = page2.find("text:contains(PREMISE)").above()
|
54
|
+
# Raw string keeps the regex's `\d` literal without relying on Python's
# handling of unrecognised escapes (a SyntaxWarning on recent versions).
footer = page2.find(r"text:regex(Page \d+ of)")
|
55
|
+
|
56
|
+
if header:
|
57
|
+
page2.add_exclusion(header)
|
58
|
+
if footer:
|
59
|
+
page2.add_exclusion(footer)
|
60
|
+
|
61
|
+
guides2 = Guides(page2)
|
62
|
+
guides2.vertical.from_content(headers, align='left')
|
63
|
+
guides2.horizontal.from_stripes()
|
64
|
+
|
65
|
+
table_result2 = guides2.extract_table(include_outer_boundaries=True)
|
66
|
+
df2 = table_result2.to_df()
|
67
|
+
|
68
|
+
print(f"\nTable shape with page exclusions: {df2.shape}")
|
69
|
+
table_text2 = df2.to_string()
|
70
|
+
has_alphabetic2 = "ALPHABETIC LISTING" in table_text2
|
71
|
+
# NOTE(review): `and` binds tighter than `or`, so this evaluates as
# A or (B and C). The first membership test is subsumed by the other two
# (any text containing "Page 1 of" also contains "Page" and "of"), so the
# result is True whenever both "Page" and "of" occur anywhere in the table.
has_page_num2 = "Page 1 of" in table_text2 or "Page" in table_text2 and "of" in table_text2
|
72
|
+
|
73
|
+
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic2}")
|
74
|
+
print(f"Contains 'Page X of Y': {has_page_num2}")
|
75
|
+
|
76
|
+
if has_alphabetic2 or has_page_num2:
|
77
|
+
print("\n❌ FAILED: Page exclusions not properly applied")
|
78
|
+
else:
|
79
|
+
print("\n✅ SUCCESS: Page exclusions properly applied")
|
80
|
+
|
81
|
+
# Debug: Check exclusion regions
|
82
|
+
print("\n\nDebug: Checking exclusion regions...")
|
83
|
+
exclusions = page._get_exclusion_regions(debug=True)
|
84
|
+
print(f"\nTotal exclusion regions: {len(exclusions)}")
|
@@ -0,0 +1,25 @@
|
|
1
|
+
"""Test region exclusions with detailed debugging"""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
|
4
|
+
pdf = PDF("pdfs/m27.pdf")
|
5
|
+
page = pdf.pages[0]
|
6
|
+
|
7
|
+
# Add exclusion
|
8
|
+
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
9
|
+
|
10
|
+
# Get page exclusions
|
11
|
+
print("Page exclusions:")
|
12
|
+
print(f"page._exclusions: {page._exclusions}")
|
13
|
+
print(f"pdf._exclusions: {pdf._exclusions}")
|
14
|
+
|
15
|
+
# Create a region in the excluded area
|
16
|
+
test_region = page.region(0, 0, 200, 50)
|
17
|
+
print(f"\nTest region: {test_region.bbox}")
|
18
|
+
print(f"Region's page: {test_region.page}")
|
19
|
+
print(f"Region's _page: {test_region._page}")
|
20
|
+
print(f"Region's _page._exclusions: {test_region._page._exclusions}")
|
21
|
+
|
22
|
+
# Try extraction with debug
|
23
|
+
print("\nExtracting with debug=True:")
|
24
|
+
text = test_region.extract_text(apply_exclusions=True, debug=True)
|
25
|
+
print(f"Result: '{text}'")
|
@@ -0,0 +1,62 @@
|
|
1
|
+
"""Test from_stripes with a real PDF that has striped rows."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Load the PDF (assuming m27.pdf has the striped table)
|
6
|
+
pdf = PDF("pdfs/m27.pdf")
|
7
|
+
page = pdf.pages[0]
|
8
|
+
|
9
|
+
# Test 1: Manual selection with specific color
|
10
|
+
print("Test 1: Manual selection with color='#00ffff'")
|
11
|
+
guides = Guides(page)
|
12
|
+
guides.horizontal.from_stripes(color='#00ffff')
|
13
|
+
|
14
|
+
print(f"Found {len(guides.horizontal)} horizontal guides")
|
15
|
+
if guides.horizontal:
|
16
|
+
print(f"Guide range: {min(guides.horizontal):.2f} to {max(guides.horizontal):.2f}")
|
17
|
+
print(f"First 5 guides: {sorted(guides.horizontal)[:5]}")
|
18
|
+
|
19
|
+
# Test 2: Auto-detect most common stripe color
|
20
|
+
print("\nTest 2: Auto-detect stripes")
|
21
|
+
guides2 = Guides(page)
|
22
|
+
guides2.horizontal.from_stripes()
|
23
|
+
|
24
|
+
print(f"Found {len(guides2.horizontal)} horizontal guides")
|
25
|
+
if guides2.horizontal:
|
26
|
+
print(f"Guide range: {min(guides2.horizontal):.2f} to {max(guides2.horizontal):.2f}")
|
27
|
+
|
28
|
+
# Test 3: Manual selection of stripes
|
29
|
+
print("\nTest 3: Manual selection of stripe elements")
|
30
|
+
cyan_stripes = page.find_all('rect[fill=#00ffff]')
|
31
|
+
print(f"Found {len(cyan_stripes)} cyan rectangles")
|
32
|
+
|
33
|
+
if cyan_stripes:
|
34
|
+
guides3 = Guides(page)
|
35
|
+
guides3.horizontal.from_stripes(cyan_stripes)
|
36
|
+
print(f"Created {len(guides3.horizontal)} guides from stripes")
|
37
|
+
|
38
|
+
# Show how this captures both edges of each stripe
|
39
|
+
print("\nFirst stripe edges:")
|
40
|
+
first_stripe = cyan_stripes[0]
|
41
|
+
print(f" Stripe at y={first_stripe.top:.2f} to {first_stripe.bottom:.2f}")
|
42
|
+
print(f" Guides include: {first_stripe.top:.2f} in guides? {first_stripe.top in guides3.horizontal}")
|
43
|
+
print(f" Guides include: {first_stripe.bottom:.2f} in guides? {first_stripe.bottom in guides3.horizontal}")
|
44
|
+
|
45
|
+
# Test 4: Compare with traditional approach
|
46
|
+
print("\nComparison with traditional from_content approach:")
|
47
|
+
# Traditional way would only get one edge per stripe
|
48
|
+
guides4 = Guides(page)
|
49
|
+
guides4.horizontal.from_content(cyan_stripes, align='top', outer=False)
|
50
|
+
print(f"from_content (top only): {len(guides4.horizontal)} guides")
|
51
|
+
|
52
|
+
guides5 = Guides(page)
|
53
|
+
guides5.horizontal.from_content(cyan_stripes, align='bottom', outer=False)
|
54
|
+
print(f"from_content (bottom only): {len(guides5.horizontal)} guides")
|
55
|
+
|
56
|
+
print(f"from_stripes (both edges): {len(guides3.horizontal)} guides")
|
57
|
+
|
58
|
+
# Verify we get approximately 2x guides with from_stripes
|
59
|
+
if len(cyan_stripes) > 0:
|
60
|
+
# Count distinct stripe edges (top and bottom of every stripe) in one pass,
# without building two intermediate lists first.
expected_guides = len({edge for s in cyan_stripes for edge in (s.top, s.bottom)})
|
61
|
+
print(f"\nExpected unique edges: {expected_guides}")
|
62
|
+
print(f"Actual from_stripes: {len(guides3.horizontal)}")
|
@@ -0,0 +1,55 @@
|
|
1
|
+
"""Test from_stripes with vertical stripes (column backgrounds)."""
|
2
|
+
from natural_pdf import PDF
|
3
|
+
from natural_pdf.analyzers.guides import Guides
|
4
|
+
|
5
|
+
# Create a mock page with vertical stripes (simulating column backgrounds)
|
6
|
+
class MockRect:
|
7
|
+
def __init__(self, fill, x0, x1, top, bottom):
|
8
|
+
self.fill = fill
|
9
|
+
self.x0 = x0
|
10
|
+
self.x1 = x1
|
11
|
+
self.top = top
|
12
|
+
self.bottom = bottom
|
13
|
+
|
14
|
+
class MockPage:
|
15
|
+
def __init__(self):
|
16
|
+
self.bbox = (0, 0, 800, 600)
|
17
|
+
# Create vertical stripes for alternating columns
|
18
|
+
self.stripes = [
|
19
|
+
MockRect('#f0f0f0', 100, 200, 0, 600), # Column 1 background
|
20
|
+
MockRect('#f0f0f0', 300, 400, 0, 600), # Column 3 background
|
21
|
+
MockRect('#f0f0f0', 500, 600, 0, 600), # Column 5 background
|
22
|
+
]
|
23
|
+
|
24
|
+
def find_all(self, selector):
|
25
|
+
if selector == 'rect[fill=#f0f0f0]':
|
26
|
+
return self.stripes
|
27
|
+
elif selector == 'rect[fill]':
|
28
|
+
# Include some other rects too
|
29
|
+
return self.stripes + [
|
30
|
+
MockRect('#ffffff', 0, 100, 0, 600),
|
31
|
+
MockRect('#ffffff', 200, 300, 0, 600),
|
32
|
+
]
|
33
|
+
return []
|
34
|
+
|
35
|
+
# Test vertical stripes
|
36
|
+
page = MockPage()
|
37
|
+
guides = Guides(page)
|
38
|
+
|
39
|
+
print("Testing vertical stripes (column backgrounds)")
|
40
|
+
guides.vertical.from_stripes(color='#f0f0f0')
|
41
|
+
|
42
|
+
print(f"\nFound {len(guides.vertical)} vertical guides")
|
43
|
+
print(f"Guides at: {sorted(guides.vertical)}")
|
44
|
+
|
45
|
+
# Verify we got both edges of each stripe
|
46
|
+
expected = [100, 200, 300, 400, 500, 600]
|
47
|
+
print(f"\nExpected: {expected}")
|
48
|
+
print(f"Match: {sorted(guides.vertical) == expected}")
|
49
|
+
|
50
|
+
# Test auto-detection
|
51
|
+
guides2 = Guides(page)
|
52
|
+
guides2.vertical.from_stripes() # Should auto-detect the gray stripes
|
53
|
+
|
54
|
+
print(f"\nAuto-detect found {len(guides2.vertical)} guides")
|
55
|
+
print(f"Same result: {sorted(guides2.vertical) == sorted(guides.vertical)}")
|
File without changes
|
File without changes
|
File without changes
|