natural-pdf 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/core/highlighting_service.py +40 -10
- natural_pdf/elements/base.py +15 -1
- natural_pdf/elements/region.py +32 -2
- natural_pdf/vision/__init__.py +1 -2
- natural_pdf/vision/mixin.py +67 -27
- natural_pdf/vision/results.py +49 -5
- natural_pdf/vision/similarity.py +195 -23
- natural_pdf/vision/template_matching.py +209 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/RECORD +20 -19
- temp/test_draw_guides.py +25 -0
- temp/test_draw_guides_interactive.py +30 -0
- temp/test_guide_draw_notebook.py +47 -0
- temp/test_inline_js.py +22 -0
- temp/test_widget_functionality.py +68 -0
- temp/test_widget_simple.py +41 -0
- temp/debug_cell_extraction.py +0 -42
- temp/debug_exclusion_overlap.py +0 -43
- temp/debug_exclusions_guides.py +0 -67
- temp/debug_extra_guide.py +0 -41
- temp/debug_outer_boundaries.py +0 -46
- temp/debug_st_search.py +0 -33
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.12.dist-info → natural_pdf-0.2.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,30 @@
|
|
1
|
+
"""Test the interactive guide drawing functionality"""
|
2
|
+
|
3
|
+
from natural_pdf.core.pdf import PDF
|
4
|
+
from natural_pdf.analyzers.guides import Guides
|
5
|
+
|
6
|
+
# Load a sample PDF
|
7
|
+
pdf = PDF("tests/sample_pdfs/simple_table.pdf")
|
8
|
+
page = pdf.pages[0]
|
9
|
+
|
10
|
+
# Create guides
|
11
|
+
guides = Guides(page)
|
12
|
+
|
13
|
+
# Add some initial guides for testing
|
14
|
+
guides.vertical.from_content()
|
15
|
+
guides.horizontal.from_lines(n=5)
|
16
|
+
|
17
|
+
print("Initial vertical guides:", list(guides.vertical))
|
18
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
19
|
+
|
20
|
+
# This would open the interactive widget in Jupyter
|
21
|
+
# guides.vertical.draw()
|
22
|
+
|
23
|
+
# For non-Jupyter testing, we can check the method exists
|
24
|
+
assert hasattr(guides.vertical, 'draw')
|
25
|
+
assert callable(guides.vertical.draw)
|
26
|
+
|
27
|
+
print("\nSuccess! The draw() method is available on GuidesList objects.")
|
28
|
+
print("To use it interactively, run this in a Jupyter notebook:")
|
29
|
+
print(" guides.vertical.draw()")
|
30
|
+
print(" guides.horizontal.draw(width=600)")
|
@@ -0,0 +1,47 @@
|
|
1
|
+
"""Test script to verify the draw() method works"""
|
2
|
+
|
3
|
+
import sys
|
4
|
+
sys.path.insert(0, '.')
|
5
|
+
|
6
|
+
from natural_pdf.analyzers.guides import GuidesList, Guides
|
7
|
+
|
8
|
+
# Create a mock context for testing
|
9
|
+
class MockContext:
|
10
|
+
def __init__(self):
|
11
|
+
self.width = 600
|
12
|
+
self.height = 800
|
13
|
+
|
14
|
+
def render(self, resolution=150):
|
15
|
+
# Create a simple test image
|
16
|
+
from PIL import Image
|
17
|
+
img = Image.new('RGB', (int(self.width * resolution/72), int(self.height * resolution/72)), 'white')
|
18
|
+
return img
|
19
|
+
|
20
|
+
# Test that the draw method exists
|
21
|
+
mock_context = MockContext()
|
22
|
+
guides = Guides(mock_context)
|
23
|
+
|
24
|
+
# Add some test guides
|
25
|
+
guides.vertical.data = [100, 200, 300, 400, 500]
|
26
|
+
guides.horizontal.data = [150, 350, 550, 750]
|
27
|
+
|
28
|
+
print("Initial vertical guides:", list(guides.vertical))
|
29
|
+
print("Initial horizontal guides:", list(guides.horizontal))
|
30
|
+
|
31
|
+
# Check that draw method exists
|
32
|
+
assert hasattr(guides.vertical, 'draw')
|
33
|
+
assert callable(guides.vertical.draw)
|
34
|
+
assert hasattr(guides.horizontal, 'draw')
|
35
|
+
assert callable(guides.horizontal.draw)
|
36
|
+
|
37
|
+
print("\nSuccess! The draw() method is available.")
|
38
|
+
print("\nIn a Jupyter notebook, you would use:")
|
39
|
+
print(" guides.vertical.draw() # Interactive vertical guide editor")
|
40
|
+
print(" guides.horizontal.draw() # Interactive horizontal guide editor")
|
41
|
+
print("\nFeatures:")
|
42
|
+
print(" - Click to add new guides")
|
43
|
+
print(" - Click existing guides to select them")
|
44
|
+
print(" - Drag to move guides")
|
45
|
+
print(" - Delete key to remove selected guide")
|
46
|
+
print(" - Arrow keys to fine-tune position")
|
47
|
+
print(" - Enter to apply, Escape to cancel")
|
temp/test_inline_js.py
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
"""Test inline JavaScript in HTML widget"""
|
2
|
+
|
3
|
+
import ipywidgets as widgets
|
4
|
+
from IPython.display import display
|
5
|
+
|
6
|
+
# Create an HTML widget with inline JavaScript
|
7
|
+
html_content = '''
|
8
|
+
<div id="test-div">Click me!</div>
|
9
|
+
<script type="text/javascript">
|
10
|
+
document.getElementById('test-div').addEventListener('click', function() {
|
11
|
+
alert('Clicked!');
|
12
|
+
this.innerHTML = 'Clicked at ' + new Date().toLocaleTimeString();
|
13
|
+
});
|
14
|
+
console.log('JavaScript is running!');
|
15
|
+
</script>
|
16
|
+
'''
|
17
|
+
|
18
|
+
# Display using widgets.HTML
|
19
|
+
html_widget = widgets.HTML(value=html_content)
|
20
|
+
display(html_widget)
|
21
|
+
|
22
|
+
print("If you see 'Click me!' above and can click it, JavaScript is working.")
|
@@ -0,0 +1,68 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""Test the guide widget functionality"""
|
3
|
+
|
4
|
+
import sys
|
5
|
+
import os
|
6
|
+
|
7
|
+
# Add parent directory to path
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
# Test importing and basic functionality
|
11
|
+
try:
|
12
|
+
from natural_pdf.analyzers.guides import InteractiveGuideWidget, GuidesList, _GUIDE_WIDGET_AVAILABLE
|
13
|
+
print("✓ Successfully imported InteractiveGuideWidget")
|
14
|
+
|
15
|
+
if _GUIDE_WIDGET_AVAILABLE:
|
16
|
+
print("✓ ipywidgets is available")
|
17
|
+
|
18
|
+
# Create a mock GuidesList for testing
|
19
|
+
class MockPage:
|
20
|
+
def __init__(self):
|
21
|
+
self.bbox = (0, 0, 595, 842) # A4 page size in points
|
22
|
+
|
23
|
+
def render(self, resolution=150):
|
24
|
+
# Mock render method
|
25
|
+
from PIL import Image
|
26
|
+
width = int(595 * resolution / 72)
|
27
|
+
height = int(842 * resolution / 72)
|
28
|
+
return Image.new('RGB', (width, height), color='white')
|
29
|
+
|
30
|
+
class MockGuides:
|
31
|
+
def __init__(self):
|
32
|
+
self.context = MockPage()
|
33
|
+
|
34
|
+
class MockGuidesList:
|
35
|
+
def __init__(self):
|
36
|
+
self.data = [100, 200, 300]
|
37
|
+
self._axis = 'vertical'
|
38
|
+
self._parent = MockGuides()
|
39
|
+
|
40
|
+
# Test creating the widget
|
41
|
+
mock_guides = MockGuidesList()
|
42
|
+
try:
|
43
|
+
widget = InteractiveGuideWidget(mock_guides)
|
44
|
+
print("✓ Successfully created InteractiveGuideWidget instance")
|
45
|
+
print(f" - Widget ID: {widget.widget_id}")
|
46
|
+
print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")
|
47
|
+
|
48
|
+
# Check if the widget has the expected methods
|
49
|
+
expected_methods = ['_generate_content', 'update_guides']
|
50
|
+
for method in expected_methods:
|
51
|
+
if hasattr(widget, method):
|
52
|
+
print(f" - Has method: {method}")
|
53
|
+
else:
|
54
|
+
print(f" - Missing method: {method}")
|
55
|
+
|
56
|
+
except Exception as e:
|
57
|
+
print(f"✗ Error creating widget: {e}")
|
58
|
+
|
59
|
+
else:
|
60
|
+
print("⚠ ipywidgets not available - widget functionality disabled")
|
61
|
+
|
62
|
+
except ImportError as e:
|
63
|
+
print(f"✗ Import error: {e}")
|
64
|
+
|
65
|
+
except Exception as e:
|
66
|
+
print(f"✗ Unexpected error: {e}")
|
67
|
+
|
68
|
+
print("\nWidget implementation test complete!")
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""Simple test for the guide widget"""
|
3
|
+
|
4
|
+
import sys
|
5
|
+
import os
|
6
|
+
|
7
|
+
# Add parent directory to path
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
# Test importing the module
|
11
|
+
try:
|
12
|
+
from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
|
13
|
+
print(f"✓ Module imported successfully")
|
14
|
+
print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")
|
15
|
+
|
16
|
+
if _GUIDE_WIDGET_AVAILABLE:
|
17
|
+
print("✓ ipywidgets is installed and InteractiveGuideWidget is available")
|
18
|
+
else:
|
19
|
+
print("✗ ipywidgets is not installed")
|
20
|
+
|
21
|
+
except ImportError as e:
|
22
|
+
print(f"✗ Import error: {e}")
|
23
|
+
sys.exit(1)
|
24
|
+
|
25
|
+
# Check if we can create the widget class
|
26
|
+
if _GUIDE_WIDGET_AVAILABLE:
|
27
|
+
try:
|
28
|
+
# We can't actually instantiate it without a GuidesList, but we can check the class exists
|
29
|
+
print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
|
30
|
+
print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")
|
31
|
+
|
32
|
+
# Check methods
|
33
|
+
methods = [m for m in dir(InteractiveGuideWidget) if not m.startswith('_')]
|
34
|
+
print(f"✓ Public methods: {methods}")
|
35
|
+
|
36
|
+
except Exception as e:
|
37
|
+
print(f"✗ Error checking widget class: {e}")
|
38
|
+
else:
|
39
|
+
print("⚠ Skipping widget checks as ipywidgets is not available")
|
40
|
+
|
41
|
+
print("\nAll checks passed!")
|
temp/debug_cell_extraction.py
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
"""Debug cell text extraction with exclusions"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Add exclusions
|
9
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
-
|
11
|
-
# Check exclusions are registered
|
12
|
-
print("Exclusions on page:")
|
13
|
-
exclusions = page._get_exclusion_regions(debug=True)
|
14
|
-
|
15
|
-
# Create guides and build grid
|
16
|
-
headers = page.find(text="NUMBER").right(include_source=True).expand(top=3, bottom=3).find_all('text')
|
17
|
-
guides = Guides(page)
|
18
|
-
guides.vertical.from_content(headers, align='left')
|
19
|
-
guides.horizontal.from_stripes()
|
20
|
-
|
21
|
-
# Build grid and get cells
|
22
|
-
grid_result = guides.build_grid(include_outer_boundaries=True)
|
23
|
-
cells = grid_result["regions"]["cells"]
|
24
|
-
|
25
|
-
print(f"\nTotal cells: {len(cells)}")
|
26
|
-
|
27
|
-
# Check first row cells (these should be in excluded area)
|
28
|
-
first_row_cells = [c for c in cells if c.bbox[1] < 90] # y < 90
|
29
|
-
print(f"\nFirst row cells: {len(first_row_cells)}")
|
30
|
-
|
31
|
-
for i, cell in enumerate(first_row_cells[:3]):
|
32
|
-
print(f"\nCell {i}:")
|
33
|
-
print(f" Bbox: {cell.bbox}")
|
34
|
-
print(f" Raw text: {repr(cell.extract_text(apply_exclusions=False))}")
|
35
|
-
print(f" With exclusions: {repr(cell.extract_text(apply_exclusions=True))}")
|
36
|
-
|
37
|
-
# Now test the full table extraction
|
38
|
-
print("\n\nFull table extraction:")
|
39
|
-
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
40
|
-
df = result.to_df()
|
41
|
-
print("\nFirst row of dataframe:")
|
42
|
-
print(df.iloc[0].to_dict() if not df.empty else "Empty")
|
temp/debug_exclusion_overlap.py
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
"""Debug how exclusions work with overlapping regions"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Add exclusion
|
9
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above(), label="header")
|
10
|
-
|
11
|
-
# Get the exclusion region
|
12
|
-
exclusions = page._get_exclusion_regions()
|
13
|
-
excl_region = exclusions[0]
|
14
|
-
print(f"Exclusion region: {excl_region.bbox}")
|
15
|
-
print(f"Exclusion bottom: {excl_region.bbox[3]}")
|
16
|
-
|
17
|
-
# Create a test cell that overlaps the exclusion
|
18
|
-
# Cell 1 from before: (32.06, 0.5, 73.18288, 79.53999999999996)
|
19
|
-
test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)
|
20
|
-
|
21
|
-
print(f"\nTest cell: {test_cell.bbox}")
|
22
|
-
print(f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}")
|
23
|
-
|
24
|
-
# Extract text from different y-ranges
|
25
|
-
print("\nText in different parts of the cell:")
|
26
|
-
|
27
|
-
# Part above exclusion line (should be empty)
|
28
|
-
upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
|
29
|
-
print(f"Upper part (0.5 to 59.12): '{upper_part.extract_text(apply_exclusions=True)}'")
|
30
|
-
|
31
|
-
# Part below exclusion line (should have text)
|
32
|
-
lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
|
33
|
-
print(f"Lower part (59.12 to 79.54): '{lower_part.extract_text()}'")
|
34
|
-
|
35
|
-
# The whole cell
|
36
|
-
print(f"Whole cell with exclusions: '{test_cell.extract_text(apply_exclusions=True)}'")
|
37
|
-
print(f"Whole cell without exclusions: '{test_cell.extract_text(apply_exclusions=False)}'")
|
38
|
-
|
39
|
-
# Check what text elements are in this region
|
40
|
-
print("\nText elements in cell:")
|
41
|
-
cell_texts = test_cell.find_all('text')
|
42
|
-
for t in cell_texts[:5]:
|
43
|
-
print(f" '{t.text}' at y={t.top:.2f}-{t.bottom:.2f}")
|
temp/debug_exclusions_guides.py
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
"""Debug why exclusions aren't working with guides.extract_table()"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Check initial text
|
9
|
-
print("Initial text:")
|
10
|
-
print(page.extract_text()[:200])
|
11
|
-
print()
|
12
|
-
|
13
|
-
# Add exclusions
|
14
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
|
15
|
-
pdf.add_exclusion(lambda page: page.find("text:regex(Page \d+ of)").expand())
|
16
|
-
|
17
|
-
# Check text after exclusions
|
18
|
-
print("Text after exclusions:")
|
19
|
-
print(page.extract_text()[:100])
|
20
|
-
print()
|
21
|
-
|
22
|
-
# Debug exclusion regions
|
23
|
-
print("Checking exclusion regions:")
|
24
|
-
exclusions = page._get_exclusion_regions(debug=True)
|
25
|
-
print(f"\nTotal exclusions: {len(exclusions)}")
|
26
|
-
for i, exc in enumerate(exclusions):
|
27
|
-
print(f" {i}: {exc.bbox}")
|
28
|
-
print()
|
29
|
-
|
30
|
-
# Create guides
|
31
|
-
headers = (
|
32
|
-
page
|
33
|
-
.find(text="NUMBER")
|
34
|
-
.right(include_source=True)
|
35
|
-
.expand(top=3, bottom=3)
|
36
|
-
.find_all('text')
|
37
|
-
)
|
38
|
-
|
39
|
-
guides = Guides(page)
|
40
|
-
guides.vertical.from_content(headers, align='left')
|
41
|
-
guides.horizontal.from_stripes()
|
42
|
-
|
43
|
-
# Build grid to see what regions are created
|
44
|
-
print("\nBuilding grid...")
|
45
|
-
grid_result = guides.build_grid(include_outer_boundaries=True)
|
46
|
-
table_region = grid_result["regions"]["table"]
|
47
|
-
print(f"Table region: {table_region}")
|
48
|
-
print(f"Table bbox: {table_region.bbox if table_region else 'None'}")
|
49
|
-
|
50
|
-
# Check if table region respects exclusions
|
51
|
-
if table_region:
|
52
|
-
print("\nExtracting text from table region directly:")
|
53
|
-
table_text = table_region.extract_text()[:200]
|
54
|
-
print(f"Table text: {table_text}")
|
55
|
-
|
56
|
-
# Now extract table
|
57
|
-
print("\nExtracting table with apply_exclusions=True:")
|
58
|
-
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
59
|
-
df = result.to_df()
|
60
|
-
print(df.head())
|
61
|
-
|
62
|
-
# Check if excluded content is in the table
|
63
|
-
table_str = df.to_string()
|
64
|
-
has_feb = "FEBRUARY 2014" in table_str or "FEBR" in table_str
|
65
|
-
has_alphabetic = "ALPHABETIC LISTING" in table_str
|
66
|
-
print(f"\nContains 'FEBRUARY': {has_feb}")
|
67
|
-
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
|
temp/debug_extra_guide.py
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
"""Debug the extra guide issue."""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Get headers
|
9
|
-
headers = (
|
10
|
-
page
|
11
|
-
.find("text:contains(NUMBER)")
|
12
|
-
.right(include_source=True)
|
13
|
-
.expand(top=3, bottom=3)
|
14
|
-
.find_all('text')
|
15
|
-
)
|
16
|
-
|
17
|
-
print("Headers 3-5:")
|
18
|
-
for i, h in enumerate(headers[3:5]):
|
19
|
-
print(f" {i}: '{h.text}' bbox={h.bbox}")
|
20
|
-
|
21
|
-
# Create guides with just these two headers
|
22
|
-
guides = Guides(page)
|
23
|
-
guides.vertical.from_content(headers[3:5], align='left', outer=False)
|
24
|
-
|
25
|
-
print(f"\nResulting guides: {guides.vertical}")
|
26
|
-
print(f"Expected: [328.32012, 539.63316]")
|
27
|
-
|
28
|
-
# Let's also check what happens with each header individually
|
29
|
-
print("\nTesting each header individually:")
|
30
|
-
for i, h in enumerate(headers[3:5]):
|
31
|
-
g = Guides(page)
|
32
|
-
g.vertical.from_content([h], align='left', outer=False)
|
33
|
-
print(f" Header {i} guides: {g.vertical}")
|
34
|
-
|
35
|
-
# Check if it's related to the ElementCollection
|
36
|
-
print("\nTesting with manual list of text:")
|
37
|
-
text_list = [h.text for h in headers[3:5]]
|
38
|
-
print(f"Text list: {text_list}")
|
39
|
-
guides2 = Guides(page)
|
40
|
-
guides2.vertical.from_content(text_list, align='left', outer=False)
|
41
|
-
print(f"Guides from text list: {guides2.vertical}")
|
temp/debug_outer_boundaries.py
DELETED
@@ -1,46 +0,0 @@
|
|
1
|
-
"""Debug outer boundaries issue with exclusions"""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
from natural_pdf.analyzers.guides import Guides
|
4
|
-
|
5
|
-
pdf = PDF("pdfs/m27.pdf")
|
6
|
-
page = pdf.pages[0]
|
7
|
-
|
8
|
-
# Add exclusions
|
9
|
-
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
|
10
|
-
|
11
|
-
# Create guides
|
12
|
-
headers = (
|
13
|
-
page
|
14
|
-
.find(text="NUMBER")
|
15
|
-
.right(include_source=True)
|
16
|
-
.expand(top=3, bottom=3)
|
17
|
-
.find_all('text')
|
18
|
-
)
|
19
|
-
|
20
|
-
guides = Guides(page)
|
21
|
-
guides.vertical.from_content(headers, align='left')
|
22
|
-
guides.horizontal.from_stripes()
|
23
|
-
|
24
|
-
print("Horizontal guides (sorted):")
|
25
|
-
for i, h in enumerate(sorted(guides.horizontal)):
|
26
|
-
print(f" {i}: {h:.2f}")
|
27
|
-
|
28
|
-
print(f"\nFirst content guide: {sorted(guides.horizontal)[0]:.2f}")
|
29
|
-
print(f"Page height: {page.height}")
|
30
|
-
|
31
|
-
# Test without outer boundaries
|
32
|
-
print("\n\nWithout outer boundaries:")
|
33
|
-
result1 = guides.extract_table(include_outer_boundaries=False, apply_exclusions=True, header=False)
|
34
|
-
df1 = result1.to_df()
|
35
|
-
print(f"Shape: {df1.shape}")
|
36
|
-
print("First row, first column:", df1.iloc[0, 0] if not df1.empty else "Empty")
|
37
|
-
|
38
|
-
# Test with outer boundaries
|
39
|
-
print("\n\nWith outer boundaries:")
|
40
|
-
result2 = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
|
41
|
-
df2 = result2.to_df()
|
42
|
-
print(f"Shape: {df2.shape}")
|
43
|
-
print("First row, first column:", df2.iloc[0, 0] if not df2.empty else "Empty")
|
44
|
-
|
45
|
-
# The issue: include_outer_boundaries adds guides at 0 and 612,
|
46
|
-
# which creates cells that span into the exclusion zone
|
temp/debug_st_search.py
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
"""Debug searching for 'ST' text."""
|
2
|
-
from natural_pdf import PDF
|
3
|
-
|
4
|
-
pdf = PDF("pdfs/m27.pdf")
|
5
|
-
page = pdf.pages[0]
|
6
|
-
|
7
|
-
# Get the original ST element
|
8
|
-
headers = (
|
9
|
-
page
|
10
|
-
.find("text:contains(NUMBER)")
|
11
|
-
.right(include_source=True)
|
12
|
-
.expand(top=3, bottom=3)
|
13
|
-
.find_all('text')
|
14
|
-
)
|
15
|
-
original_st = headers[4]
|
16
|
-
print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")
|
17
|
-
|
18
|
-
# Search for 'ST' using find
|
19
|
-
found_st = page.find('text:contains("ST")')
|
20
|
-
print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")
|
21
|
-
|
22
|
-
# Find all elements containing 'ST'
|
23
|
-
all_st = page.find_all('text:contains("ST")')
|
24
|
-
print(f"\nAll elements containing 'ST':")
|
25
|
-
for i, elem in enumerate(all_st[:10]): # First 10
|
26
|
-
print(f" {i}: '{elem.text}' at x={elem.x0:.2f}, bbox={elem.bbox}")
|
27
|
-
|
28
|
-
# Check what's at position 332.88
|
29
|
-
print(f"\nLooking for element at x≈332.88:")
|
30
|
-
all_text = page.find_all('text')
|
31
|
-
for elem in all_text:
|
32
|
-
if 332 < elem.x0 < 334:
|
33
|
-
print(f" Found: '{elem.text}' at x={elem.x0:.5f}, bbox={elem.bbox}")
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|