natural-pdf 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
"""Smoke-test that the interactive guide drawing API is exposed.

Loads a sample PDF, seeds guides on both axes, prints them, and asserts that
``guides.vertical.draw`` exists and is callable. The widget itself is never
opened here; the script only prints how to do that from a Jupyter notebook.
"""

from natural_pdf.core.pdf import PDF
from natural_pdf.analyzers.guides import Guides

# Open the sample document and work on its first page.
pdf = PDF("tests/sample_pdfs/simple_table.pdf")
page = pdf.pages[0]

# Build the guides object and seed both axes so there is something to print.
guides = Guides(page)
guides.vertical.from_content()
guides.horizontal.from_lines(n=5)

print("Initial vertical guides:", [*guides.vertical])
print("Initial horizontal guides:", [*guides.horizontal])

# Opening the interactive widget only makes sense inside Jupyter:
# guides.vertical.draw()

# Outside Jupyter, just confirm the method is present and callable.
assert hasattr(guides.vertical, 'draw')
assert callable(guides.vertical.draw)

for usage_line in (
    "\nSuccess! The draw() method is available on GuidesList objects.",
    "To use it interactively, run this in a Jupyter notebook:",
    " guides.vertical.draw()",
    " guides.horizontal.draw(width=600)",
):
    print(usage_line)
@@ -0,0 +1,47 @@
"""Verify that draw() is exposed on both guide axes, using a mock context."""

import sys
sys.path.insert(0, '.')

from natural_pdf.analyzers.guides import GuidesList, Guides


class MockContext:
    """Stand-in context exposing ``width``, ``height`` and ``render()``."""

    def __init__(self):
        self.width, self.height = 600, 800

    def render(self, resolution=150):
        """Return a blank white RGB image.

        Each pixel dimension is the stored dimension multiplied by
        ``resolution``, divided by 72, and truncated with ``int()``.
        """
        from PIL import Image

        pixel_size = (
            int(self.width * resolution / 72),
            int(self.height * resolution / 72),
        )
        return Image.new('RGB', pixel_size, 'white')


# Build guides on top of the mock instead of a real page.
mock_context = MockContext()
guides = Guides(mock_context)

# Seed both axes with fixed positions.
guides.vertical.data = [100, 200, 300, 400, 500]
guides.horizontal.data = [150, 350, 550, 750]

print("Initial vertical guides:", [*guides.vertical])
print("Initial horizontal guides:", [*guides.horizontal])

# draw() must exist and be callable on the vertical axis, then the horizontal.
for axis_guides in (guides.vertical, guides.horizontal):
    assert hasattr(axis_guides, 'draw')
    assert callable(axis_guides.draw)

for message in (
    "\nSuccess! The draw() method is available.",
    "\nIn a Jupyter notebook, you would use:",
    " guides.vertical.draw() # Interactive vertical guide editor",
    " guides.horizontal.draw() # Interactive horizontal guide editor",
    "\nFeatures:",
    " - Click to add new guides",
    " - Click existing guides to select them",
    " - Drag to move guides",
    " - Delete key to remove selected guide",
    " - Arrow keys to fine-tune position",
    " - Enter to apply, Escape to cancel",
):
    print(message)
temp/test_inline_js.py ADDED
@@ -0,0 +1,22 @@
"""Experiment: show an ipywidgets HTML widget whose markup embeds a <script>.

Intended to be run in a notebook; the final message tells the reader how to
judge by hand whether the inline JavaScript executed.
"""

import ipywidgets as widgets
from IPython.display import display

# Markup under test: a clickable <div> plus a <script> that attaches a click
# handler to it and writes one line to the browser console.
html_content = '''
<div id="test-div">Click me!</div>
<script type="text/javascript">
document.getElementById('test-div').addEventListener('click', function() {
alert('Clicked!');
this.innerHTML = 'Clicked at ' + new Date().toLocaleTimeString();
});
console.log('JavaScript is running!');
</script>
'''

# Wrap the markup in an HTML widget and render it.
html_widget = widgets.HTML(value=html_content)
display(html_widget)

print(
    "If you see 'Click me!' above and can click it, JavaScript is working."
)
@@ -0,0 +1,68 @@
#!/usr/bin/env python
"""Exercise InteractiveGuideWidget construction against mock guide objects.

Every outcome is reported with a printed status line; import errors and
unexpected errors are caught and printed rather than propagated.
"""

import sys
import os

# Put the parent of this script's directory at the front of the import path.
script_dir = os.path.dirname(__file__)
sys.path.insert(0, os.path.abspath(os.path.join(script_dir, '..')))

try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, GuidesList, _GUIDE_WIDGET_AVAILABLE
    print("✓ Successfully imported InteractiveGuideWidget")

    if not _GUIDE_WIDGET_AVAILABLE:
        print("⚠ ipywidgets not available - widget functionality disabled")
    else:
        print("✓ ipywidgets is available")

        class MockPage:
            """Fake page exposing a ``bbox`` and a ``render()`` method."""

            def __init__(self):
                # A4 page size in points
                self.bbox = (0, 0, 595, 842)

            def render(self, resolution=150):
                """Return a blank white image scaled by ``resolution`` / 72."""
                from PIL import Image

                size = (int(595 * resolution / 72), int(842 * resolution / 72))
                return Image.new('RGB', size, color='white')

        class MockGuides:
            """Fake parent object whose ``context`` is a MockPage."""

            def __init__(self):
                self.context = MockPage()

        class MockGuidesList:
            """Fake guides list with the attributes set up for the widget."""

            def __init__(self):
                self.data = [100, 200, 300]
                self._axis = 'vertical'
                self._parent = MockGuides()

        # Try to build the widget on top of the mocks.
        mock_guides = MockGuidesList()
        try:
            widget = InteractiveGuideWidget(mock_guides)
            print("✓ Successfully created InteractiveGuideWidget instance")
            print(f" - Widget ID: {widget.widget_id}")
            print(f" - Widget base classes: {InteractiveGuideWidget.__bases__}")

            # Report which of the expected methods the instance provides.
            expected_methods = ['_generate_content', 'update_guides']
            for method in expected_methods:
                print(
                    f" - Has method: {method}"
                    if hasattr(widget, method)
                    else f" - Missing method: {method}"
                )

        except Exception as exc:
            print(f"✗ Error creating widget: {exc}")

except ImportError as exc:
    print(f"✗ Import error: {exc}")

except Exception as exc:
    print(f"✗ Unexpected error: {exc}")

print("\nWidget implementation test complete!")
@@ -0,0 +1,41 @@
#!/usr/bin/env python
"""Simple import-level check for the guide widget.

Imports ``InteractiveGuideWidget`` and ``_GUIDE_WIDGET_AVAILABLE`` from
``natural_pdf.analyzers.guides`` and reports whether the widget is available.
When it is, the class, its base classes and its public attribute names are
printed.

Exit status:
    1 if the import fails or the class inspection raises; otherwise the
    script ends normally after printing "All checks passed!".
"""

import sys
import os

# Add parent directory to path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

# Test importing the module. Only the import sits inside the try block, so an
# ImportError is the only thing this handler can catch.
try:
    from natural_pdf.analyzers.guides import InteractiveGuideWidget, _GUIDE_WIDGET_AVAILABLE
except ImportError as e:
    print(f"✗ Import error: {e}")
    sys.exit(1)

print("✓ Module imported successfully")
print(f"✓ Widget available: {_GUIDE_WIDGET_AVAILABLE}")

if _GUIDE_WIDGET_AVAILABLE:
    print("✓ ipywidgets is installed and InteractiveGuideWidget is available")
else:
    print("✗ ipywidgets is not installed")

# Set when a check reports an error, so the final summary cannot claim
# success after a failure message has been printed.
checks_failed = False

# Check if we can inspect the widget class
if _GUIDE_WIDGET_AVAILABLE:
    try:
        # We can't actually instantiate it without a GuidesList, but we can check the class exists
        print(f"✓ InteractiveGuideWidget class: {InteractiveGuideWidget}")
        print(f"✓ Widget base classes: {InteractiveGuideWidget.__bases__}")

        # Names on the class that do not start with an underscore
        methods = [m for m in dir(InteractiveGuideWidget) if not m.startswith('_')]
        print(f"✓ Public methods: {methods}")

    except Exception as e:
        print(f"✗ Error checking widget class: {e}")
        checks_failed = True
else:
    print("⚠ Skipping widget checks as ipywidgets is not available")

if checks_failed:
    print("\nSome checks failed.")
    sys.exit(1)

print("\nAll checks passed!")
@@ -1,42 +0,0 @@
"""Debug cell text extraction with exclusions.

Registers a header exclusion, builds a guide grid, then prints the text of
the first few top-row cells with and without exclusions applied, followed by
the first row of the fully extracted table.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Exclude everything above the "PREMISE" text on each page.
pdf.add_exclusion(lambda pg: pg.find(text="PREMISE").above(), label="header")

# Ask the page for its exclusion regions with debug=True; the return value is
# kept but not used again in this script.
print("Exclusions on page:")
exclusions = page._get_exclusion_regions(debug=True)

# Column guides come from the header texts found in the band to the right of
# "NUMBER" (source included, expanded by 3 above and below).
headers = (
    page
    .find(text="NUMBER")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')
)
guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

# Build the grid and pull out its cell regions.
grid_result = guides.build_grid(include_outer_boundaries=True)
cells = grid_result["regions"]["cells"]

print(f"\nTotal cells: {len(cells)}")

# Cells whose second bbox value is below 90 (these should be in excluded area).
first_row_cells = [candidate for candidate in cells if candidate.bbox[1] < 90]
print(f"\nFirst row cells: {len(first_row_cells)}")

for i, cell in enumerate(first_row_cells[:3]):
    raw_text = cell.extract_text(apply_exclusions=False)
    filtered_text = cell.extract_text(apply_exclusions=True)
    print(f"\nCell {i}:")
    print(f" Bbox: {cell.bbox}")
    print(f" Raw text: {raw_text!r}")
    print(f" With exclusions: {filtered_text!r}")

# Now run the full table extraction.
print("\n\nFull table extraction:")
result = guides.extract_table(
    include_outer_boundaries=True,
    apply_exclusions=True,
    header=False,
)
df = result.to_df()
print("\nFirst row of dataframe:")
if df.empty:
    print("Empty")
else:
    print(df.iloc[0].to_dict())
@@ -1,43 +0,0 @@
"""Debug how exclusions work with overlapping regions.

Takes the first exclusion region on the page, builds a hand-picked cell that
overlaps it, and prints the text extracted from the cell's upper part, lower
part and whole area.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Exclude everything above the "PREMISE" text.
pdf.add_exclusion(lambda pg: pg.find(text="PREMISE").above(), label="header")

# Look at the first exclusion region the page reports.
exclusions = page._get_exclusion_regions()
excl_region = exclusions[0]
print(f"Exclusion region: {excl_region.bbox}")
print(f"Exclusion bottom: {excl_region.bbox[3]}")

# A test cell that overlaps the exclusion; coordinates copied from an earlier
# run ("Cell 1": (32.06, 0.5, 73.18288, 79.53999999999996)).
test_cell = page.region(32.06, 0.5, 73.18288, 79.53999999999996)

print(f"\nTest cell: {test_cell.bbox}")
print(
    f"Cell overlaps exclusion: top={test_cell.bbox[1]} < excl_bottom={excl_region.bbox[3]}"
)

print("\nText in different parts of the cell:")

# Part above the exclusion line (should be empty).
upper_part = page.region(32.06, 0.5, 73.18288, 59.12)
upper_text = upper_part.extract_text(apply_exclusions=True)
print(f"Upper part (0.5 to 59.12): '{upper_text}'")

# Part below the exclusion line (should have text).
lower_part = page.region(32.06, 59.12, 73.18288, 79.54)
lower_text = lower_part.extract_text()
print(f"Lower part (59.12 to 79.54): '{lower_text}'")

# The whole cell, with and without exclusions.
whole_with = test_cell.extract_text(apply_exclusions=True)
print(f"Whole cell with exclusions: '{whole_with}'")
whole_without = test_cell.extract_text(apply_exclusions=False)
print(f"Whole cell without exclusions: '{whole_without}'")

# List the first few text elements found inside the cell.
print("\nText elements in cell:")
cell_texts = test_cell.find_all('text')
for element in cell_texts[:5]:
    print(f" '{element.text}' at y={element.top:.2f}-{element.bottom:.2f}")
@@ -1,67 +0,0 @@
"""Debug why exclusions aren't working with guides.extract_table().

Steps, each printed to stdout:
  1. page text before and after registering two exclusions,
  2. the exclusion regions reported by the page,
  3. the table region produced by the guide grid and its text,
  4. the extracted table, plus whether two header phrases leaked into it.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Check initial text
print("Initial text:")
print(page.extract_text()[:200])
print()

# Add exclusions: everything above "PREMISE", and the expanded match of the
# "Page <digits> of" regex. The selector is a raw string so that \d reaches
# the selector parser unchanged without being an invalid escape sequence.
pdf.add_exclusion(lambda page: page.find(text="PREMISE").above())
pdf.add_exclusion(lambda page: page.find(r"text:regex(Page \d+ of)").expand())

# Check text after exclusions
print("Text after exclusions:")
print(page.extract_text()[:100])
print()

# Debug exclusion regions
print("Checking exclusion regions:")
exclusions = page._get_exclusion_regions(debug=True)
print(f"\nTotal exclusions: {len(exclusions)}")
for i, exc in enumerate(exclusions):
    print(f" {i}: {exc.bbox}")
print()

# Create guides from the header texts found in the band to the right of
# "NUMBER" (source included, expanded by 3 above and below).
headers = (
    page
    .find(text="NUMBER")
    .right(include_source=True)
    .expand(top=3, bottom=3)
    .find_all('text')
)

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

# Build grid to see what regions are created
print("\nBuilding grid...")
grid_result = guides.build_grid(include_outer_boundaries=True)
table_region = grid_result["regions"]["table"]
print(f"Table region: {table_region}")
print(f"Table bbox: {table_region.bbox if table_region else 'None'}")

# Check if table region respects exclusions
if table_region:
    print("\nExtracting text from table region directly:")
    table_text = table_region.extract_text()[:200]
    print(f"Table text: {table_text}")

# Now extract table
print("\nExtracting table with apply_exclusions=True:")
result = guides.extract_table(include_outer_boundaries=True, apply_exclusions=True, header=False)
df = result.to_df()
print(df.head())

# Check if excluded content is in the table
table_str = df.to_string()
has_feb = "FEBRUARY 2014" in table_str or "FEBR" in table_str
has_alphabetic = "ALPHABETIC LISTING" in table_str
print(f"\nContains 'FEBRUARY': {has_feb}")
print(f"Contains 'ALPHABETIC LISTING': {has_alphabetic}")
temp/debug_extra_guide.py DELETED
@@ -1,41 +0,0 @@
"""Debug the extra guide issue.

Builds vertical guides from header elements 3 and 4 together, individually,
and from their plain text, printing the resulting guides each time.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Header texts: the band to the right of the element containing "NUMBER"
# (source included), expanded by 3 above and below.
number_label = page.find("text:contains(NUMBER)")
header_band = number_label.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')

print("Headers 3-5:")
for i, h in enumerate(headers[3:5]):
    print(f" {i}: '{h.text}' bbox={h.bbox}")

# Guides built from just these two headers.
guides = Guides(page)
guides.vertical.from_content(headers[3:5], align='left', outer=False)

print(f"\nResulting guides: {guides.vertical}")
print("Expected: [328.32012, 539.63316]")

# Repeat with each header on its own.
print("\nTesting each header individually:")
for i, h in enumerate(headers[3:5]):
    g = Guides(page)
    g.vertical.from_content([h], align='left', outer=False)
    print(f" Header {i} guides: {g.vertical}")

# Check whether passing plain strings instead of the element collection
# changes the outcome.
print("\nTesting with manual list of text:")
text_list = [header.text for header in headers[3:5]]
print(f"Text list: {text_list}")
guides2 = Guides(page)
guides2.vertical.from_content(text_list, align='left', outer=False)
print(f"Guides from text list: {guides2.vertical}")
@@ -1,46 +0,0 @@
"""Debug outer boundaries issue with exclusions.

Prints the sorted horizontal guides, then extracts the table once without and
once with outer boundaries so the first cell of each result can be compared.
"""
from natural_pdf import PDF
from natural_pdf.analyzers.guides import Guides

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# Exclude everything above the "PREMISE" text.
pdf.add_exclusion(lambda pg: pg.find(text="PREMISE").above())

# Header texts used to place the vertical guides.
number_label = page.find(text="NUMBER")
header_band = number_label.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')

guides = Guides(page)
guides.vertical.from_content(headers, align='left')
guides.horizontal.from_stripes()

print("Horizontal guides (sorted):")
sorted_horizontal = sorted(guides.horizontal)
for i, h in enumerate(sorted_horizontal):
    print(f" {i}: {h:.2f}")

print(f"\nFirst content guide: {sorted_horizontal[0]:.2f}")
print(f"Page height: {page.height}")

# Extraction without outer boundaries.
print("\n\nWithout outer boundaries:")
result1 = guides.extract_table(
    include_outer_boundaries=False,
    apply_exclusions=True,
    header=False,
)
df1 = result1.to_df()
print(f"Shape: {df1.shape}")
if df1.empty:
    print("First row, first column:", "Empty")
else:
    print("First row, first column:", df1.iloc[0, 0])

# Extraction with outer boundaries.
print("\n\nWith outer boundaries:")
result2 = guides.extract_table(
    include_outer_boundaries=True,
    apply_exclusions=True,
    header=False,
)
df2 = result2.to_df()
print(f"Shape: {df2.shape}")
if df2.empty:
    print("First row, first column:", "Empty")
else:
    print("First row, first column:", df2.iloc[0, 0])

# The issue: include_outer_boundaries adds guides at 0 and 612,
# which creates cells that span into the exclusion zone
temp/debug_st_search.py DELETED
@@ -1,33 +0,0 @@
"""Debug searching for 'ST' text.

Compares the 'ST' header element found through the header band with what
``find`` / ``find_all`` return for a ``contains("ST")`` selector, then lists
every text element whose x0 lies strictly between 332 and 334.
"""
from natural_pdf import PDF

pdf = PDF("pdfs/m27.pdf")
page = pdf.pages[0]

# The original ST element is the fifth header in the band to the right of
# the element containing "NUMBER".
number_label = page.find("text:contains(NUMBER)")
header_band = number_label.right(include_source=True).expand(top=3, bottom=3)
headers = header_band.find_all('text')
original_st = headers[4]
print(f"Original 'ST' element: '{original_st.text}' at {original_st.bbox}")

# The single element that find() returns for the same text.
found_st = page.find('text:contains("ST")')
print(f"\nFound 'ST' using find: '{found_st.text}' at {found_st.bbox}")

# Every element containing 'ST'; only the first 10 are printed.
all_st = page.find_all('text:contains("ST")')
print("\nAll elements containing 'ST':")
for i, elem in enumerate(all_st[:10]):
    print(f" {i}: '{elem.text}' at x={elem.x0:.2f}, bbox={elem.bbox}")

# Check what sits at position 332.88.
print("\nLooking for element at x≈332.88:")
all_text = page.find_all('text')
for elem in all_text:
    if not (332 < elem.x0 < 334):
        continue
    print(f" Found: '{elem.text}' at x={elem.x0:.5f}, bbox={elem.bbox}")