natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,224 @@
1
+ """
2
+ Example script demonstrating the PaddleOCR integration.
3
+ """
4
+ import os
5
+ import sys
6
+ from PIL import Image
7
+ import numpy as np
8
+
9
+ # Add the project directory to the path to import the library
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from natural_pdf import PDF
12
+
13
# Sample document for the examples: use the primary PDF when it is present,
# otherwise fall back to the practice file.
PDF_FILE = (
    "./pdfs/HARRY ROQUE_redacted.pdf"
    if os.path.exists("./pdfs/HARRY ROQUE_redacted.pdf")
    else "./pdfs/01-practice.pdf"
)
17
+
18
def basic_paddleocr_example():
    """Run a minimal PaddleOCR pass over the first page of ``PDF_FILE``.

    Opens the PDF with the ``paddleocr`` engine, prints how many OCR elements
    were found together with the first five of them, then prints the length
    and the first 100 characters of the page text extracted with ``ocr=True``.
    """
    print("\n=== Basic PaddleOCR Example ===")

    print("Creating PDF with PaddleOCR engine...")
    ocr_settings = {
        "enabled": True,
        "languages": ["en"],
        "min_confidence": 0.5,
    }

    # Scope the PDF with ``with`` so cleanup also happens when OCR raises;
    # previously pdf.close() was only reached on the success path.
    with PDF(PDF_FILE, ocr=ocr_settings, ocr_engine="paddleocr") as pdf:
        page = pdf.pages[0]

        # Run OCR explicitly and show a handful of results
        print("\nExtracting OCR elements...")
        ocr_elements = page.extract_ocr_elements()
        print(f"Found {len(ocr_elements)} OCR text elements")

        for i, element in enumerate(ocr_elements[:5]):
            print(f"Element {i+1}: '{element.text}' (Confidence: {element.confidence:.2f})")

        # Let extract_text apply OCR itself
        print("\nExtracting text with auto OCR...")
        text = page.extract_text(ocr=True)

        print(f"Extracted text length: {len(text)}")
        print(f"First 100 characters: {text[:100]}")

    print("Basic PaddleOCR example complete")
57
+
58
+ def advanced_paddleocr_example():
59
+ """Advanced example showing more PaddleOCR features."""
60
+ print("\n=== Advanced PaddleOCR Example ===")
61
+
62
+ # Create a PDF with detailed PaddleOCR configuration
63
+ print("Creating PDF with detailed PaddleOCR configuration...")
64
+ pdf = PDF(
65
+ PDF_FILE,
66
+ ocr={
67
+ "enabled": True,
68
+ "languages": ["en"],
69
+ "min_confidence": 0.3, # Lower threshold to catch more text
70
+ "model_settings": {
71
+ # PaddleOCR-specific settings
72
+ "use_angle_cls": False,
73
+ "rec_batch_num": 6,
74
+ "cls": False,
75
+ "det_db_thresh": 0.3,
76
+ "det_db_box_thresh": 0.5,
77
+ "det_limit_side_len": 2000 # Support larger images
78
+ }
79
+ },
80
+ ocr_engine="paddleocr"
81
+ )
82
+
83
+ # Create output directory for highlighted images
84
+ output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output")
85
+ os.makedirs(output_dir, exist_ok=True)
86
+
87
+ # Get the first page
88
+ page = pdf.pages[0]
89
+
90
+ # Extract OCR elements
91
+ print("\nExtracting OCR elements with detailed configuration...")
92
+ ocr_elements = page.extract_ocr_elements()
93
+ print(f"Found {len(ocr_elements)} OCR text elements")
94
+
95
+ # Highlight OCR elements with confidence scores
96
+ print("\nHighlighting OCR elements...")
97
+ for i, elem in enumerate(ocr_elements):
98
+ # Use different colors based on confidence
99
+ if elem.confidence >= 0.8:
100
+ color = (0, 1, 0, 0.3) # Green for high confidence
101
+ elif elem.confidence >= 0.5:
102
+ color = (1, 1, 0, 0.3) # Yellow for medium confidence
103
+ else:
104
+ color = (1, 0, 0, 0.3) # Red for low confidence
105
+
106
+ # Label includes confidence score
107
+ elem.highlight(
108
+ color=color,
109
+ label=f"OCR ({elem.confidence:.2f})"
110
+ )
111
+
112
+ # Save highlighted page
113
+ highlight_path = os.path.join(output_dir, "paddleocr_highlights.png")
114
+ page.to_image(path=highlight_path, show_labels=True)
115
+ print(f"Saved highlighted image to {highlight_path}")
116
+
117
+ # Filter OCR elements by confidence
118
+ high_confidence = [e for e in ocr_elements if e.confidence >= 0.7]
119
+ print(f"\nHigh confidence elements ({len(high_confidence)}): ")
120
+ for i, elem in enumerate(high_confidence[:3]):
121
+ print(f" {i+1}. '{elem.text}' (Confidence: {elem.confidence:.2f})")
122
+
123
+ # Clean up
124
+ pdf.close()
125
+ print("Advanced PaddleOCR example complete")
126
+
127
+ def ocr_engine_comparison():
128
+ """Compare EasyOCR and PaddleOCR on the same document."""
129
+ print("\n=== OCR Engine Comparison ===")
130
+
131
+ # Create output directory
132
+ output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output")
133
+ os.makedirs(output_dir, exist_ok=True)
134
+
135
+ # Test with EasyOCR
136
+ print("\nUsing EasyOCR...")
137
+ easy_pdf = PDF(
138
+ PDF_FILE,
139
+ ocr={"enabled": True, "languages": ["en"]},
140
+ ocr_engine="easyocr"
141
+ )
142
+ page = easy_pdf.pages[0]
143
+
144
+ # Time the OCR process
145
+ import time
146
+ start_time = time.time()
147
+ easy_elements = page.extract_ocr_elements()
148
+ easy_time = time.time() - start_time
149
+ print(f"EasyOCR found {len(easy_elements)} text elements in {easy_time:.2f} seconds")
150
+
151
+ # Save a sample
152
+ with open(os.path.join(output_dir, "easyocr_sample.txt"), "w") as f:
153
+ for i, elem in enumerate(easy_elements[:20]):
154
+ f.write(f"{i+1}. '{elem.text}' (Confidence: {elem.confidence:.2f})\n")
155
+
156
+ # Clean up
157
+ easy_pdf.close()
158
+
159
+ # Test with PaddleOCR
160
+ print("\nUsing PaddleOCR...")
161
+ paddle_pdf = PDF(
162
+ PDF_FILE,
163
+ ocr={"enabled": True, "languages": ["en"]},
164
+ ocr_engine="paddleocr"
165
+ )
166
+ page = paddle_pdf.pages[0]
167
+
168
+ # Time the OCR process
169
+ start_time = time.time()
170
+ paddle_elements = page.extract_ocr_elements()
171
+ paddle_time = time.time() - start_time
172
+ print(f"PaddleOCR found {len(paddle_elements)} text elements in {paddle_time:.2f} seconds")
173
+
174
+ # Save a sample
175
+ with open(os.path.join(output_dir, "paddleocr_sample.txt"), "w") as f:
176
+ for i, elem in enumerate(paddle_elements[:20]):
177
+ f.write(f"{i+1}. '{elem.text}' (Confidence: {elem.confidence:.2f})\n")
178
+
179
+ # Clean up
180
+ paddle_pdf.close()
181
+
182
+ # Compare results
183
+ print("\nComparison Results:")
184
+ print(f"EasyOCR: {len(easy_elements)} elements in {easy_time:.2f} seconds")
185
+ print(f"PaddleOCR: {len(paddle_elements)} elements in {paddle_time:.2f} seconds")
186
+ print(f"Speed difference: {(easy_time / paddle_time if paddle_time > 0 else 0):.2f}x")
187
+
188
+ print("\nSample results saved to:")
189
+ print(f" - {os.path.join(output_dir, 'easyocr_sample.txt')}")
190
+ print(f" - {os.path.join(output_dir, 'paddleocr_sample.txt')}")
191
+
192
+ print("OCR engine comparison complete")
193
+
194
if __name__ == "__main__":
    # Probe for PaddleOCR on its own. Keeping the import in a dedicated
    # try-block means an ImportError raised later inside an example is not
    # misreported as "PaddleOCR is not installed".
    try:
        import paddleocr  # noqa: F401  (availability check only)
    except ImportError:
        print("PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr")
    else:
        print("PaddleOCR is available, running examples...")

        # Example name (first command line argument) -> function to run
        examples = {
            "basic": basic_paddleocr_example,
            "advanced": advanced_paddleocr_example,
            "compare": ocr_engine_comparison,
        }

        try:
            if len(sys.argv) > 1:
                example = sys.argv[1].lower()
                if example in examples:
                    examples[example]()
                else:
                    print(f"Unknown example: {example}")
                    print("Available examples: basic, advanced, compare")
            else:
                # No argument given: run all examples in definition order
                for run_example in examples.values():
                    run_example()
        except Exception as e:
            # Top-level boundary of the script: report the failure with a
            # full traceback instead of letting it vanish.
            print(f"Error in PaddleOCR examples: {e}")
            import traceback
            traceback.print_exc()
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example demonstrating the PageCollection functionality.
4
+
5
+ This example shows how to:
6
+ 1. Access a specific range of pages using slicing
7
+ 2. Extract text from multiple pages
8
+ 3. Find elements across multiple pages
9
+ 4. Get sections that span across page boundaries
10
+
11
+ Usage:
12
+ python examples/page_collection_example.py [path_to_pdf]
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ # Add the parent directory to the path so we can import the package
20
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
21
+
22
+ from natural_pdf import PDF
23
+
24
+ # Use the provided PDF path or a default
25
+ pdf_path = sys.argv[1] if len(sys.argv) > 1 else "pdfs/Atlanta_Public_Schools_GA_sample.pdf"
26
+
27
def main():
    """Open ``pdf_path`` and run each PageCollection demo the page count allows."""
    print(f"Opening {pdf_path}")

    with PDF(pdf_path) as pdf:
        page_count = len(pdf)
        print(f"PDF has {page_count} pages")

        # The range demo needs at least three pages, the others at least two
        if page_count >= 3:
            _demo_page_range(pdf)
        if page_count >= 2:
            _demo_find_across_pages(pdf)
        if page_count >= 2:
            _demo_sections_across_pages(pdf)


def _demo_page_range(pdf):
    """Example 1: slice a page range, extract its text and slice it again."""
    print("\n1. Working with a range of pages:")
    # Slice 1:4 is 0-indexed, i.e. the second, third and fourth pages
    page_range = pdf.pages[1:4]
    print(f" Selected {len(page_range)} pages: {[pg.number for pg in page_range.pages]}")

    range_text = page_range.extract_text()
    print(f" Extracted {len(range_text)} characters of text from pages {[pg.number for pg in page_range.pages]}")

    # A page collection can itself be sliced
    if len(page_range) <= 1:
        return
    sub_range = page_range[0:2]
    print(f" Sub-range has {len(sub_range)} pages: {[pg.number for pg in sub_range.pages]}")


def _demo_find_across_pages(pdf):
    """Example 2: run selectors against the first two pages at once."""
    print("\n2. Finding elements across multiple pages:")
    two_pages = pdf.pages[0:2]

    text_elements = two_pages.find_all("text")
    print(f" Found {len(text_elements)} text elements across {len(two_pages)} pages")

    # First heading-like element (font size 12 or larger)
    heading = two_pages.find("text[size>=12]")
    if heading:
        print(f" Found heading: '{heading.text}' on page {heading.page.number}")


def _demo_sections_across_pages(pdf):
    """Example 3: build sections on the first two pages, with and without page breaks."""
    print("\n3. Getting sections across pages:")
    two_pages = pdf.pages[0:2]

    # Large text marks a section start; sections may continue across pages
    sections = two_pages.get_sections(
        start_selector="text[size>=12]",
        new_section_on_page_break=False,
        boundary_inclusion="both",
    )

    print(f" Found {len(sections)} sections across {len(two_pages)} pages")

    for index, section in enumerate(sections, start=1):
        print(f" Section {index}:")
        if hasattr(section, "start_element") and section.start_element:
            print(f" Starts with: '{section.start_element.text}'")
            print(f" On page: {section.start_element.page.number}")

        section_text = section.extract_text()
        print(f" Contains {len(section_text)} characters of text")

        # Show at most the first 50 characters
        if len(section_text) > 50:
            preview = section_text[:50] + "..."
        else:
            preview = section_text
        print(f" Preview: {preview}")

    # Same selector, but every page boundary now starts a new section
    sections_with_breaks = two_pages.get_sections(
        start_selector="text[size>=12]",
        new_section_on_page_break=True,
        boundary_inclusion="both",
    )
    print(f" With page breaks as boundaries: {len(sections_with_breaks)} sections")


if __name__ == "__main__":
    main()
@@ -0,0 +1,83 @@
1
+ """
2
+ Example showing the polygon highlighting capabilities for handling non-rectangular regions.
3
+
4
+ This example demonstrates how polygon-based OCR results are handled and visualized,
5
+ which is especially useful for skewed or rotated text in scanned documents.
6
+ """
7
+ import os
8
+ import sys
9
+ from natural_pdf import PDF
10
+ from natural_pdf.elements.region import Region
11
+ from PIL import Image
12
+
13
# Resolve paths from this script's location so the example does not depend on
# the current working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
# Parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path (using a document that needs OCR)
default_pdf = os.path.join(root_dir, "pdfs", "needs-ocr.pdf")

# Optional command line arguments: [pdf_path] [page_index]
pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf
page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 0

print(f"Loading PDF: {pdf_path}")
print(f"Using page: {page_num}")

# Load the PDF with OCR enabled. ``with`` scopes the PDF so it is cleaned up
# when the script finishes or fails; the original never closed it.
with PDF(pdf_path, ocr=True) as pdf:
    page = pdf.pages[page_num]

    # Create a simulated polygon region to show polygon highlighting
    print("Creating polygon region...")
    polygon_points = [
        (100, 100),
        (300, 150),
        (250, 250),
        (120, 200),
    ]

    # The bbox (100, 100, 300, 250) is the bounding rectangle of the points
    region = Region(page, (100, 100, 300, 250), polygon=polygon_points)
    region.highlight(color=(1, 0, 0, 0.5), label="Polygon Region")

    # Also extract and highlight text using OCR
    print("Running OCR on the page...")
    ocr_elements = page.apply_ocr()
    print(f"Found {len(ocr_elements)} OCR text elements")

    # Highlight OCR elements with different colors based on confidence
    print("Highlighting OCR elements...")
    for elem in ocr_elements:
        if elem.confidence > 0.8:
            color = (0, 0.8, 0, 0.3)  # Green for high confidence
        elif elem.confidence > 0.5:
            color = (1, 0.8, 0, 0.3)  # Yellow for medium confidence
        else:
            color = (0.8, 0, 0, 0.3)  # Red for low confidence

        elem.highlight(color=color)

    # Save the result. The output directory may not exist yet, so create it
    # first; the original passed the path straight to to_image().
    output_path = os.path.join(root_dir, "output", "polygon_highlight_example.png")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"Saving highlighted image to {output_path}")
    page.to_image(path=output_path, show_labels=True)

    # Print some information about the elements
    print("\nPolygon support details:")

    # Keep only OCR elements that report polygon data
    polygon_elements = [
        elem for elem in ocr_elements
        if hasattr(elem, 'has_polygon') and elem.has_polygon
    ]
    print(f"- Found {len(polygon_elements)} elements with polygon data")

    # Display details of the first few polygon elements
    if polygon_elements:
        for i, elem in enumerate(polygon_elements[:3]):
            print(f"\nElement {i+1}:")
            print(f"- Text: '{elem.text}'")
            print(f"- Confidence: {elem.confidence:.2f}")
            print(f"- Bounding box: {elem.bbox}")
            print(f"- Polygon points: {elem.polygon[:2]}... ({len(elem.polygon)} points)")

        if len(polygon_elements) > 3:
            print(f"... and {len(polygon_elements) - 3} more")
@@ -0,0 +1,134 @@
1
+ """
2
+ Example demonstrating positional methods in ElementCollection.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def _find_pdf_path():
    """Return the PDF to use: the first CLI argument, else the first PDF found.

    Without an argument, looks for ``*.pdf`` next to this script and then in
    the sibling ``pdfs`` directory. Exits with status 1 when nothing is found.
    """
    if len(sys.argv) > 1:
        return sys.argv[1]

    example_dir = Path(__file__).parent
    pdf_files = list(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = list(pdfs_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)

    return str(pdf_files[0])


def _report_extremes(elements, noun, show_text=False):
    """Print and highlight the highest/lowest/leftmost/rightmost element.

    Args:
        elements: Non-empty collection offering ``highest()``, ``lowest()``,
            ``leftmost()`` and ``rightmost()``.
        noun: Lower-case element name used in messages (e.g. ``"line"``); its
            capitalized form is used in the highlight labels.
        show_text: When true, the element's text is printed before its bbox.
    """
    # Resolve all four extremes first, then print, then highlight, matching
    # the order of operations of the original per-type code.
    extremes = [
        ("Highest", elements.highest()),
        ("Lowest", elements.lowest()),
        ("Leftmost", elements.leftmost()),
        ("Rightmost", elements.rightmost()),
    ]

    for position, element in extremes:
        if show_text:
            print(f"{position} {noun}: '{element.text}' at {element.bbox}")
        else:
            print(f"{position} {noun}: {element.bbox}")

    for position, element in extremes:
        element.highlight(label=f"{position} {noun.capitalize()}")


def main():
    """Main entry point.

    Highlights the page corners and the extreme line, rectangle and text
    elements of the first page, saves the result as
    ``position_output/position_methods.png`` next to this script, and checks
    that positional methods raise ``ValueError`` on a multi-page collection.
    """
    pdf_path = _find_pdf_path()
    print(f"Using PDF: {pdf_path}")

    # ``with`` scopes the PDF so it is cleaned up even when a step below
    # fails; the original never closed it.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Find different element types
        lines = page.find_all('line')
        rects = page.find_all('rect')
        text = page.find_all('text')

        # Clear any existing highlights
        page.clear_highlights()

        # Highlight the page corners for reference
        page.create_region(0, 0, 10, 10).highlight(label="Top-Left Corner")
        page.create_region(page.width-10, 0, page.width, 10).highlight(label="Top-Right Corner")
        page.create_region(0, page.height-10, 10, page.height).highlight(label="Bottom-Left Corner")
        page.create_region(page.width-10, page.height-10, page.width, page.height).highlight(label="Bottom-Right Corner")

        # Demonstrate the position methods for each element type
        print(f"\nLines found: {len(lines)}")
        if len(lines) > 0:
            _report_extremes(lines, "line")

        print(f"\nRectangles found: {len(rects)}")
        if len(rects) > 0:
            _report_extremes(rects, "rectangle")

        print(f"\nText elements found: {len(text)}")
        if len(text) > 0:
            _report_extremes(text, "text", show_text=True)

        # Create an output directory next to this script
        output_dir = Path(__file__).parent / "position_output"
        output_dir.mkdir(exist_ok=True)

        # Save the result
        image_path = output_dir / "position_methods.png"
        page.to_image(path=str(image_path), show_labels=True)

        # Demonstrate error handling for multi-page collections
        if len(pdf.pages) > 1:
            print("\nTesting multi-page error handling:")
            multi_collection = pdf.pages.find_all('text')
            try:
                multi_collection.lowest()
                print("ERROR: Should have raised ValueError for multi-page collection")
            except ValueError as e:
                print(f"Correctly raised ValueError: {e}")

    # Report the real location: the image is written next to this script,
    # not relative to the current working directory.
    print(f"\nExample completed. Check '{image_path}' for the result.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,73 @@
1
+ """
2
+ Test the modified region boundary logic with below() and above() method fixes.
3
+
4
+ This example tests that the .below() and .above() methods correctly exclude
5
+ the source element with the new 1-pixel offset.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import argparse
11
+
12
+ # Add parent directory to path to run without installing
13
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14
+
15
+ from natural_pdf import PDF
16
+
17
def main():
    """Check that ``.below()`` produces a region that excludes its source element.

    Finds a text element on the first page (one containing "Price", else the
    first bold text), builds the region below it, expands that region to the
    right, prints what each region contains and saves a highlighted page image
    to ``output/region_boundary_test.png``.
    """
    parser = argparse.ArgumentParser(description="Test region boundaries")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    args = parser.parse_args()

    print(f"Testing with PDF: {args.pdf_path}")

    # ``with`` scopes the PDF so it is cleaned up on every exit path,
    # including the early return below; the original never closed it.
    with PDF(args.pdf_path) as pdf:
        page = pdf.pages[0]

        # Find a text element to test with, preferring one containing "Price"
        title = page.find('text:contains("Price")')
        if not title:
            title = page.find('text:bold')

        if not title:
            print("Couldn't find a suitable test element. Please provide a PDF with text elements.")
            return

        print(f"Found element: '{title.text}' at position {title.bbox}")

        # Create region below the element
        region_below = title.below(height=16, width="element")

        # Collect the text elements in the region (the source shouldn't be one)
        elements_in_region = region_below.find_all('text')

        print(f"\nRegion below: {region_below.bbox}")
        print(f"Number of elements in region: {len(elements_in_region)}")

        # Check specifically if the source element is in the region
        is_source_in_region = title in elements_in_region
        print(f"Source element is in region: {is_source_in_region}")

        # Expand the region and check again
        expanded_region = region_below.expand(right=40)
        elements_in_expanded = expanded_region.find_all('text')

        print(f"\nExpanded region: {expanded_region.bbox}")
        print(f"Number of elements in expanded region: {len(elements_in_expanded)}")
        print(f"Elements text: {[e.text for e in elements_in_expanded]}")

        # Highlight the regions to visualize
        title.highlight(color=(1, 0, 0, 0.3), label="Source")
        region_below.highlight(color=(0, 1, 0, 0.3), label="Below")
        expanded_region.highlight(color=(0, 0, 1, 0.3), label="Expanded")

        # Save the image (path is relative to the current working directory)
        os.makedirs("output", exist_ok=True)
        page.save_image("output/region_boundary_test.png")
        print("\nSaved visualization to output/region_boundary_test.png")


if __name__ == "__main__":
    main()