natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,224 @@
|
|
1
|
+
"""
|
2
|
+
Example script demonstrating the PaddleOCR integration.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
from PIL import Image
|
7
|
+
import numpy as np
|
8
|
+
|
9
|
+
# Add the project directory to the path to import the library
|
10
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11
|
+
from natural_pdf import PDF
|
12
|
+
|
13
|
+
# Select a PDF file to test
|
14
|
+
# Prefer the redacted sample; fall back to the practice PDF when it is absent.
PDF_FILE = (
    "./pdfs/HARRY ROQUE_redacted.pdf"
    if os.path.exists("./pdfs/HARRY ROQUE_redacted.pdf")
    else "./pdfs/01-practice.pdf"
)
|
17
|
+
|
18
|
+
def basic_paddleocr_example():
    """Run the PaddleOCR engine on the first page of ``PDF_FILE``.

    Prints how many OCR elements were found, the first five elements with
    their confidence, and the length plus a 100-character preview of the
    text returned by ``page.extract_text(ocr=True)``.

    The PDF is closed in a ``finally`` clause so the handle is released even
    when one of the OCR steps raises.
    """
    print("\n=== Basic PaddleOCR Example ===")

    # Create a PDF with the PaddleOCR engine
    print("Creating PDF with PaddleOCR engine...")
    pdf = PDF(
        PDF_FILE,
        ocr={
            "enabled": True,
            "languages": ["en"],
            "min_confidence": 0.5,
        },
        ocr_engine="paddleocr",
    )

    try:
        # Get the first page
        page = pdf.pages[0]

        # Extract OCR elements explicitly
        print("\nExtracting OCR elements...")
        ocr_elements = page.extract_ocr_elements()
        print(f"Found {len(ocr_elements)} OCR text elements")

        # Print the first few elements
        for i, element in enumerate(ocr_elements[:5]):
            print(f"Element {i+1}: '{element.text}' (Confidence: {element.confidence:.2f})")

        # Extract text with OCR applied automatically
        print("\nExtracting text with auto OCR...")
        text = page.extract_text(ocr=True)

        # Print a snippet of the extracted text
        print(f"Extracted text length: {len(text)}")
        print(f"First 100 characters: {text[:100]}")
    finally:
        # Clean up, whether or not extraction succeeded
        pdf.close()

    print("Basic PaddleOCR example complete")
|
57
|
+
|
58
|
+
def advanced_paddleocr_example():
    """Run PaddleOCR with engine-specific settings and visualise confidence.

    Opens ``PDF_FILE`` with a lower confidence threshold and a
    ``model_settings`` dict, highlights every OCR element on the first page
    (green >= 0.8, yellow >= 0.5, red otherwise), saves the highlighted page
    to ``<project root>/output/paddleocr_highlights.png`` and prints the first
    three elements whose confidence is at least 0.7.

    The PDF is closed in a ``finally`` clause so the handle is released even
    when OCR or image export raises.
    """
    print("\n=== Advanced PaddleOCR Example ===")

    # Create a PDF with detailed PaddleOCR configuration
    print("Creating PDF with detailed PaddleOCR configuration...")
    pdf = PDF(
        PDF_FILE,
        ocr={
            "enabled": True,
            "languages": ["en"],
            "min_confidence": 0.3,  # Lower threshold to catch more text
            "model_settings": {
                # PaddleOCR-specific settings
                "use_angle_cls": False,
                "rec_batch_num": 6,
                "cls": False,
                "det_db_thresh": 0.3,
                "det_db_box_thresh": 0.5,
                "det_limit_side_len": 2000,  # Support larger images
            },
        },
        ocr_engine="paddleocr",
    )

    try:
        # Create output directory for highlighted images
        output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output")
        os.makedirs(output_dir, exist_ok=True)

        # Get the first page
        page = pdf.pages[0]

        # Extract OCR elements
        print("\nExtracting OCR elements with detailed configuration...")
        ocr_elements = page.extract_ocr_elements()
        print(f"Found {len(ocr_elements)} OCR text elements")

        # Highlight OCR elements with confidence scores
        print("\nHighlighting OCR elements...")
        for elem in ocr_elements:
            # Use different colors based on confidence
            if elem.confidence >= 0.8:
                color = (0, 1, 0, 0.3)  # Green for high confidence
            elif elem.confidence >= 0.5:
                color = (1, 1, 0, 0.3)  # Yellow for medium confidence
            else:
                color = (1, 0, 0, 0.3)  # Red for low confidence

            # Label includes confidence score
            elem.highlight(
                color=color,
                label=f"OCR ({elem.confidence:.2f})",
            )

        # Save highlighted page
        highlight_path = os.path.join(output_dir, "paddleocr_highlights.png")
        page.to_image(path=highlight_path, show_labels=True)
        print(f"Saved highlighted image to {highlight_path}")

        # Filter OCR elements by confidence
        high_confidence = [e for e in ocr_elements if e.confidence >= 0.7]
        print(f"\nHigh confidence elements ({len(high_confidence)}): ")
        for i, elem in enumerate(high_confidence[:3]):
            print(f" {i+1}. '{elem.text}' (Confidence: {elem.confidence:.2f})")
    finally:
        # Clean up, whether or not the steps above succeeded
        pdf.close()

    print("Advanced PaddleOCR example complete")
|
126
|
+
|
127
|
+
def ocr_engine_comparison():
    """Compare EasyOCR and PaddleOCR on the first page of ``PDF_FILE``.

    Each engine is run once via ``_time_ocr_engine``; the element count and
    elapsed time are printed, and up to 20 sample results per engine are
    written to ``<project root>/output/{easyocr,paddleocr}_sample.txt``.
    """
    print("\n=== OCR Engine Comparison ===")

    # Create output directory
    output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output")
    os.makedirs(output_dir, exist_ok=True)

    easy_sample = os.path.join(output_dir, "easyocr_sample.txt")
    paddle_sample = os.path.join(output_dir, "paddleocr_sample.txt")

    easy_count, easy_time = _time_ocr_engine("easyocr", "EasyOCR", easy_sample)
    paddle_count, paddle_time = _time_ocr_engine("paddleocr", "PaddleOCR", paddle_sample)

    # Compare results
    print("\nComparison Results:")
    print(f"EasyOCR: {easy_count} elements in {easy_time:.2f} seconds")
    print(f"PaddleOCR: {paddle_count} elements in {paddle_time:.2f} seconds")
    # Guard against a zero denominator; 0 is printed when no ratio can be computed
    print(f"Speed difference: {(easy_time / paddle_time if paddle_time > 0 else 0):.2f}x")

    print("\nSample results saved to:")
    print(f" - {easy_sample}")
    print(f" - {paddle_sample}")

    print("OCR engine comparison complete")


def _time_ocr_engine(engine, label, sample_path):
    """Time one OCR engine on page one and save a sample of its results.

    Args:
        engine: Value passed to ``PDF(..., ocr_engine=...)``.
        label: Human-readable engine name used in the printed messages.
        sample_path: Text file that receives up to the first 20 results.

    Returns:
        A ``(element_count, elapsed_seconds)`` tuple.
    """
    import time  # local import, mirroring the original example

    print(f"\nUsing {label}...")
    pdf = PDF(
        PDF_FILE,
        ocr={"enabled": True, "languages": ["en"]},
        ocr_engine=engine,
    )
    try:
        page = pdf.pages[0]

        # perf_counter is monotonic, so the interval is immune to clock changes
        start_time = time.perf_counter()
        elements = page.extract_ocr_elements()
        elapsed = time.perf_counter() - start_time
        print(f"{label} found {len(elements)} text elements in {elapsed:.2f} seconds")

        # Explicit UTF-8: OCR text may contain characters that the platform's
        # default encoding cannot represent.
        with open(sample_path, "w", encoding="utf-8") as f:
            for i, elem in enumerate(elements[:20]):
                f.write(f"{i+1}. '{elem.text}' (Confidence: {elem.confidence:.2f})\n")
    finally:
        # Clean up, whether or not OCR or the file write succeeded
        pdf.close()

    return len(elements), elapsed
|
193
|
+
|
194
|
+
if __name__ == "__main__":
    # Only the availability probe is guarded by ``except ImportError``; an
    # ImportError raised later inside an example (e.g. a missing easyocr) must
    # not be misreported as "PaddleOCR is not installed".
    try:
        import paddleocr  # noqa: F401 -- imported only to check availability
    except ImportError:
        print("PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr")
    else:
        print("PaddleOCR is available, running examples...")

        # Map the optional command line argument to the example it selects
        available_examples = {
            "basic": basic_paddleocr_example,
            "advanced": advanced_paddleocr_example,
            "compare": ocr_engine_comparison,
        }

        try:
            if len(sys.argv) > 1:
                example = sys.argv[1].lower()
                runner = available_examples.get(example)
                if runner is None:
                    print(f"Unknown example: {example}")
                    print("Available examples: basic, advanced, compare")
                else:
                    runner()
            else:
                # Run all examples
                basic_paddleocr_example()
                advanced_paddleocr_example()
                ocr_engine_comparison()
        except Exception as e:
            # Top-level boundary of a demo script: report and show the traceback
            print(f"Error in PaddleOCR examples: {e}")
            import traceback
            traceback.print_exc()
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Example demonstrating the PageCollection functionality.
|
4
|
+
|
5
|
+
This example shows how to:
|
6
|
+
1. Access a specific range of pages using slicing
|
7
|
+
2. Extract text from multiple pages
|
8
|
+
3. Find elements across multiple pages
|
9
|
+
4. Get sections that span across page boundaries
|
10
|
+
|
11
|
+
Usage:
|
12
|
+
python examples/page_collection_example.py [path_to_pdf]
|
13
|
+
"""
|
14
|
+
|
15
|
+
import os
|
16
|
+
import sys
|
17
|
+
from pathlib import Path
|
18
|
+
|
19
|
+
# Add the parent directory to the path so we can import the package
|
20
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
21
|
+
|
22
|
+
from natural_pdf import PDF
|
23
|
+
|
24
|
+
# Use the provided PDF path or a default
|
25
|
+
pdf_path = sys.argv[1] if len(sys.argv) > 1 else "pdfs/Atlanta_Public_Schools_GA_sample.pdf"
|
26
|
+
|
27
|
+
def main():
    """Open ``pdf_path`` and run the page-collection demos it has pages for."""
    print(f"Opening {pdf_path}")

    with PDF(pdf_path) as pdf:
        page_count = len(pdf)
        print(f"PDF has {page_count} pages")

        if page_count >= 3:
            _demo_page_range(pdf)

        if page_count >= 2:
            _demo_find_across_pages(pdf)
            _demo_sections_across_pages(pdf)


def _page_numbers(collection):
    """Return the ``number`` of every page in a page collection."""
    return [p.number for p in collection.pages]


def _demo_page_range(pdf):
    """Example 1: slice a page range, extract its text, then slice it again."""
    print("\n1. Working with a range of pages:")
    # Slice 1:4 is 0-indexed, i.e. the second to fourth pages when present
    page_range = pdf.pages[1:4]
    print(f" Selected {len(page_range)} pages: {_page_numbers(page_range)}")

    # Extract text from the range
    text = page_range.extract_text()
    print(f" Extracted {len(text)} characters of text from pages {_page_numbers(page_range)}")

    # A page collection can itself be sliced
    if len(page_range) <= 1:
        return
    sub_range = page_range[0:2]
    print(f" Sub-range has {len(sub_range)} pages: {_page_numbers(sub_range)}")


def _demo_find_across_pages(pdf):
    """Example 2: run selectors over the first two pages at once."""
    print("\n2. Finding elements across multiple pages:")
    two_pages = pdf.pages[0:2]

    # Find all text elements
    text_elements = two_pages.find_all('text')
    print(f" Found {len(text_elements)} text elements across {len(two_pages)} pages")

    # Find the first heading-like element
    heading = two_pages.find('text[size>=12]')
    if heading:
        print(f" Found heading: '{heading.text}' on page {heading.page.number}")


def _demo_sections_across_pages(pdf):
    """Example 3: build sections over two pages, with and without page breaks."""
    print("\n3. Getting sections across pages:")
    two_pages = pdf.pages[0:2]

    # Large text marks a section start; sections may continue across pages
    sections = two_pages.get_sections(
        start_selector='text[size>=12]',
        new_section_on_page_break=False,
        boundary_inclusion='both',
    )
    print(f" Found {len(sections)} sections across {len(two_pages)} pages")

    # Print info about each section
    for number, section in enumerate(sections, start=1):
        print(f" Section {number}:")
        if getattr(section, 'start_element', None):
            print(f" Starts with: '{section.start_element.text}'")
            print(f" On page: {section.start_element.page.number}")

        text = section.extract_text()
        print(f" Contains {len(text)} characters of text")

        # Show a preview, truncated to 50 characters
        if len(text) > 50:
            preview = text[:50] + "..."
        else:
            preview = text
        print(f" Preview: {preview}")

    # Same selector, but force a new section at every page boundary
    sections_with_breaks = two_pages.get_sections(
        start_selector='text[size>=12]',
        new_section_on_page_break=True,
        boundary_inclusion='both',
    )
    print(f" With page breaks as boundaries: {len(sections_with_breaks)} sections")
|
101
|
+
|
102
|
+
if __name__ == "__main__":
|
103
|
+
main()
|
@@ -0,0 +1,83 @@
|
|
1
|
+
"""
|
2
|
+
Example showing the polygon highlighting capabilities for handling non-rectangular regions.
|
3
|
+
|
4
|
+
This example demonstrates how polygon-based OCR results are handled and visualized,
|
5
|
+
which is especially useful for skewed or rotated text in scanned documents.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
from natural_pdf import PDF
|
10
|
+
from natural_pdf.elements.region import Region
|
11
|
+
from PIL import Image
|
12
|
+
|
13
|
+
# Get the current directory of this script
|
14
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path (using a document that needs OCR)
default_pdf = os.path.join(root_dir, "pdfs", "needs-ocr.pdf")

# Check for command line arguments: [pdf_path] [page index]
pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf
page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 0

print(f"Loading PDF: {pdf_path}")
print(f"Using page: {page_num}")

# Load the PDF with OCR enabled
pdf = PDF(pdf_path, ocr=True)
try:
    page = pdf.pages[page_num]

    # Create a simulated polygon region to show polygon highlighting
    print("Creating polygon region...")
    polygon_points = [
        (100, 100),
        (300, 150),
        (250, 250),
        (120, 200),
    ]

    # Create a region with the polygon points; the tuple is the bounding box
    # that encloses them
    region = Region(page, (100, 100, 300, 250), polygon=polygon_points)
    region.highlight(color=(1, 0, 0, 0.5), label="Polygon Region")

    # Also extract and highlight text using OCR, which will use polygon detection
    print("Running OCR on the page...")
    ocr_elements = page.apply_ocr()
    print(f"Found {len(ocr_elements)} OCR text elements")

    # Highlight OCR elements with different colors based on confidence
    print("Highlighting OCR elements...")
    for elem in ocr_elements:
        if elem.confidence > 0.8:
            color = (0, 0.8, 0, 0.3)  # Green for high confidence
        elif elem.confidence > 0.5:
            color = (1, 0.8, 0, 0.3)  # Yellow for medium confidence
        else:
            color = (0.8, 0, 0, 0.3)  # Red for low confidence

        elem.highlight(color=color)

    # Save the result
    output_path = os.path.join(root_dir, "output", "polygon_highlight_example.png")
    # Create the output directory first so the save does not depend on it
    # already existing
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    print(f"Saving highlighted image to {output_path}")
    page.to_image(path=output_path, show_labels=True)

    # Print some information about the elements
    print("\nPolygon support details:")

    # Check if any OCR elements have polygon data
    polygon_elements = [elem for elem in ocr_elements if hasattr(elem, 'has_polygon') and elem.has_polygon]
    print(f"- Found {len(polygon_elements)} elements with polygon data")

    # Display details of the first few polygon elements
    if polygon_elements:
        for i, elem in enumerate(polygon_elements[:3]):
            print(f"\nElement {i+1}:")
            print(f"- Text: '{elem.text}'")
            print(f"- Confidence: {elem.confidence:.2f}")
            print(f"- Bounding box: {elem.bbox}")
            print(f"- Polygon points: {elem.polygon[:2]}... ({len(elem.polygon)} points)")

        if len(polygon_elements) > 3:
            print(f"... and {len(polygon_elements) - 3} more")
finally:
    # Release the PDF even if OCR or image export failed
    pdf.close()
|
@@ -0,0 +1,134 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating positional methods in ElementCollection.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def main():
    """Highlight the extreme lines, rectangles and text on a PDF's first page.

    The PDF path comes from ``sys.argv[1]``; otherwise the first ``*.pdf``
    (in sorted order) from the examples directory, or failing that from the
    sibling ``pdfs`` directory, is used. Exits with status 1 when no PDF is
    found. The annotated page is saved to
    ``position_output/position_methods.png`` next to this script.
    """
    # Get the PDF path from command line or use a default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Look for any PDF in the examples directory or pdfs directory.
        # glob() yields files in filesystem order, so sort for a stable pick.
        example_dir = Path(__file__).parent
        pdf_files = sorted(example_dir.glob("*.pdf"))

        if not pdf_files:
            pdfs_dir = example_dir.parent / "pdfs"
            if pdfs_dir.exists():
                pdf_files = sorted(pdfs_dir.glob("*.pdf"))

        if pdf_files:
            pdf_path = str(pdf_files[0])
        else:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)

    print(f"Using PDF: {pdf_path}")

    # Open the PDF; the context manager closes it even if a step below raises
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Find different element types
        lines = page.find_all('line')
        rects = page.find_all('rect')
        text = page.find_all('text')

        # Clear any existing highlights
        page.clear_highlights()

        # Highlight the page corners for reference
        page.create_region(0, 0, 10, 10).highlight(label="Top-Left Corner")
        page.create_region(page.width-10, 0, page.width, 10).highlight(label="Top-Right Corner")
        page.create_region(0, page.height-10, 10, page.height).highlight(label="Bottom-Left Corner")
        page.create_region(page.width-10, page.height-10, page.width, page.height).highlight(label="Bottom-Right Corner")

        # Demonstrate the position methods for each element type
        demos = (
            ("Lines", lines, "line", lambda e: f"{e.bbox}"),
            ("Rectangles", rects, "rectangle", lambda e: f"{e.bbox}"),
            ("Text elements", text, "text", lambda e: f"'{e.text}' at {e.bbox}"),
        )
        for heading, collection, noun, describe in demos:
            print(f"\n{heading} found: {len(collection)}")
            if len(collection) > 0:
                _report_extremes(collection, noun, describe)

        # Create an output directory
        output_dir = Path(__file__).parent / "position_output"
        output_dir.mkdir(exist_ok=True)

        # Save the result
        page.to_image(path=str(output_dir / "position_methods.png"), show_labels=True)

        # Demonstrate error handling for multi-page collections
        if len(pdf.pages) > 1:
            print("\nTesting multi-page error handling:")
            multi_collection = pdf.pages.find_all('text')
            try:
                multi_collection.lowest()
                print("ERROR: Should have raised ValueError for multi-page collection")
            except ValueError as e:
                print(f"Correctly raised ValueError: {e}")

    print("\nExample completed. Check 'position_output/position_methods.png' for the result.")


def _report_extremes(collection, noun, describe):
    """Print and highlight the four extreme elements of ``collection``.

    Args:
        collection: Non-empty element collection providing ``highest()``,
            ``lowest()``, ``leftmost()`` and ``rightmost()``.
        noun: Lower-case element name used in the printed lines; its
            capitalised form is used in the highlight labels.
        describe: Callable turning an element into the text printed for it.
    """
    # Resolve all four extremes before printing or highlighting anything
    extremes = [
        ("Highest", collection.highest()),
        ("Lowest", collection.lowest()),
        ("Leftmost", collection.leftmost()),
        ("Rightmost", collection.rightmost()),
    ]

    for position, element in extremes:
        print(f"{position} {noun}: {describe(element)}")

    # Highlight the extreme elements
    for position, element in extremes:
        element.highlight(label=f"{position} {noun.capitalize()}")
|
131
|
+
|
132
|
+
|
133
|
+
if __name__ == "__main__":
|
134
|
+
main()
|
@@ -0,0 +1,73 @@
|
|
1
|
+
"""
|
2
|
+
Test the modified region boundary logic with below() and above() method fixes.
|
3
|
+
|
4
|
+
This example tests that the .below() and .above() methods correctly exclude
|
5
|
+
the source element with the new 1-pixel offset.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import os
|
9
|
+
import sys
|
10
|
+
import argparse
|
11
|
+
|
12
|
+
# Add parent directory to path to run without installing
|
13
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
14
|
+
|
15
|
+
from natural_pdf import PDF
|
16
|
+
|
17
|
+
def main():
    """Check that ``.below()`` excludes the element it was created from.

    Takes an optional PDF path on the command line, finds a text element on
    the first page (one containing "Price", else the first bold one), builds
    the region below it and an expanded copy, reports which text elements
    fall inside each, and saves a highlighted image to
    ``output/region_boundary_test.png`` relative to the working directory.
    """
    parser = argparse.ArgumentParser(description="Test region boundaries")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    args = parser.parse_args()

    print(f"Testing with PDF: {args.pdf_path}")

    # Open the PDF; the context manager also closes it on the early return
    with PDF(args.pdf_path) as pdf:
        page = pdf.pages[0]

        # Find a text element to test with
        title = page.find('text:contains("Price")')
        if not title:
            title = page.find('text:bold')

        if not title:
            print("Couldn't find a suitable test element. Please provide a PDF with text elements.")
            return

        print(f"Found element: '{title.text}' at position {title.bbox}")

        # Create region below the element
        region_below = title.below(height=16, width="element")

        # Check if the element is in the region (it shouldn't be)
        elements_in_region = region_below.find_all('text')

        # Print the region and elements found in it
        print(f"\nRegion below: {region_below.bbox}")
        print(f"Number of elements in region: {len(elements_in_region)}")

        # Check specifically if the source element is in the region
        is_source_in_region = title in elements_in_region
        print(f"Source element is in region: {is_source_in_region}")

        # Expand the region and check again
        expanded_region = region_below.expand(right=40)
        elements_in_expanded = expanded_region.find_all('text')

        print(f"\nExpanded region: {expanded_region.bbox}")
        print(f"Number of elements in expanded region: {len(elements_in_expanded)}")
        print(f"Elements text: {[e.text for e in elements_in_expanded]}")

        # Highlight the regions to visualize
        title.highlight(color=(1, 0, 0, 0.3), label="Source")
        region_below.highlight(color=(0, 1, 0, 0.3), label="Below")
        expanded_region.highlight(color=(0, 0, 1, 0.3), label="Expanded")

        # Save the image
        os.makedirs("output", exist_ok=True)
        page.save_image("output/region_boundary_test.png")
        print("\nSaved visualization to output/region_boundary_test.png")
|
71
|
+
|
72
|
+
if __name__ == "__main__":
|
73
|
+
main()
|