natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,97 @@
|
|
1
|
+
"""
|
2
|
+
Test to ensure OCR is disabled by default.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def _report_extraction(pdf, config_label, how, **extract_kwargs):
    """Print the OCR settings of ``pdf`` and a preview of its first page's text.

    Args:
        pdf: An open ``PDF`` object.
        config_label: Text printed in front of the OCR config dump.
        how: Tail of the "Extracted N characters ..." line.
        **extract_kwargs: Forwarded unchanged to ``page.extract_text``.
    """
    print(f"{config_label}: {pdf._ocr_config}")
    print(f"OCR enabled? {pdf._ocr_config.get('enabled', False)}")

    first_page = pdf.pages[0]
    extracted = first_page.extract_text(**extract_kwargs)

    print(f"Extracted {len(extracted)} characters {how}")
    print(f"First 100 chars: {extracted[:100]}...")


def test_ocr_default():
    """Print what text extraction returns with OCR off and with each way of enabling it.

    Four scenarios are run against the first page of a sample PDF: the
    default constructor, ``ocr=True`` in the constructor, ``ocr=True`` passed
    to ``extract_text``, and configuration through ``with_ocr``. Nothing is
    asserted; the OCR config and a text preview are printed for each case.
    Returns early (after printing a message) when no sample PDF exists.
    """
    pdfs_dir = os.path.join(os.path.dirname(__file__), '..', 'pdfs')

    # Prefer the scanned document; otherwise fall back to the second sample.
    pdf_path = None
    for file_name in ('needs-ocr.pdf', 'HARRY ROQUE_redacted.pdf'):
        candidate = os.path.abspath(os.path.join(pdfs_dir, file_name))
        if os.path.exists(candidate):
            pdf_path = candidate
            break

    if pdf_path is None:
        print("No suitable PDF file found for OCR testing. Please provide a scanned PDF file.")
        return

    print(f"Testing with PDF: {pdf_path}")

    rule = "-" * 60

    # Scenario 1: nothing OCR-related is passed anywhere
    print("\nTEST 1: Default Behavior (OCR should be OFF)")
    print(rule)
    with PDF(pdf_path) as pdf:
        _report_extraction(pdf, "Initial OCR config", "without explicit OCR")

    # Scenario 2: OCR switched on when the PDF is constructed
    print("\nTEST 2: Explicit OCR Enable via Constructor")
    print(rule)
    with PDF(pdf_path, ocr=True) as pdf:
        _report_extraction(pdf, "OCR config", "with OCR enabled in constructor")

    # Scenario 3: OCR requested only for this extract_text call
    print("\nTEST 3: Explicit OCR Enable via extract_text parameter")
    print(rule)
    with PDF(pdf_path) as pdf:
        _report_extraction(
            pdf, "Initial OCR config", "with OCR enabled in extract_text", ocr=True
        )

    # Scenario 4: OCR configured through the with_ocr builder before extracting
    print("\nTEST 4: OCR via with_ocr builder")
    print(rule)
    with PDF(pdf_path) as pdf:
        pdf.with_ocr(enabled=True, languages=["en"])
        _report_extraction(pdf, "Updated OCR config", "with OCR enabled via builder")
|
95
|
+
|
96
|
+
if __name__ == "__main__":
|
97
|
+
test_ocr_default()
|
@@ -0,0 +1,235 @@
|
|
1
|
+
"""
|
2
|
+
OCR Engine Comparison Example.
|
3
|
+
|
4
|
+
This example compares the performance of different OCR engines with natural-pdf.
|
5
|
+
|
6
|
+
Requires both EasyOCR and PaddleOCR to be installed:
|
7
|
+
pip install easyocr
|
8
|
+
pip install paddlepaddle paddleocr
|
9
|
+
"""
|
10
|
+
import os
|
11
|
+
import sys
|
12
|
+
import time
|
13
|
+
from pathlib import Path
|
14
|
+
|
15
|
+
# Add parent directory to path for imports
|
16
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
17
|
+
|
18
|
+
from natural_pdf import PDF
|
19
|
+
from natural_pdf.ocr import EasyOCREngine, PaddleOCREngine
|
20
|
+
|
21
|
+
# Get the current directory of this script
|
22
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
|
23
|
+
# Get the parent directory (project root)
|
24
|
+
root_dir = os.path.dirname(script_dir)
|
25
|
+
# Default PDF path (replace with a scanned document path for better results)
|
26
|
+
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
|
27
|
+
# Output directory
|
28
|
+
output_dir = os.path.join(root_dir, "output")
|
29
|
+
os.makedirs(output_dir, exist_ok=True)
|
30
|
+
|
31
|
+
print("OCR Engine Comparison")
|
32
|
+
print("====================")
|
33
|
+
|
34
|
+
# Check if both OCR engines are available
|
35
|
+
easyocr_available = False
|
36
|
+
paddleocr_available = False
|
37
|
+
|
38
|
+
try:
|
39
|
+
import easyocr
|
40
|
+
easyocr_available = True
|
41
|
+
print("EasyOCR is available.")
|
42
|
+
except ImportError:
|
43
|
+
print("EasyOCR is not available. Some comparisons will be skipped.")
|
44
|
+
|
45
|
+
try:
|
46
|
+
import paddleocr
|
47
|
+
import paddle
|
48
|
+
paddleocr_available = True
|
49
|
+
print("PaddleOCR is available.")
|
50
|
+
except ImportError:
|
51
|
+
print("PaddleOCR is not available. Some comparisons will be skipped.")
|
52
|
+
|
53
|
+
if not easyocr_available and not paddleocr_available:
|
54
|
+
print("No OCR engines available. Please install at least one OCR engine.")
|
55
|
+
sys.exit(1)
|
56
|
+
|
57
|
+
# Common OCR configuration for fair comparison
|
58
|
+
ocr_config = {
|
59
|
+
"languages": ["en"],
|
60
|
+
"device": "cpu",
|
61
|
+
"min_confidence": 0.3
|
62
|
+
}
|
63
|
+
|
64
|
+
# Set up testing information
|
65
|
+
engines = []
|
66
|
+
if easyocr_available:
|
67
|
+
engines.append(("EasyOCR", "easyocr"))
|
68
|
+
if paddleocr_available:
|
69
|
+
engines.append(("PaddleOCR", "paddleocr"))
|
70
|
+
|
71
|
+
# Function to run OCR with an engine and measure performance
|
72
|
+
def _confidence_color(confidence):
    """Map an OCR confidence score to the RGBA tuple used for its highlight."""
    if confidence >= 0.7:
        return (0, 204, 0, 76)  # Green for high confidence
    if confidence >= 0.5:
        return (230, 230, 0, 76)  # Yellow for medium confidence
    return (204, 0, 0, 76)  # Red for low confidence


def test_engine(engine_name, engine_id, page_number=0):
    """Run one OCR engine over a page of ``default_pdf`` and collect statistics.

    Args:
        engine_name: Display name; used in messages, as the highlight label
            and (lower-cased) in the output image file name.
        engine_id: Value passed to ``PDF`` as ``ocr_engine``.
        page_number: Index into ``pdf.pages`` of the page to process.

    Returns:
        A dict with the keys ``engine``, ``extraction_time``, ``text_length``,
        ``element_count``, ``avg_confidence`` and ``output_path``. If anything
        raises, the traceback is printed and a dict with zeroed statistics,
        ``output_path`` set to ``"error"`` and an extra ``error`` key holding
        the exception text is returned instead.
    """
    print(f"\nTesting {engine_name}:")

    try:
        # The timer covers loading the PDF as well as the OCR/text extraction.
        start_time = time.time()

        print(f" Loading PDF with {engine_name} engine...")
        # Open the PDF as a context manager so it is closed both on success
        # and when one of the steps below raises.
        with PDF(default_pdf, ocr_engine=engine_id, ocr=ocr_config) as pdf:
            print(f" Accessing page {page_number}...")
            page = pdf.pages[page_number]

            # Check if OCR is properly configured
            if hasattr(pdf, '_ocr_engine'):
                print(f" OCR engine: {pdf._ocr_engine.__class__.__name__}")
                print(f" OCR config: {pdf._ocr_config}")
            else:
                print(" Warning: PDF does not have _ocr_engine attribute")

            # Force OCR explicitly
            print(" Extracting OCR elements explicitly...")
            ocr_elements = page.extract_ocr_elements()
            print(f" Found {len(ocr_elements)} OCR elements")

            if len(ocr_elements) == 0:
                # Nothing came back: retry through extract_text with OCR forced
                print(" Warning: No OCR elements found - trying to debug")
                print(" Trying page.extract_text(ocr=True)...")
                text = page.extract_text(ocr=True)
                print(f" Extract_text with ocr=True returned {len(text)} characters")
            else:
                print(" Extracting text...")
                text = page.extract_text()
                print(f" Extracted {len(text)} characters")

            extraction_time = time.time() - start_time

            # Mean confidence over all elements; 0 when there are none
            avg_confidence = (
                sum(elem.confidence for elem in ocr_elements) / len(ocr_elements)
                if ocr_elements
                else 0
            )

            # Highlight every element, colored by its confidence
            print(" Creating highlighted image...")
            page.clear_highlights()
            for elem in ocr_elements:
                elem.highlight(
                    label=f"{engine_name}",
                    color=_confidence_color(elem.confidence),
                )

            # Save the image while the PDF is still open
            output_path = os.path.join(output_dir, f"{engine_name.lower()}_results.png")
            page.to_image(path=output_path, show_labels=True)

            return {
                "engine": engine_name,
                "extraction_time": extraction_time,
                "text_length": len(text),
                "element_count": len(ocr_elements),
                "avg_confidence": avg_confidence,
                "output_path": output_path,
            }

    except Exception as e:
        # Deliberately broad: one failing engine must not stop the comparison.
        print(f" Error during {engine_name} test: {e}")
        import traceback
        traceback.print_exc()
        return {
            "engine": engine_name,
            "extraction_time": 0,
            "text_length": 0,
            "element_count": 0,
            "avg_confidence": 0,
            "output_path": "error",
            "error": str(e),
        }
|
156
|
+
|
157
|
+
# Run tests for each available engine
|
158
|
+
results = []
|
159
|
+
for engine_name, engine_id in engines:
|
160
|
+
result = test_engine(engine_name, engine_id)
|
161
|
+
results.append(result)
|
162
|
+
|
163
|
+
# Print some stats
|
164
|
+
print(f" Extraction time: {result['extraction_time']:.2f} seconds")
|
165
|
+
print(f" Text length: {result['text_length']} characters")
|
166
|
+
print(f" Element count: {result['element_count']} elements")
|
167
|
+
print(f" Average confidence: {result['avg_confidence']:.2f}")
|
168
|
+
print(f" Output image: {result['output_path']}")
|
169
|
+
|
170
|
+
# Compare results
|
171
|
+
if len(results) > 1:
    print("\nComparison Results:")
    print(f"{'Engine':<10} {'Time (s)':<10} {'Text Len':<10} {'Elements':<10} {'Avg Conf':<10}")
    print("-" * 60)
    for row in results:
        # Pad the time column like the others so the rows line up with the header.
        elapsed = f"{row['extraction_time']:.2f}s"
        print(f"{row['engine']:<10} {elapsed:<10} {row['text_length']:<10} {row['element_count']:<10} {row['avg_confidence']:.2f}")

    # test_engine() reports a failed run with zeroed statistics plus an
    # "error" key. Rank only the runs that succeeded, otherwise a crashed
    # engine (extraction_time == 0) would be announced as the fastest one.
    successful = [row for row in results if "error" not in row]
    for row in results:
        if "error" in row:
            print(f"{row['engine']} failed and is excluded from the rankings: {row['error']}")

    if successful:
        # Highlight differences
        fastest = min(successful, key=lambda x: x['extraction_time'])
        most_elements = max(successful, key=lambda x: x['element_count'])
        highest_confidence = max(successful, key=lambda x: x['avg_confidence'])

        print(f"\nFastest engine: {fastest['engine']} ({fastest['extraction_time']:.2f}s)")
        print(f"Most elements: {most_elements['engine']} ({most_elements['element_count']} elements)")
        print(f"Highest confidence: {highest_confidence['engine']} ({highest_confidence['avg_confidence']:.2f})")
|
186
|
+
|
187
|
+
# Additional comparison with engine-specific optimizations
|
188
|
+
print("\nRunning comparison with engine-specific optimizations:")
|
189
|
+
|
190
|
+
# Custom configurations for each engine
|
191
|
+
if easyocr_available and paddleocr_available:
    # EasyOCR with customized settings
    easyocr_settings = {
        "languages": ["en"],
        "device": "cpu",
        "min_confidence": 0.3,
        "model_settings": {
            "detail": 1,
            "paragraph": False,
            "contrast_ths": 0.05,
            "text_threshold": 0.5,
        },
    }

    # PaddleOCR with customized settings
    paddleocr_settings = {
        "languages": ["en"],
        "device": "cpu",
        "min_confidence": 0.3,
        "model_settings": {
            "use_angle_cls": True,
            "det_db_thresh": 0.2,
            "det_db_box_thresh": 0.3,
        },
    }

    # Open both documents as context managers so they are closed once the
    # comparison is finished (or if one of the extraction calls raises).
    with PDF(default_pdf, ocr_engine="easyocr", ocr=easyocr_settings) as easyocr_custom, \
            PDF(default_pdf, ocr_engine="paddleocr", ocr=paddleocr_settings) as paddleocr_custom:
        # Compare text extraction
        easyocr_text = easyocr_custom.pages[0].extract_text()
        paddleocr_text = paddleocr_custom.pages[0].extract_text()

        print(f"\nOptimized EasyOCR text length: {len(easyocr_text)}")
        print(f"Optimized PaddleOCR text length: {len(paddleocr_text)}")

        # Compare element counts
        easyocr_elements = easyocr_custom.pages[0].extract_ocr_elements()
        paddleocr_elements = paddleocr_custom.pages[0].extract_ocr_elements()

        print(f"Optimized EasyOCR element count: {len(easyocr_elements)}")
        print(f"Optimized PaddleOCR element count: {len(paddleocr_elements)}")
|
234
|
+
|
235
|
+
print("\nDone!")
|
examples/ocr_example.py
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
"""
|
2
|
+
OCR example using PaddleOCR.
|
3
|
+
|
4
|
+
This example demonstrates how to use OCR to extract text from PDF documents,
|
5
|
+
both for whole pages and specific regions.
|
6
|
+
|
7
|
+
Note: This example requires the 'paddleocr' package:
|
8
|
+
pip install paddlepaddle paddleocr
|
9
|
+
"""
|
10
|
+
import os
|
11
|
+
import sys
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
# Get the current directory of this script
|
15
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
|
16
|
+
# Get the parent directory (project root)
|
17
|
+
root_dir = os.path.dirname(script_dir)
|
18
|
+
# Default PDF path (replace with a scanned document path for better results)
|
19
|
+
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
|
20
|
+
# Output directory
|
21
|
+
output_dir = os.path.join(root_dir, "output")
|
22
|
+
os.makedirs(output_dir, exist_ok=True)
|
23
|
+
|
24
|
+
print("OCR Example")
print("==========")

# 1. Open the document with OCR in "auto" mode
print("\n1. Loading PDF with OCR enabled")
ocr_settings = {
    "enabled": "auto",  # Auto mode: only use OCR when necessary
    "languages": ["en"],
    # For more options, see OCR-NOTES.md
}
pdf = PDF(default_pdf, ocr=ocr_settings)

# 2. Plain text extraction from the first page (OCR setting: auto)
page = pdf.pages[0]
print(f"\n2. Extracting text from page {page.number} with auto OCR")
text = page.extract_text()
print(f"Extracted {len(text)} characters.")
# Only long results get truncated and an ellipsis
text_preview = text[:150] + "..." if len(text) > 150 else text
print("First 150 characters:\n", text_preview)

# 3. Same page again, this time passing ocr=True
print("\n3. Force OCR on a page")
ocr_text = page.extract_text(ocr=True)  # Force OCR regardless of existing text
print(f"Extracted {len(ocr_text)} characters with forced OCR.")
ocr_preview = ocr_text[:150] + "..." if len(ocr_text) > 150 else ocr_text
print("First 150 characters:\n", ocr_preview)

# 4. Work with the individual OCR elements
print("\n4. Extracting OCR elements directly")
ocr_elements = page.extract_ocr_elements()
print(f"Found {len(ocr_elements)} OCR text elements.")
for position, elem in enumerate(ocr_elements[:3], start=1):  # first 3 only
    print(f" Element {position}: '{elem.text}' (confidence: {elem.confidence:.2f})")

# 5. OCR restricted to a rectangular region
print("\n5. Applying OCR to a specific region")
# Region coordinates are x0, y0, x1, y1 (adjust them for your PDF)
region = page.create_region(100, 100, 400, 200)
region.highlight(label="OCR Region")

region_elements = region.apply_ocr()
print(f"Found {len(region_elements)} OCR text elements in the region.")

# Text of the region, read after apply_ocr() has run on it
region_text = region.extract_text()
if len(region_text) > 50:
    print(f"Region text: '{region_text[:50]}...'")
else:
    print(f"Region text: '{region_text}'")

# 6. Selector queries against OCR-sourced text
print("\n6. Finding OCR text elements with selectors")
# Attribute filters: source and a minimum confidence
high_confidence_ocr = page.find_all('text[source=ocr][confidence>=0.8]')
print(f"Found {len(high_confidence_ocr)} high-confidence OCR elements.")

# Attribute filter combined with a text match
matching_ocr = page.find_all('text[source=ocr]:contains("the")')
print(f"Found {len(matching_ocr)} OCR elements containing 'the'.")

# 7. Render the page with every OCR element highlighted
print("\n7. Visualizing OCR results")
for elem in ocr_elements:
    elem.highlight(label=f"OCR ({elem.confidence:.2f})")

output_path = os.path.join(output_dir, "ocr_results.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved visualization to {output_path}")
print("\nDone!")
|
@@ -0,0 +1,79 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
# Add parent directory to path for imports
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
7
|
+
|
8
|
+
from natural_pdf import PDF
|
9
|
+
|
10
|
+
# Get absolute path for the PDF
|
11
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
|
12
|
+
root_dir = os.path.dirname(script_dir)
|
13
|
+
pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
|
14
|
+
|
15
|
+
print(f"Loading PDF: {pdf_path}")
|
16
|
+
|
17
|
+
# Example 1: Initialize PDF with flattened OCR parameters
|
18
|
+
pdf = PDF(pdf_path, ocr={
|
19
|
+
"enabled": True,
|
20
|
+
"languages": ["en"],
|
21
|
+
"min_confidence": 0.3,
|
22
|
+
# OCR parameters directly in config root:
|
23
|
+
"text_threshold": 0.1, # Was previously in detection_params
|
24
|
+
"link_threshold": 0.1, # Was previously in detection_params
|
25
|
+
"paragraph": True, # Was previously in recognition_params
|
26
|
+
"detail": 1 # Was previously in recognition_params
|
27
|
+
})
|
28
|
+
|
29
|
+
# Use a specific page
|
30
|
+
page = pdf.pages[3]
|
31
|
+
|
32
|
+
# Example 2: Apply OCR with flattened parameters
|
33
|
+
print("\nApplying OCR with flattened parameters")
|
34
|
+
ocr_elements = page.apply_ocr(
|
35
|
+
# Direct parameters:
|
36
|
+
text_threshold=0.15,
|
37
|
+
link_threshold=0.15,
|
38
|
+
mag_ratio=1.5,
|
39
|
+
canvas_size=1024,
|
40
|
+
batch_size=4
|
41
|
+
)
|
42
|
+
|
43
|
+
print(f"Found {len(ocr_elements)} OCR text elements")
|
44
|
+
|
45
|
+
# Print sample of OCR results
|
46
|
+
print("\nSample OCR results:")
|
47
|
+
# The [:5] slice already caps the sample at five entries, so the loop needs
# no separate stop condition; numbering starts at 1 for display.
for position, elem in enumerate(ocr_elements[:5], start=1):
    print(f"{position}. '{elem.text}' (conf: {elem.confidence:.2f})")
|
51
|
+
|
52
|
+
# Example 3: Extract text with OCR using flattened parameters
|
53
|
+
print("\nExtracting text with OCR using flattened parameters")
|
54
|
+
text = page.extract_text(ocr={
|
55
|
+
"enabled": True,
|
56
|
+
"min_confidence": 0.2,
|
57
|
+
# Direct parameters:
|
58
|
+
"text_threshold": 0.2,
|
59
|
+
"contrast_ths": 0.05
|
60
|
+
})
|
61
|
+
|
62
|
+
# Display first 100 characters of text
|
63
|
+
print(f"\nExtracted text (first 100 chars):")
|
64
|
+
print(text[:100] + "...")
|
65
|
+
|
66
|
+
# Create output directory if it doesn't exist
|
67
|
+
output_dir = os.path.join(root_dir, "output")
|
68
|
+
os.makedirs(output_dir, exist_ok=True)
|
69
|
+
|
70
|
+
# Highlight OCR elements
|
71
|
+
for elem in ocr_elements[:10]:
|
72
|
+
elem.highlight(label=f"OCR: {elem.text}")
|
73
|
+
|
74
|
+
# Save image
|
75
|
+
output_path = os.path.join(output_dir, "ocr_simplified.png")
|
76
|
+
print(f"\nSaving highlighted image to: {output_path}")
|
77
|
+
page.to_image(path=output_path, show_labels=True)
|
78
|
+
|
79
|
+
print("\nTest completed successfully!")
|
@@ -0,0 +1,102 @@
|
|
1
|
+
"""
|
2
|
+
OCR Visualization Example
|
3
|
+
|
4
|
+
This example demonstrates the new OCR visualization feature that renders
|
5
|
+
OCR text with white background boxes on the image.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Add project directory to the path to import the library
|
11
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
|
12
|
+
root_dir = os.path.dirname(script_dir)
|
13
|
+
sys.path.insert(0, root_dir)
|
14
|
+
|
15
|
+
from natural_pdf import PDF
|
16
|
+
|
17
|
+
# Get paths
|
18
|
+
default_pdf = os.path.join(root_dir, "pdfs", "needs-ocr.pdf")
|
19
|
+
if not os.path.exists(default_pdf):
|
20
|
+
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")
|
21
|
+
|
22
|
+
# Output directory
|
23
|
+
output_dir = os.path.join(root_dir, "output")
|
24
|
+
os.makedirs(output_dir, exist_ok=True)
|
25
|
+
|
26
|
+
def _highlight_color(confidence):
    """Map an OCR confidence score to the RGBA tuple used for its highlight."""
    if confidence >= 0.8:
        return (0, 1, 0, 0.3)  # Green for high confidence
    if confidence >= 0.5:
        return (1, 1, 0, 0.3)  # Yellow for medium confidence
    return (1, 0, 0, 0.3)  # Red for low confidence


def main():
    """Run OCR on the first page of ``default_pdf`` and save three renderings.

    The page's OCR elements are highlighted by confidence, then the page is
    written to ``output_dir`` three times: with highlights only
    (``render_ocr=False``), with highlights plus rendered OCR text, and,
    when OCR elements were found, with the highlights cleared and only
    ``render_ocr=True``. A ``ValueError`` from either OCR rendering is
    reported and does not stop the example.
    """
    print("OCR Visualization Example")
    print("=========================")

    # 1. Load a PDF with OCR enabled
    print("\n1. Loading PDF with OCR enabled")
    ocr_settings = {
        "enabled": True,
        "languages": ["en"],
        "min_confidence": 0.3,  # Lower confidence to get more results
    }
    # Context manager: the PDF is closed when the example finishes or fails.
    with PDF(default_pdf, ocr=ocr_settings) as pdf:
        # 2. First check if we have OCR text by extracting text with OCR
        print("\n2. Extracting text with OCR")
        page = pdf.pages[0]
        text = page.extract_text(ocr=True)  # Force OCR
        print(f"Extracted {len(text)} characters with OCR")

        # 3. Find OCR text elements
        print("\n3. Finding OCR text elements")
        ocr_elements = page.find_all('text[source=ocr]')
        print(f"Found {len(ocr_elements)} OCR text elements on the page")

        # If the selector found nothing, ask the page for OCR elements directly
        if not ocr_elements:
            print("No OCR elements found. Running OCR directly...")
            ocr_elements = page.extract_ocr_elements()
            print(f"Found {len(ocr_elements)} OCR text elements from direct extraction")

        # 4. Highlight the OCR elements, colored by confidence
        print(f"\n4. Highlighting {len(ocr_elements)} OCR elements")
        for element in ocr_elements:
            # Elements without a confidence attribute are treated as 0.5
            confidence = getattr(element, 'confidence', 0.5)
            element.highlight(
                color=_highlight_color(confidence),
                label=f"OCR ({confidence:.2f})",
            )

        # 5. Visualize the regular highlights (no OCR text)
        print("\n5. Saving highlighted image without OCR text")
        highlighted_path = os.path.join(output_dir, "ocr_highlighted.png")
        page.to_image(path=highlighted_path, show_labels=True, render_ocr=False)
        print(f"Saved highlighted image to {highlighted_path}")

        # 6. Same rendering, this time with render_ocr=True
        print("\n6. Saving image with rendered OCR text")
        ocr_text_path = os.path.join(output_dir, "ocr_rendered_text.png")
        try:
            page.to_image(path=ocr_text_path, show_labels=True, render_ocr=True)
            print(f"Saved OCR text rendering to {ocr_text_path}")
        except ValueError as e:
            print(f"Error rendering OCR text: {e}")

        # 7. Render again with the highlights cleared and render_ocr=True
        if ocr_elements:
            print("\n7. Creating clean white page with just OCR text")
            page.clear_highlights()
            clean_text_path = os.path.join(output_dir, "ocr_clean_text.png")
            try:
                page.to_image(path=clean_text_path, render_ocr=True)
                print(f"Saved clean OCR text rendering to {clean_text_path}")
            except ValueError as e:
                print(f"Error rendering clean OCR text: {e}")

    print("\nDone!")
|
100
|
+
|
101
|
+
if __name__ == "__main__":
|
102
|
+
main()
|