natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,97 @@
1
+ """
2
+ Test to ensure OCR is disabled by default.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def _locate_sample_pdf():
    """Return the absolute path of the first sample PDF that exists, else None.

    ``needs-ocr.pdf`` is tried first and ``HARRY ROQUE_redacted.pdf`` second;
    both are looked up in ``../pdfs`` relative to this file.
    """
    pdf_dir = os.path.join(os.path.dirname(__file__), '..', 'pdfs')
    for filename in ('needs-ocr.pdf', 'HARRY ROQUE_redacted.pdf'):
        candidate = os.path.abspath(os.path.join(pdf_dir, filename))
        if os.path.exists(candidate):
            return candidate
    return None


def _print_section(title):
    """Print a section title followed by a 60-character rule."""
    print(f"\n{title}")
    print("-" * 60)


def _report_first_page(pdf, config_label, how, **extract_kwargs):
    """Print the OCR config of *pdf* and a preview of its first page's text.

    Args:
        pdf: An open ``PDF`` object.
        config_label: Text printed before the OCR config dump.
        how: Suffix describing how OCR was (or was not) enabled.
        **extract_kwargs: Forwarded unchanged to ``page.extract_text``.
    """
    print(f"{config_label}: {pdf._ocr_config}")
    print(f"OCR enabled? {pdf._ocr_config.get('enabled', False)}")

    text = pdf.pages[0].extract_text(**extract_kwargs)

    print(f"Extracted {len(text)} characters {how}")
    print(f"First 100 chars: {text[:100]}...")


def test_ocr_default():
    """Show OCR behaviour by default and with each way of enabling it.

    Runs four scenarios against the first page of a sample PDF: default
    construction, ``ocr=True`` in the constructor, ``ocr=True`` passed to
    ``extract_text``, and configuration through ``with_ocr``. Results are
    printed, not asserted. Returns early (after printing a notice) when no
    sample PDF is available.
    """
    pdf_path = _locate_sample_pdf()
    if pdf_path is None:
        print("No suitable PDF file found for OCR testing. Please provide a scanned PDF file.")
        return

    print(f"Testing with PDF: {pdf_path}")

    # Test 1: OCR should be OFF by default
    _print_section("TEST 1: Default Behavior (OCR should be OFF)")
    with PDF(pdf_path) as pdf:
        _report_first_page(pdf, "Initial OCR config", "without explicit OCR")

    # Test 2: Explicit OCR enable via constructor
    _print_section("TEST 2: Explicit OCR Enable via Constructor")
    with PDF(pdf_path, ocr=True) as pdf:
        _report_first_page(pdf, "OCR config", "with OCR enabled in constructor")

    # Test 3: Explicit OCR enable via extract_text parameter
    _print_section("TEST 3: Explicit OCR Enable via extract_text parameter")
    with PDF(pdf_path) as pdf:
        _report_first_page(
            pdf, "Initial OCR config", "with OCR enabled in extract_text", ocr=True
        )

    # Test 4: OCR via with_ocr builder
    _print_section("TEST 4: OCR via with_ocr builder")
    with PDF(pdf_path) as pdf:
        # Configure OCR before reporting so the printed config is the updated one.
        pdf.with_ocr(enabled=True, languages=["en"])
        _report_first_page(pdf, "Updated OCR config", "with OCR enabled via builder")


if __name__ == "__main__":
    test_ocr_default()
@@ -0,0 +1,235 @@
1
+ """
2
+ OCR Engine Comparison Example.
3
+
4
+ This example compares the performance of different OCR engines with natural-pdf.
5
+
6
+ Requires both EasyOCR and PaddleOCR to be installed:
7
+ pip install easyocr
8
+ pip install paddlepaddle paddleocr
9
+ """
10
+ import os
11
+ import sys
12
+ import time
13
+ from pathlib import Path
14
+
15
+ # Add parent directory to path for imports
16
+ sys.path.insert(0, str(Path(__file__).parent.parent))
17
+
18
+ from natural_pdf import PDF
19
+ from natural_pdf.ocr import EasyOCREngine, PaddleOCREngine
20
+
21
+ # Get the current directory of this script
22
+ script_dir = os.path.dirname(os.path.realpath(__file__))
23
+ # Get the parent directory (project root)
24
+ root_dir = os.path.dirname(script_dir)
25
+ # Default PDF path (replace with a scanned document path for better results)
26
+ default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
27
+ # Output directory
28
+ output_dir = os.path.join(root_dir, "output")
29
+ os.makedirs(output_dir, exist_ok=True)
30
+
31
+ print("OCR Engine Comparison")
32
+ print("====================")
33
+
34
+ # Check if both OCR engines are available
35
+ easyocr_available = False
36
+ paddleocr_available = False
37
+
38
+ try:
39
+ import easyocr
40
+ easyocr_available = True
41
+ print("EasyOCR is available.")
42
+ except ImportError:
43
+ print("EasyOCR is not available. Some comparisons will be skipped.")
44
+
45
+ try:
46
+ import paddleocr
47
+ import paddle
48
+ paddleocr_available = True
49
+ print("PaddleOCR is available.")
50
+ except ImportError:
51
+ print("PaddleOCR is not available. Some comparisons will be skipped.")
52
+
53
+ if not easyocr_available and not paddleocr_available:
54
+ print("No OCR engines available. Please install at least one OCR engine.")
55
+ sys.exit(1)
56
+
57
+ # Common OCR configuration for fair comparison
58
+ ocr_config = {
59
+ "languages": ["en"],
60
+ "device": "cpu",
61
+ "min_confidence": 0.3
62
+ }
63
+
64
+ # Set up testing information
65
+ engines = []
66
+ if easyocr_available:
67
+ engines.append(("EasyOCR", "easyocr"))
68
+ if paddleocr_available:
69
+ engines.append(("PaddleOCR", "paddleocr"))
70
+
71
+ # Function to run OCR with an engine and measure performance
72
def test_engine(engine_name, engine_id, page_number=0):
    """Run OCR on one page with the given engine and collect basic metrics.

    Reads the module-level ``default_pdf``, ``ocr_config`` and ``output_dir``.
    The PDF is opened in a ``with`` block so it is closed on both the success
    and the failure path.

    Args:
        engine_name: Display name, used in messages and the output file name.
        engine_id: Value passed to ``PDF`` as ``ocr_engine``.
        page_number: Index into ``pdf.pages`` of the page to process.

    Returns:
        A dict with keys ``engine``, ``extraction_time``, ``text_length``,
        ``element_count``, ``avg_confidence`` and ``output_path``. If any
        exception is raised, the numeric fields are 0, ``output_path`` is
        ``"error"`` and an extra ``error`` key holds the exception text.
    """
    print(f"\nTesting {engine_name}:")

    try:
        # Timing covers loading, OCR and text extraction (not image rendering).
        start_time = time.time()

        print(f" Loading PDF with {engine_name} engine...")
        with PDF(default_pdf, ocr_engine=engine_id, ocr=ocr_config) as pdf:
            print(f" Accessing page {page_number}...")
            page = pdf.pages[page_number]

            # Report which engine the PDF object actually ended up with.
            if hasattr(pdf, '_ocr_engine'):
                print(f" OCR engine: {pdf._ocr_engine.__class__.__name__}")
                print(f" OCR config: {pdf._ocr_config}")
            else:
                print(" Warning: PDF does not have _ocr_engine attribute")

            print(" Extracting OCR elements explicitly...")
            ocr_elements = page.extract_ocr_elements()
            print(f" Found {len(ocr_elements)} OCR elements")

            if len(ocr_elements) == 0:
                # Nothing came back: retry through extract_text with OCR forced.
                print(" Warning: No OCR elements found - trying to debug")
                print(" Trying page.extract_text(ocr=True)...")
                text = page.extract_text(ocr=True)
                print(f" Extract_text with ocr=True returned {len(text)} characters")
            else:
                print(" Extracting text...")
                text = page.extract_text()
                print(f" Extracted {len(text)} characters")

            extraction_time = time.time() - start_time

            # Guard the division so an empty result yields 0 rather than raising.
            avg_confidence = (
                sum(elem.confidence for elem in ocr_elements) / len(ocr_elements)
                if ocr_elements
                else 0
            )

            print(" Creating highlighted image...")
            page.clear_highlights()
            for elem in ocr_elements:
                if elem.confidence >= 0.7:
                    color = (0, 204, 0, 76)  # Green for high confidence
                elif elem.confidence >= 0.5:
                    color = (230, 230, 0, 76)  # Yellow for medium confidence
                else:
                    color = (204, 0, 0, 76)  # Red for low confidence

                elem.highlight(label=f"{engine_name}", color=color)

            output_path = os.path.join(output_dir, f"{engine_name.lower()}_results.png")
            page.to_image(path=output_path, show_labels=True)

            return {
                "engine": engine_name,
                "extraction_time": extraction_time,
                "text_length": len(text),
                "element_count": len(ocr_elements),
                "avg_confidence": avg_confidence,
                "output_path": output_path,
            }

    except Exception as e:
        # Best-effort comparison: report the failure and keep going with the
        # remaining engines instead of aborting the script.
        print(f" Error during {engine_name} test: {e}")
        import traceback
        traceback.print_exc()
        return {
            "engine": engine_name,
            "extraction_time": 0,
            "text_length": 0,
            "element_count": 0,
            "avg_confidence": 0,
            "output_path": "error",
            "error": str(e),
        }
156
+
157
+ # Run tests for each available engine
158
+ results = []
159
+ for engine_name, engine_id in engines:
160
+ result = test_engine(engine_name, engine_id)
161
+ results.append(result)
162
+
163
+ # Print some stats
164
+ print(f" Extraction time: {result['extraction_time']:.2f} seconds")
165
+ print(f" Text length: {result['text_length']} characters")
166
+ print(f" Element count: {result['element_count']} elements")
167
+ print(f" Average confidence: {result['avg_confidence']:.2f}")
168
+ print(f" Output image: {result['output_path']}")
169
+
170
+ # Compare results
171
+ if len(results) > 1:
172
+ print("\nComparison Results:")
173
+ print(f"{'Engine':<10} {'Time (s)':<10} {'Text Len':<10} {'Elements':<10} {'Avg Conf':<10}")
174
+ print(f"{'-'*60}")
175
+ for result in results:
176
+ print(f"{result['engine']:<10} {result['extraction_time']:.2f}s {result['text_length']:<10} {result['element_count']:<10} {result['avg_confidence']:.2f}")
177
+
178
+ # Highlight differences
179
+ fastest = min(results, key=lambda x: x['extraction_time'])
180
+ most_elements = max(results, key=lambda x: x['element_count'])
181
+ highest_confidence = max(results, key=lambda x: x['avg_confidence'])
182
+
183
+ print(f"\nFastest engine: {fastest['engine']} ({fastest['extraction_time']:.2f}s)")
184
+ print(f"Most elements: {most_elements['engine']} ({most_elements['element_count']} elements)")
185
+ print(f"Highest confidence: {highest_confidence['engine']} ({highest_confidence['avg_confidence']:.2f})")
186
+
187
+ # Additional comparison with engine-specific optimizations
188
+ print("\nRunning comparison with engine-specific optimizations:")
189
+
190
+ # Custom configurations for each engine
191
+ if easyocr_available and paddleocr_available:
192
+ # EasyOCR with customized settings
193
+ easyocr_custom = PDF(default_pdf,
194
+ ocr_engine="easyocr",
195
+ ocr={
196
+ "languages": ["en"],
197
+ "device": "cpu",
198
+ "min_confidence": 0.3,
199
+ "model_settings": {
200
+ "detail": 1,
201
+ "paragraph": False,
202
+ "contrast_ths": 0.05,
203
+ "text_threshold": 0.5
204
+ }
205
+ })
206
+
207
+ # PaddleOCR with customized settings
208
+ paddleocr_custom = PDF(default_pdf,
209
+ ocr_engine="paddleocr",
210
+ ocr={
211
+ "languages": ["en"],
212
+ "device": "cpu",
213
+ "min_confidence": 0.3,
214
+ "model_settings": {
215
+ "use_angle_cls": True,
216
+ "det_db_thresh": 0.2,
217
+ "det_db_box_thresh": 0.3
218
+ }
219
+ })
220
+
221
+ # Compare text extraction
222
+ easyocr_text = easyocr_custom.pages[0].extract_text()
223
+ paddleocr_text = paddleocr_custom.pages[0].extract_text()
224
+
225
+ print(f"\nOptimized EasyOCR text length: {len(easyocr_text)}")
226
+ print(f"Optimized PaddleOCR text length: {len(paddleocr_text)}")
227
+
228
+ # Compare element counts
229
+ easyocr_elements = easyocr_custom.pages[0].extract_ocr_elements()
230
+ paddleocr_elements = paddleocr_custom.pages[0].extract_ocr_elements()
231
+
232
+ print(f"Optimized EasyOCR element count: {len(easyocr_elements)}")
233
+ print(f"Optimized PaddleOCR element count: {len(paddleocr_elements)}")
234
+
235
+ print("\nDone!")
@@ -0,0 +1,89 @@
1
+ """
2
+ OCR example using PaddleOCR.
3
+
4
+ This example demonstrates how to use OCR to extract text from PDF documents,
5
+ both for whole pages and specific regions.
6
+
7
+ Note: This example requires the 'paddleocr' package:
8
+ pip install paddlepaddle paddleocr
9
+ """
10
+ import os
11
+ import sys
12
+ from natural_pdf import PDF
13
+
14
def _preview(text, limit):
    """Return *text* cut to *limit* characters plus "..." when it is longer."""
    return text[:limit] + "..." if len(text) > limit else text


# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path (replace with a scanned document path for better results)
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
# Output directory
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print("OCR Example")
print("==========")

# 1. Loading a PDF with OCR enabled
print("\n1. Loading PDF with OCR enabled")
ocr_settings = {
    "enabled": "auto",  # Auto mode: only use OCR when necessary
    "languages": ["en"],
    # For more options, see OCR-NOTES.md
}

# The context manager closes the PDF once every step below has run.
with PDF(default_pdf, ocr=ocr_settings) as pdf:
    # 2. Extract text from a page with auto OCR
    page = pdf.pages[0]
    print(f"\n2. Extracting text from page {page.number} with auto OCR")
    text = page.extract_text()
    print(f"Extracted {len(text)} characters.")
    print("First 150 characters:\n", _preview(text, 150))

    # 3. Force OCR on a page
    print("\n3. Force OCR on a page")
    ocr_text = page.extract_text(ocr=True)  # Force OCR regardless of existing text
    print(f"Extracted {len(ocr_text)} characters with forced OCR.")
    print("First 150 characters:\n", _preview(ocr_text, 150))

    # 4. Extract OCR elements directly
    print("\n4. Extracting OCR elements directly")
    ocr_elements = page.extract_ocr_elements()
    print(f"Found {len(ocr_elements)} OCR text elements.")
    for position, elem in enumerate(ocr_elements[:3], start=1):  # Show first 3 elements
        print(f" Element {position}: '{elem.text}' (confidence: {elem.confidence:.2f})")

    # 5. Apply OCR to a specific region
    print("\n5. Applying OCR to a specific region")
    # Create a region (adjust coordinates for your PDF)
    region = page.create_region(100, 100, 400, 200)  # x0, y0, x1, y1
    region.highlight(label="OCR Region")

    # Apply OCR to this region
    region_elements = region.apply_ocr()
    print(f"Found {len(region_elements)} OCR text elements in the region.")

    # Extract text from the region after OCR has been applied to it
    region_text = region.extract_text()
    print(f"Region text: '{_preview(region_text, 50)}'")

    # 6. Finding OCR text elements with selectors
    print("\n6. Finding OCR text elements with selectors")
    # Find OCR elements with specific properties
    high_confidence_ocr = page.find_all('text[source=ocr][confidence>=0.8]')
    print(f"Found {len(high_confidence_ocr)} high-confidence OCR elements.")

    # Find OCR elements containing specific text
    matching_ocr = page.find_all('text[source=ocr]:contains("the")')
    print(f"Found {len(matching_ocr)} OCR elements containing 'the'.")

    # 7. Visualize OCR results
    print("\n7. Visualizing OCR results")
    # Highlight all OCR elements
    for elem in ocr_elements:
        elem.highlight(label=f"OCR ({elem.confidence:.2f})")

    # Save the highlighted page
    output_path = os.path.join(output_dir, "ocr_results.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Saved visualization to {output_path}")

print("\nDone!")
@@ -0,0 +1,79 @@
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add parent directory to path for imports
6
+ sys.path.insert(0, str(Path(__file__).parent.parent))
7
+
8
+ from natural_pdf import PDF
9
+
10
# Get absolute path for the PDF
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
pdf_path = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")

# Example 1: Initialize PDF with flattened OCR parameters
pdf_ocr_settings = {
    "enabled": True,
    "languages": ["en"],
    "min_confidence": 0.3,
    # OCR parameters directly in config root:
    "text_threshold": 0.1,  # Was previously in detection_params
    "link_threshold": 0.1,  # Was previously in detection_params
    "paragraph": True,      # Was previously in recognition_params
    "detail": 1,            # Was previously in recognition_params
}

# The context manager closes the PDF once the image has been saved.
with PDF(pdf_path, ocr=pdf_ocr_settings) as pdf:
    # Use a specific page (index 3; raises if the document is shorter)
    page = pdf.pages[3]

    # Example 2: Apply OCR with flattened parameters
    print("\nApplying OCR with flattened parameters")
    ocr_elements = page.apply_ocr(
        # Direct parameters:
        text_threshold=0.15,
        link_threshold=0.15,
        mag_ratio=1.5,
        canvas_size=1024,
        batch_size=4,
    )

    print(f"Found {len(ocr_elements)} OCR text elements")

    # Print sample of OCR results; the slice already caps this at five items.
    print("\nSample OCR results:")
    for position, elem in enumerate(ocr_elements[:5], start=1):
        print(f"{position}. '{elem.text}' (conf: {elem.confidence:.2f})")

    # Example 3: Extract text with OCR using flattened parameters
    print("\nExtracting text with OCR using flattened parameters")
    text = page.extract_text(ocr={
        "enabled": True,
        "min_confidence": 0.2,
        # Direct parameters:
        "text_threshold": 0.2,
        "contrast_ths": 0.05,
    })

    # Display first 100 characters of text
    print("\nExtracted text (first 100 chars):")
    print(text[:100] + "...")

    # Create output directory if it doesn't exist
    output_dir = os.path.join(root_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Highlight OCR elements
    for elem in ocr_elements[:10]:
        elem.highlight(label=f"OCR: {elem.text}")

    # Save image
    output_path = os.path.join(output_dir, "ocr_simplified.png")
    print(f"\nSaving highlighted image to: {output_path}")
    page.to_image(path=output_path, show_labels=True)

print("\nTest completed successfully!")
@@ -0,0 +1,102 @@
1
+ """
2
+ OCR Visualization Example
3
+
4
+ This example demonstrates the new OCR visualization feature that renders
5
+ OCR text with white background boxes on the image.
6
+ """
7
+ import os
8
+ import sys
9
+
10
+ # Add project directory to the path to import the library
11
+ script_dir = os.path.dirname(os.path.realpath(__file__))
12
+ root_dir = os.path.dirname(script_dir)
13
+ sys.path.insert(0, root_dir)
14
+
15
+ from natural_pdf import PDF
16
+
17
# Pick the input PDF: the scanned sample when present, otherwise the fallback.
_pdf_dir = os.path.join(root_dir, "pdfs")
_scanned_pdf = os.path.join(_pdf_dir, "needs-ocr.pdf")
default_pdf = (
    _scanned_pdf
    if os.path.exists(_scanned_pdf)
    else os.path.join(_pdf_dir, "01-practice.pdf")
)

# Rendered images are written here; create the directory up front.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)
25
+
26
def _confidence_color(confidence):
    """Map an OCR confidence score to the RGBA highlight colour used below."""
    if confidence >= 0.8:
        return (0, 1, 0, 0.3)  # Green for high confidence
    if confidence >= 0.5:
        return (1, 1, 0, 0.3)  # Yellow for medium confidence
    return (1, 0, 0, 0.3)  # Red for low confidence


def main():
    """Run OCR on the first page of ``default_pdf`` and save three renderings.

    Writes into ``output_dir``: a highlighted image without OCR text, a
    highlighted image with OCR text rendered, and (when OCR elements exist) an
    image with only the rendered OCR text. The PDF is opened in a ``with``
    block so it is closed when the function finishes or fails.
    """
    print("OCR Visualization Example")
    print("=========================")

    # 1. Load a PDF with OCR enabled
    print("\n1. Loading PDF with OCR enabled")
    ocr_settings = {
        "enabled": True,
        "languages": ["en"],
        "min_confidence": 0.3,  # Lower confidence to get more results
    }

    with PDF(default_pdf, ocr=ocr_settings) as pdf:
        # 2. First check if we have OCR text by extracting text with OCR
        print("\n2. Extracting text with OCR")
        page = pdf.pages[0]
        text = page.extract_text(ocr=True)  # Force OCR
        print(f"Extracted {len(text)} characters with OCR")

        # 3. Find OCR text elements
        print("\n3. Finding OCR text elements")
        ocr_elements = page.find_all('text[source=ocr]')
        print(f"Found {len(ocr_elements)} OCR text elements on the page")

        # If the selector found nothing, fall back to running OCR directly
        if not ocr_elements:
            print("No OCR elements found. Running OCR directly...")
            ocr_elements = page.extract_ocr_elements()
            print(f"Found {len(ocr_elements)} OCR text elements from direct extraction")

        # 4. Highlight the OCR elements, coloured by confidence
        print(f"\n4. Highlighting {len(ocr_elements)} OCR elements")
        for element in ocr_elements:
            # Elements without a confidence attribute are treated as 0.5
            confidence = getattr(element, 'confidence', 0.5)
            element.highlight(
                color=_confidence_color(confidence),
                label=f"OCR ({confidence:.2f})",
            )

        # 5. Visualize the regular highlights (no OCR text)
        print("\n5. Saving highlighted image without OCR text")
        highlighted_path = os.path.join(output_dir, "ocr_highlighted.png")
        page.to_image(path=highlighted_path, show_labels=True, render_ocr=False)
        print(f"Saved highlighted image to {highlighted_path}")

        # 6. Visualize with OCR text rendered
        print("\n6. Saving image with rendered OCR text")
        ocr_text_path = os.path.join(output_dir, "ocr_rendered_text.png")
        try:
            page.to_image(path=ocr_text_path, show_labels=True, render_ocr=True)
            print(f"Saved OCR text rendering to {ocr_text_path}")
        except ValueError as e:
            print(f"Error rendering OCR text: {e}")

        # 7. Render just the OCR text (no highlights)
        if ocr_elements:
            print("\n7. Creating clean white page with just OCR text")
            # Clear previous highlights
            page.clear_highlights()
            # Save with OCR text rendering only
            clean_text_path = os.path.join(output_dir, "ocr_clean_text.png")
            try:
                page.to_image(path=clean_text_path, render_ocr=True)
                print(f"Saved clean OCR text rendering to {clean_text_path}")
            except ValueError as e:
                print(f"Error rendering clean OCR text: {e}")

    print("\nDone!")


if __name__ == "__main__":
    main()