natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,258 @@
1
+ """
2
+ Debug script to investigate coordinate differences between YOLO and TATR models.
3
+
4
+ This script visualizes the regions detected by both models and logs all coordinate
5
+ transformations to help diagnose the issue with YOLO's regions being too narrow.
6
+ """
7
+ import os
8
+ import sys
9
+ import logging
10
+ import numpy as np
11
+ try:
12
+ from PIL import Image, ImageDraw, ImageFont
13
+ except ImportError:
14
+ from PIL import Image, ImageDraw
15
+ ImageFont = None
16
+ import argparse
17
+
18
+ # Add project root to path
19
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ from natural_pdf import PDF, configure_logging
22
+ from natural_pdf.elements.collections import ElementCollection
23
+
24
# Turn on DEBUG output for the library as a whole.
configure_logging(level=logging.DEBUG)

# Build the file handler first: every record it receives is written to
# layout_debug.log with a timestamp, logger name and level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler = logging.FileHandler("layout_debug.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

# Attach the handler to the layout logger and make sure that logger
# itself lets DEBUG records through.
layout_logger = logging.getLogger("natural_pdf.analyzers.layout")
layout_logger.setLevel(logging.DEBUG)
layout_logger.addHandler(file_handler)
35
+
36
def _log_detections(detections, scale_factor):
    """Print each raw detection next to its coordinates scaled by *scale_factor*."""
    print(f"Converting {len(detections)} detections with scale factor {scale_factor}")

    for i, det in enumerate(detections):
        bbox = det['bbox']
        x_min, y_min, x_max, y_max = bbox
        width = x_max - x_min
        height = y_max - y_min

        pdf_x0 = x_min * scale_factor
        pdf_y0 = y_min * scale_factor
        pdf_x1 = x_max * scale_factor
        pdf_y1 = y_max * scale_factor

        print(f"Detection #{i+1} ({det['class']}):")
        print(f" Raw bbox: {bbox}")
        print(f" Image dimensions: {width:.2f} x {height:.2f}")
        print(f" PDF coords: ({pdf_x0:.2f}, {pdf_y0:.2f}, {pdf_x1:.2f}, {pdf_y1:.2f})")
        print(f" PDF dimensions: {pdf_x1-pdf_x0:.2f} x {pdf_y1-pdf_y0:.2f}")
        print(f" Image-to-PDF ratio: width={scale_factor:.4f}, height={scale_factor:.4f}")


def _report_regions(page, regions, page_width, output_dir, prefix):
    """Highlight *regions*, then save ``<prefix>_regions.png`` and ``<prefix>_regions.txt``."""
    page.clear_highlights()
    for region in regions:
        region.highlight(
            label=f"{region.region_type} ({region.width:.1f}x{region.height:.1f})"
        )

    page.to_image(labels=True).save(os.path.join(output_dir, f"{prefix}_regions.png"))

    with open(os.path.join(output_dir, f"{prefix}_regions.txt"), "w") as f:
        for i, region in enumerate(regions):
            f.write(f"Region #{i+1} ({region.region_type}):\n")
            f.write(f" Bbox: ({region.x0:.2f}, {region.top:.2f}, {region.x1:.2f}, {region.bottom:.2f})\n")
            f.write(f" Dimensions: {region.width:.2f} x {region.height:.2f}\n")
            f.write(f" Confidence: {region.confidence:.4f}\n")
            f.write(f" % of page width: {(region.width / page_width) * 100:.2f}%\n\n")


def _save_comparison_image(page, output_dir):
    """Compose the saved YOLO/TATR images side by side; return False if they can't be loaded."""
    page_image = page.to_image(resolution=150)

    # Two page-sized panels with a 20px gutter and a 50px title strip.
    combined_width = page_image.width * 2 + 20
    combined_height = page_image.height + 50
    combined = Image.new('RGB', (combined_width, combined_height), (255, 255, 255))

    try:
        font = ImageFont.truetype("Arial", 16)
    except (OSError, AttributeError, ImportError):
        # Font file missing, ImageFont unavailable (None), or FreeType
        # support absent: fall back to the drawing default.
        font = None

    draw = ImageDraw.Draw(combined)
    draw.text((page_image.width // 2, 20), "YOLO Model", fill=(0, 0, 0), font=font)
    draw.text((page_image.width * 3 // 2 + 20, 20), "TATR Model", fill=(0, 0, 0), font=font)

    try:
        yolo_img = Image.open(os.path.join(output_dir, "yolo_regions.png"))
        tatr_img = Image.open(os.path.join(output_dir, "tatr_regions.png"))
    except OSError:
        print("Warning: Could not load images for side-by-side comparison. Continuing...")
        return False

    target_size = (page_image.width, page_image.height)
    if yolo_img.height != page_image.height or yolo_img.width != page_image.width:
        yolo_img = yolo_img.resize(target_size)
    if tatr_img.height != page_image.height or tatr_img.width != page_image.width:
        tatr_img = tatr_img.resize(target_size)

    combined.paste(yolo_img, (0, 50))
    combined.paste(tatr_img, (page_image.width + 20, 50))
    combined.save(os.path.join(output_dir, "model_comparison.png"))
    return True


def _write_width_stats(report, model_label, regions, page_width):
    """Write width statistics for one model to *report*; return the list of widths."""
    report.write(f"=== {model_label} Model Regions ===\n")
    if not regions:
        report.write("No regions detected\n\n")
        return []

    widths = [r.width for r in regions]
    width_pcts = [(r.width / page_width) * 100 for r in regions]

    report.write(f"Total regions: {len(regions)}\n")
    report.write(f"Average width: {sum(widths) / len(widths):.2f}\n")
    report.write(f"Average width as % of page: {sum(width_pcts) / len(width_pcts):.2f}%\n")
    report.write(f"Min width: {min(widths):.2f} ({min(width_pcts):.2f}%)\n")
    report.write(f"Max width: {max(widths):.2f} ({max(width_pcts):.2f}%)\n\n")
    return widths


def _write_dimension_analysis(output_dir, page_width, page_height, yolo_regions, tatr_regions):
    """Write ``dimension_analysis.txt`` comparing region widths of both models to the page."""
    with open(os.path.join(output_dir, "dimension_analysis.txt"), "w") as f:
        f.write(f"Page dimensions: {page_width:.2f} x {page_height:.2f}\n\n")

        yolo_widths = _write_width_stats(f, "YOLO", yolo_regions, page_width)
        tatr_widths = _write_width_stats(f, "TATR", tatr_regions, page_width)

        if yolo_regions and tatr_regions:
            avg_yolo_width = sum(yolo_widths) / len(yolo_widths)
            avg_tatr_width = sum(tatr_widths) / len(tatr_widths)
            width_ratio = avg_tatr_width / avg_yolo_width if avg_yolo_width > 0 else 0

            f.write("=== Comparison ===\n")
            f.write(f"TATR avg width / YOLO avg width = {width_ratio:.4f}\n")
            if width_ratio > 0:
                f.write(f"YOLO is {100 * (1 - 1/width_ratio):.2f}% narrower than TATR on average\n")
            else:
                # A zero ratio would make 1/width_ratio divide by zero.
                f.write("Relative narrowing is undefined (zero average width)\n")


def debug_detection_coordinates(pdf_path, page_num=0, output_dir="output"):
    """Debug layout detection coordinates for both YOLO and TATR models.

    Runs ``analyze_layout`` once per model on a single page with extra
    console logging of every detection, then writes into *output_dir*:
    per-model highlight images and region summaries, a side-by-side
    comparison image, and ``dimension_analysis.txt``.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Index into ``pdf.pages`` of the page to analyze.
        output_dir: Directory for all generated files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    pdf = PDF(pdf_path)
    page = pdf.pages[page_num]

    # Page dimensions serve as the reference for all width percentages.
    page_width = page.width
    page_height = page.height
    print(f"Page dimensions: {page_width} x {page_height}")

    page_cls = type(page)
    original_analyze_layout = page_cls.analyze_layout

    def debug_analyze_layout(self, *args, **kwargs):
        """Wrapped analyze_layout method with debug logging."""
        model_type = kwargs.get('model', 'yolo')
        print(f"\n=== Running layout analysis with {model_type.upper()} model ===")

        # Temporarily swap in a logging wrapper around convert_to_regions.
        layout_module = None
        old_convert_to_regions = None
        try:
            from natural_pdf.analyzers import document_layout as layout_module
            from natural_pdf.analyzers.document_layout import (
                convert_to_regions as old_convert_to_regions,
            )
        except ImportError:
            layout_module = None
            old_convert_to_regions = None
            print("Could not monkey patch convert_to_regions")

        if old_convert_to_regions is not None:
            def debug_convert_to_regions(page, detections, scale_factor=1.0):
                """Monkey patched version with detailed logging."""
                _log_detections(detections, scale_factor)
                return old_convert_to_regions(page, detections, scale_factor)

            layout_module.convert_to_regions = debug_convert_to_regions

        try:
            return original_analyze_layout(self, *args, **kwargs)
        finally:
            # Undo the patch even when the analysis raises.
            if old_convert_to_regions is not None:
                layout_module.convert_to_regions = old_convert_to_regions

    page_cls.analyze_layout = debug_analyze_layout
    try:
        # YOLO pass
        page.analyze_layout(model="yolo")
        yolo_regions = page.find_all('region[model=yolo]')
        print(f"YOLO detected {len(yolo_regions)} regions")
        _report_regions(page, yolo_regions, page_width, output_dir, "yolo")

        # Clear existing layout regions before the second model runs.
        page._regions['detected'] = []

        # TATR pass
        page.analyze_layout(model="tatr")
        tatr_regions = page.find_all('region[model=tatr]')
        print(f"TATR detected {len(tatr_regions)} regions")
        _report_regions(page, tatr_regions, page_width, output_dir, "tatr")
    finally:
        # The patch is installed on the class, so put the original back
        # rather than leaving the debug wrapper in place.
        page_cls.analyze_layout = original_analyze_layout

    # A failed comparison image must not prevent the text analysis below.
    _save_comparison_image(page, output_dir)
    _write_dimension_analysis(output_dir, page_width, page_height, yolo_regions, tatr_regions)

    print(f"\nDebug information saved to {output_dir} directory")
    print("See 'layout_debug.log' for detailed process logging")
249
+
250
if __name__ == "__main__":
    # Command-line entry point: one required PDF path plus optional
    # page index and output directory.
    cli = argparse.ArgumentParser(
        description="Debug layout detection coordinate differences"
    )
    cli.add_argument("pdf_path", help="Path to the PDF file to analyze")
    cli.add_argument(
        "--page",
        type=int,
        default=0,
        help="Page number to analyze (0-indexed)",
    )
    cli.add_argument(
        "--output",
        default="output",
        help="Output directory for debug files",
    )
    options = cli.parse_args()

    debug_detection_coordinates(
        options.pdf_path,
        page_num=options.page,
        output_dir=options.output,
    )
@@ -0,0 +1,77 @@
1
+ """
2
+ Test the improved highlight_all behavior with layout regions.
3
+
4
+ This example demonstrates how the updated highlight_all method properly
5
+ highlights layout regions by model and type.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+
11
+ # Add the parent directory to the Python path
12
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from natural_pdf import PDF
14
+
15
# Resolve paths relative to this script so it works from any working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
# Parent directory of the script (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")
# All images produced below are written here
output_dir = os.path.join(root_dir, "output")

# Set up argument parser
parser = argparse.ArgumentParser(description="Layout highlighting test")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

# Create the output directory up front; nothing else in this script does,
# and every step below saves an image into it.
os.makedirs(output_dir, exist_ok=True)

print(f"Testing layout highlighting on: {args.pdf_path}")
print(f"Page: {args.page}")

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]


def _save_page_image(filename):
    """Render the page with labels to *filename* inside the output directory."""
    path = os.path.join(output_dir, filename)
    page.to_image(path=path, show_labels=True)
    print(f"Saved to {path}")
    return path


# First, the regular highlight_all without layout regions
print("Creating image with standard highlight_all (no layout regions)...")
page.clear_highlights()
page.highlight_all()
output_path = _save_page_image("standard_highlight_all.png")

# Now run layout analysis with YOLO
print("\nRunning YOLO layout analysis...")
page.analyze_layout(model="yolo", confidence=0.2)
print(f"Found {len(page.detected_layout_regions)} YOLO layout regions")

# highlight_all including the YOLO layout regions
print("Creating image with highlight_all including YOLO layout regions...")
page.clear_highlights()
page.highlight_all(include_layout_regions=True)
output_path = _save_page_image("highlight_all_with_yolo.png")

# Run table structure analysis with TATR, appending to the existing regions
print("\nRunning TATR table structure analysis...")
page.analyze_layout(model="tatr", confidence=0.3, existing="append")
print(f"Found {len(page.detected_layout_regions)} total layout regions (YOLO + TATR)")

# highlight_all including all layout regions
print("Creating image with highlight_all including all layout regions...")
page.clear_highlights()
page.highlight_all(include_layout_regions=True)
output_path = _save_page_image("highlight_all_with_all_layouts.png")

# Compare with the original highlight_layout method
print("\nCreating image with highlight_layout method for comparison...")
page.clear_highlights()
page.highlight_layout()
output_path = _save_page_image("highlight_layout_method.png")

print("\nDone!")
@@ -0,0 +1,70 @@
1
+ """
2
+ Example demonstrating the logging system in Natural PDF.
3
+ """
4
+ import os
5
+ import sys
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ # Add the project directory to the path to import the library
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from natural_pdf import configure_logging, PDF
12
+
13
def main():
    """Walk through the logging configurations offered by Natural PDF.

    Runs OCR at INFO level, layout detection at DEBUG level, and finally
    routes log records to ``output/natural_pdf.log`` through a custom
    file handler. Exits with status 1 when no PDF can be found.
    """
    # Basic setup with INFO level
    print("=== Configuring logging at INFO level ===")
    configure_logging(level=logging.INFO)

    # Pick the PDF to work on
    pdf_path = "./pdfs/01-practice.pdf"
    if not os.path.exists(pdf_path):
        # Fall back to the first PDF the glob yields; stop with a clear
        # message instead of an IndexError when there is none.
        fallback = next(Path("./pdfs").glob("*.pdf"), None)
        if fallback is None:
            print("No PDF files found in ./pdfs. Please add a PDF and try again.")
            sys.exit(1)
        pdf_path = fallback

    print(f"\nLoading PDF with standard logging: {pdf_path}")
    pdf = PDF(pdf_path)

    # OCR with default settings (should log OCR engine initialization)
    print("\nExtracting text with OCR")
    text = pdf.pages[0].extract_text(ocr=True)
    print(f"Extracted {len(text)} characters")

    # Switch to DEBUG level
    print("\n=== Configuring logging at DEBUG level ===")
    configure_logging(level=logging.DEBUG)

    # Try layout detection (generates more detailed logs)
    print("\nRunning layout detection with DEBUG logging")
    regions = pdf.pages[0].analyze_layout(
        model="paddle",
        model_params={"detect_text": True, "verbose": True},
    )
    print(f"Found {len(regions)} regions")

    # Try logging to a file
    print("\n=== Logging to a file ===")
    log_file = os.path.join("output", "natural_pdf.log")
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    # File handler with a custom formatter; 'w' mode overwrites any existing file
    file_handler = logging.FileHandler(log_file, mode='w')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Configure logging with the file handler
    configure_logging(level=logging.DEBUG, handler=file_handler)

    # Force a few log events
    logger = logging.getLogger("natural_pdf")
    logger.debug("This is a debug message written to the log file")
    logger.info("This is an info message written to the log file")
    logger.warning("This is a warning message written to the log file")

    # Run another OCR operation purely for the log output it produces
    print(f"Running OCR with logging to {log_file}")
    pdf.pages[0].extract_text(ocr=True)

    print(f"\nDone! Check {log_file} for detailed logs.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,193 @@
1
+ """
2
+ Comprehensive OCR example.
3
+
4
+ This example demonstrates the full range of OCR capabilities
5
+ in natural-pdf, including:
6
+ 1. Multiple configuration formats
7
+ 2. Auto mode
8
+ 3. Region-specific OCR
9
+ 4. Selector filtering by source and confidence
10
+ 5. Visualization of OCR results
11
+ """
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+
16
+ # Add parent directory to path for imports
17
+ sys.path.insert(0, str(Path(__file__).parent.parent))
18
+
19
+ from natural_pdf import PDF
20
+
21
# Locate the project root relative to this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path (replace with a scanned document path for better results)
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
# Every image written below goes into this directory.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print("Comprehensive OCR Example")
print("========================")

# ---------------------------------------------------------------
# 1. The different shapes the `ocr` argument can take
# ---------------------------------------------------------------
print("\n1. Different OCR configuration formats")

# Plain boolean
pdf_simple = PDF(default_pdf, ocr=True)
print(" - Simple flag (ocr=True): OCR enabled with defaults")

# The string "auto"
pdf_auto = PDF(default_pdf, ocr="auto")
print(" - Auto mode (ocr='auto'): OCR applied only when needed")

# A list of language codes
pdf_langs = PDF(default_pdf, ocr=["en"])
print(" - Language list (ocr=['en']): English language OCR")

# A full dictionary, assembled piece by piece.
# Text detection parameters for CRAFT:
detection_params = {
    "text_threshold": 0.1,  # Lower threshold to detect more text (default is 0.7)
    "low_text": 0.3,        # Lower threshold for text box filtering (default is 0.4)
    "link_threshold": 0.3,  # Lower threshold for link between text (default is 0.4)
    "canvas_size": 2560,    # Maximum image size
    "mag_ratio": 1.5,       # Image magnification ratio (increase for better detection)
}
# Optional recognition parameters:
recognition_params = {
    "decoder": "greedy",
    "batch_size": 4,
    "contrast_ths": 0.05,   # Lower contrast threshold
}
detailed_config = {
    "enabled": True,
    "engine": "easyocr",
    "languages": ["en"],
    "min_confidence": 0.6,
    "paragraph": False,
    "detection_params": detection_params,
    "recognition_params": recognition_params,
}
pdf_detailed = PDF(default_pdf, ocr=detailed_config)
print(" - Detailed config: Custom parameters including text_threshold=0.1")

# ---------------------------------------------------------------
# 2. Auto mode in action
# ---------------------------------------------------------------
print("\n2. Auto mode OCR")
pdf = PDF(default_pdf, ocr="auto")
page = pdf.pages[0]

text = page.extract_text()
print(f" - Auto mode extracted {len(text)} characters")
print(f" - First 100 chars: {text[:100]}...")

# ---------------------------------------------------------------
# 3. Applying OCR explicitly
# ---------------------------------------------------------------
print("\n3. Explicit OCR application")
ocr_elements = page.apply_ocr()
print(f" - Found {len(ocr_elements)} OCR elements")
print(" - Sample OCR elements:")
for rank, elem in enumerate(ocr_elements[:3], start=1):
    print(f" {rank}. '{elem.text}' (confidence: {elem.confidence:.2f})")

# ---------------------------------------------------------------
# 4. Selecting OCR text by confidence
# ---------------------------------------------------------------
print("\n4. OCR confidence filtering")
high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
print(f" - Found {len(high_conf)} high-confidence OCR elements")
low_conf = page.find_all('text[source=ocr][confidence<0.6]')
print(f" - Found {len(low_conf)} low-confidence OCR elements")

# ---------------------------------------------------------------
# 5. Selecting OCR text by content
# ---------------------------------------------------------------
print("\n5. Content-based OCR filtering")
contains_a = page.find_all('text[source=ocr]:contains("a")')
print(f" - Found {len(contains_a)} OCR elements containing 'a'")

# ---------------------------------------------------------------
# 6. OCR restricted to a region
# ---------------------------------------------------------------
print("\n6. Region-specific OCR")
# Adjust these coordinates if needed for your document
region = page.create_region(100, 100, 400, 200)
region_elems = region.apply_ocr()
print(f" - Applied OCR to region, found {len(region_elems)} elements")
region_text = region.extract_text()
print(f" - Region text: '{region_text[:50]}...'")

# ---------------------------------------------------------------
# 7. Visualising OCR confidence
# ---------------------------------------------------------------
print("\n7. OCR visualization")
page.clear_highlights()


def _confidence_color(confidence):
    """Map a confidence score to an RGBA tuple with integer 0-255 channels."""
    if confidence >= 0.8:
        return (0, 204, 0, 76)      # Green for high confidence
    if confidence >= 0.6:
        return (230, 230, 0, 76)    # Yellow for medium confidence
    return (204, 0, 0, 76)          # Red for low confidence


print(" - Highlighting all OCR elements with confidence scores")
for elem in ocr_elements:
    color = _confidence_color(elem.confidence)
    elem.highlight(label=f"OCR ({elem.confidence:.2f})", color=color)

output_path = os.path.join(output_dir, "ocr_confidence_visualization.png")
page.to_image(path=output_path, show_labels=True)
print(f" - Saved visualization to {output_path}")

# ---------------------------------------------------------------
# 8. Overriding OCR settings for a single extraction
# ---------------------------------------------------------------
print("\n8. OCR override at extraction time")
text_default = page.extract_text()
print(f" - Default extraction: {len(text_default)} characters")

override_config = {
    "languages": ["en"],
    "min_confidence": 0.4,  # Lower threshold
}
text_override = page.extract_text(ocr=override_config)
print(f" - Override extraction (min_confidence=0.4): {len(text_override)} characters")

# ---------------------------------------------------------------
# 9. Attributes exposed by an OCR element
# ---------------------------------------------------------------
print("\n9. OCR element properties")
if ocr_elements:
    elem = ocr_elements[0]
    print(f" - Source: {elem.source}")
    print(f" - Confidence: {elem.confidence:.2f}")
    print(f" - Text: '{elem.text}'")
    print(f" - Bounding box: {elem.bbox}")
    print(f" - Font (default for OCR): {elem.fontname}")

# ---------------------------------------------------------------
# 10. Effect of text_threshold on detection
# ---------------------------------------------------------------
print("\n10. Comparing OCR with different text_threshold values")
page.clear_highlights()

# First pass: text_threshold at its default value (0.7)
print(" - Running OCR with default text_threshold (0.7)")
default_threshold_config = {
    "languages": ["en"],
    "detection_params": {"text_threshold": 0.7},  # Default value
}
default_elements = page.extract_ocr_elements(ocr=default_threshold_config)
print(f" - Found {len(default_elements)} elements with default text_threshold")

# Blue highlights for the default-threshold pass
for elem in default_elements:
    elem.highlight(label="Default threshold", color=(0, 0, 204, 76))

# Second pass: lower text_threshold (0.1)
print(" - Running OCR with lower text_threshold (0.1)")
low_threshold_config = {
    "languages": ["en"],
    "detection_params": {"text_threshold": 0.1},  # Lower value to detect more text
}
low_threshold_elements = page.extract_ocr_elements(ocr=low_threshold_config)
print(f" - Found {len(low_threshold_elements)} elements with text_threshold=0.1")

# Red highlights for the low-threshold pass
for elem in low_threshold_elements:
    elem.highlight(label="Lower threshold (0.1)", color=(204, 0, 0, 76))

# Both passes end up in one image for comparison
output_path = os.path.join(output_dir, "ocr_threshold_comparison.png")
page.to_image(path=output_path, show_labels=True)
print(f" - Saved threshold comparison to {output_path}")

print("\nDone!")
@@ -0,0 +1,87 @@
1
+ """
2
+ OCR Debug Report Example
3
+
4
+ This example demonstrates the OCR debugging feature, which generates an interactive
5
+ HTML report for analyzing and correcting OCR results.
6
+ """
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ # Get the parent directory (project root)
12
+ project_root = Path(__file__).absolute().parent.parent
13
+ sys.path.insert(0, str(project_root))
14
+
15
+ from natural_pdf import PDF
16
+
17
# All reports are written under <project root>/output.
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

# Preferred PDF (a scanned document) and the one to try if it is missing.
pdf_path = os.path.join(project_root, "pdfs", "HARRY ROQUE_redacted.pdf")
fallback_pdf = os.path.join(project_root, "pdfs", "needs-ocr.pdf")

# Take the first candidate that exists on disk; give up if neither does.
for candidate in (pdf_path, fallback_pdf):
    if os.path.exists(candidate):
        pdf_path = candidate
        print(f"Using PDF: {pdf_path}")
        break
else:
    print("No suitable PDF found. Please provide a scanned PDF.")
    sys.exit(1)

print("\nOCR Debug Report Example")
print("=======================")

# Step 1: open the document with OCR switched on
print("\n1. Loading PDF with OCR enabled")
ocr_settings = {
    "enabled": True,        # Enable OCR
    "languages": ["en"],
    "min_confidence": 0.3,  # Lower threshold to get more results
}
# The PaddleOCR engine is requested explicitly here.
pdf = PDF(pdf_path, ocr=ocr_settings, ocr_engine="paddleocr")

# Step 2: OCR the first page
print("\n2. Running OCR on the first page")
page = pdf.pages[0]
ocr_elements = page.extract_ocr_elements()
print(f"Found {len(ocr_elements)} OCR text elements")

# Step 3: report for that single page
print("\n3. Generating OCR debug report for a single page")
page_report_path = os.path.join(output_dir, "ocr_debug_page.html")
page.debug_ocr(page_report_path)
print(f"Saved page debug report to: {page_report_path}")

# Step 4: report for the whole document
print("\n4. Generating OCR debug report for multiple pages")
pdf_report_path = os.path.join(output_dir, "ocr_debug_full.html")
pdf.debug_ocr(pdf_report_path)
print(f"Saved full PDF debug report to: {pdf_report_path}")

# Step 5: report for a slice of pages (only meaningful with more than one page)
print("\n5. Generating OCR debug report for a page range")
if len(pdf.pages) <= 1:
    print("PDF has only one page, skipping page range example")
else:
    page_range_report_path = os.path.join(output_dir, "ocr_debug_range.html")
    last_index = min(3, len(pdf.pages))
    page_range = pdf.pages[0:last_index]
    page_range.debug_ocr(page_range_report_path)
    print(f"Saved page range debug report to: {page_range_report_path}")

# Closing summary of what the generated reports offer
for closing_line in (
    "\nDone! The debug reports have been saved to the output directory.",
    "You can open them in a web browser to interactively review the OCR results.",
    "The reports allow you to:",
    "- Filter results by confidence score",
    "- Search for specific text",
    "- Sort results by different criteria",
    "- Edit/correct OCR text",
    "- Export corrected text as JSON for further processing",
):
    print(closing_line)