natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,258 @@
|
|
1
|
+
"""
|
2
|
+
Debug script to investigate coordinate differences between YOLO and TATR models.
|
3
|
+
|
4
|
+
This script visualizes the regions detected by both models and logs all coordinate
|
5
|
+
transformations to help diagnose the issue with YOLO's regions being too narrow.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import logging
|
10
|
+
import numpy as np
|
11
|
+
try:
|
12
|
+
from PIL import Image, ImageDraw, ImageFont
|
13
|
+
except ImportError:
|
14
|
+
from PIL import Image, ImageDraw
|
15
|
+
ImageFont = None
|
16
|
+
import argparse
|
17
|
+
|
18
|
+
# Add project root to path
|
19
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
20
|
+
|
21
|
+
from natural_pdf import PDF, configure_logging
|
22
|
+
from natural_pdf.elements.collections import ElementCollection
|
23
|
+
|
24
|
+
# Build the file handler first: every DEBUG record reaching it is written to
# "layout_debug.log" with a timestamp, logger name and level prefix.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler = logging.FileHandler("layout_debug.log")
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(formatter)

# Enable DEBUG output for the library, then attach the file handler to the
# layout logger so its records are also captured on disk.
configure_logging(level=logging.DEBUG)
layout_logger = logging.getLogger("natural_pdf.analyzers.layout")
layout_logger.setLevel(logging.DEBUG)
layout_logger.addHandler(file_handler)
|
35
|
+
|
36
|
+
def _summarize_regions(page, regions, page_width, output_dir, stem):
    """Highlight ``regions`` on ``page`` and write an image plus a text summary.

    Writes ``<stem>_regions.png`` (rendering with highlights) and
    ``<stem>_regions.txt`` (bbox, size, confidence and share of page width for
    each region) into ``output_dir``.
    """
    page.clear_highlights()
    for region in regions:
        region.highlight(label=f"{region.region_type} ({region.width:.1f}x{region.height:.1f})")
    page.to_image(labels=True).save(os.path.join(output_dir, f"{stem}_regions.png"))

    with open(os.path.join(output_dir, f"{stem}_regions.txt"), "w") as f:
        for i, region in enumerate(regions):
            f.write(f"Region #{i+1} ({region.region_type}):\n")
            f.write(f" Bbox: ({region.x0:.2f}, {region.top:.2f}, {region.x1:.2f}, {region.bottom:.2f})\n")
            f.write(f" Dimensions: {region.width:.2f} x {region.height:.2f}\n")
            f.write(f" Confidence: {region.confidence:.4f}\n")
            f.write(f" % of page width: {(region.width / page_width) * 100:.2f}%\n\n")


def _save_model_comparison(page, output_dir):
    """Paste the saved YOLO and TATR renderings side by side.

    Returns True when ``model_comparison.png`` was written and False when one
    of the per-model renderings could not be loaded.
    """
    page_image = page.to_image(resolution=150)
    panel_size = (page_image.width, page_image.height)

    # Canvas: two panels with 20px of padding between them and a 50px title band.
    combined = Image.new(
        'RGB',
        (page_image.width * 2 + 20, page_image.height + 50),
        (255, 255, 255),
    )

    # ImageFont can be None (module-level import fallback) and "Arial" may not
    # be installed; in both cases fall back to PIL's default font.
    font = None
    if ImageFont is not None:
        try:
            font = ImageFont.truetype("Arial", 16)
        except (OSError, ImportError):
            font = None

    draw = ImageDraw.Draw(combined)
    draw.text((page_image.width // 2, 20), "YOLO Model", fill=(0, 0, 0), font=font)
    draw.text((page_image.width * 3 // 2 + 20, 20), "TATR Model", fill=(0, 0, 0), font=font)

    panels = []
    try:
        for stem in ("yolo", "tatr"):
            with Image.open(os.path.join(output_dir, f"{stem}_regions.png")) as img:
                # resize()/copy() both return a loaded image that stays valid
                # after the underlying file is closed.
                panels.append(img.resize(panel_size) if img.size != panel_size else img.copy())
    except OSError:
        print("Warning: Could not load images for side-by-side comparison. Continuing...")
        return False

    combined.paste(panels[0], (0, 50))
    combined.paste(panels[1], (page_image.width + 20, 50))
    combined.save(os.path.join(output_dir, "model_comparison.png"))
    return True


def _write_width_stats(f, label, regions, page_width):
    """Write width statistics for ``regions`` to ``f``.

    Returns the average region width, or None when ``regions`` is empty.
    """
    f.write(f"=== {label} Model Regions ===\n")
    if not regions:
        f.write("No regions detected\n\n")
        return None

    widths = [r.width for r in regions]
    width_pcts = [(r.width / page_width) * 100 for r in regions]
    average = sum(widths) / len(widths)

    f.write(f"Total regions: {len(regions)}\n")
    f.write(f"Average width: {average:.2f}\n")
    f.write(f"Average width as % of page: {sum(width_pcts) / len(width_pcts):.2f}%\n")
    f.write(f"Min width: {min(widths):.2f} ({min(width_pcts):.2f}%)\n")
    f.write(f"Max width: {max(widths):.2f} ({max(width_pcts):.2f}%)\n\n")
    return average


def debug_detection_coordinates(pdf_path, page_num=0, output_dir="output"):
    """Debug layout detection coordinates for both YOLO and TATR models.

    Runs ``analyze_layout`` with ``model="yolo"`` and ``model="tatr"`` on one
    page while temporarily wrapping ``analyze_layout`` (and, when importable,
    ``document_layout.convert_to_regions``) to print every detection's raw and
    scaled coordinates. Writes per-model images and summaries, a side-by-side
    comparison image and ``dimension_analysis.txt`` into ``output_dir``.

    Args:
        pdf_path: Path of the PDF to open.
        page_num: Index into ``pdf.pages`` of the page to analyze.
        output_dir: Directory for all generated files (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Open PDF
    pdf = PDF(pdf_path)
    page = pdf.pages[page_num]

    # Get original page dimensions for reference
    page_width = page.width
    page_height = page.height
    print(f"Page dimensions: {page_width} x {page_height}")

    page_cls = type(page)
    original_analyze_layout = page_cls.analyze_layout

    def debug_analyze_layout(self, *args, **kwargs):
        """Wrapped analyze_layout method with debug logging."""
        model_type = kwargs.get('model', 'yolo')
        print(f"\n=== Running layout analysis with {model_type.upper()} model ===")

        document_layout = None
        old_convert_to_regions = None
        try:
            from natural_pdf.analyzers import document_layout
            old_convert_to_regions = document_layout.convert_to_regions
        except (ImportError, AttributeError):
            print("Could not monkey patch convert_to_regions")

        if old_convert_to_regions is not None:
            def debug_convert_to_regions(page, detections, scale_factor=1.0):
                """Monkey patched version with detailed logging."""
                print(f"Converting {len(detections)} detections with scale factor {scale_factor}")

                for i, det in enumerate(detections):
                    bbox = det['bbox']
                    x_min, y_min, x_max, y_max = bbox
                    width = x_max - x_min
                    height = y_max - y_min

                    pdf_x0 = x_min * scale_factor
                    pdf_y0 = y_min * scale_factor
                    pdf_x1 = x_max * scale_factor
                    pdf_y1 = y_max * scale_factor

                    print(f"Detection #{i+1} ({det['class']}):")
                    print(f" Raw bbox: {bbox}")
                    print(f" Image dimensions: {width:.2f} x {height:.2f}")
                    print(f" PDF coords: ({pdf_x0:.2f}, {pdf_y0:.2f}, {pdf_x1:.2f}, {pdf_y1:.2f})")
                    print(f" PDF dimensions: {pdf_x1-pdf_x0:.2f} x {pdf_y1-pdf_y0:.2f}")
                    print(f" Image-to-PDF ratio: width={scale_factor:.4f}, height={scale_factor:.4f}")

                # Call the original function
                return old_convert_to_regions(page, detections, scale_factor)

            document_layout.convert_to_regions = debug_convert_to_regions

        try:
            return original_analyze_layout(self, *args, **kwargs)
        finally:
            # Undo the module-level patch even when the analysis raises.
            if old_convert_to_regions is not None:
                document_layout.convert_to_regions = old_convert_to_regions

    # The patch is installed on the class, so it is removed again in `finally`
    # to avoid leaking the debug wrapper to other pages after this call.
    page_cls.analyze_layout = debug_analyze_layout
    try:
        # Run YOLO model layout detection
        page.analyze_layout(model="yolo")
        yolo_regions = page.find_all('region[model=yolo]')
        print(f"YOLO detected {len(yolo_regions)} regions")
        _summarize_regions(page, yolo_regions, page_width, output_dir, "yolo")

        # Clear existing layout regions
        page._regions['detected'] = []

        # Run TATR model layout detection
        page.analyze_layout(model="tatr")
        tatr_regions = page.find_all('region[model=tatr]')
        print(f"TATR detected {len(tatr_regions)} regions")
        _summarize_regions(page, tatr_regions, page_width, output_dir, "tatr")
    finally:
        page_cls.analyze_layout = original_analyze_layout

    # A failed comparison image is only a warning; the numeric analysis below
    # is still written.
    _save_model_comparison(page, output_dir)

    # Create a document comparing page dimensions to detected region dimensions
    with open(os.path.join(output_dir, "dimension_analysis.txt"), "w") as f:
        f.write(f"Page dimensions: {page_width:.2f} x {page_height:.2f}\n\n")

        avg_yolo_width = _write_width_stats(f, "YOLO", yolo_regions, page_width)
        avg_tatr_width = _write_width_stats(f, "TATR", tatr_regions, page_width)

        # If both have regions, compare them
        if avg_yolo_width is not None and avg_tatr_width is not None:
            width_ratio = avg_tatr_width / avg_yolo_width if avg_yolo_width > 0 else 0

            f.write("=== Comparison ===\n")
            f.write(f"TATR avg width / YOLO avg width = {width_ratio:.4f}\n")
            if width_ratio > 0:
                f.write(f"YOLO is {100 * (1 - 1/width_ratio):.2f}% narrower than TATR on average\n")
            else:
                # A zero ratio would divide by zero in the formula above.
                f.write("Relative narrowing undefined: an average width is zero\n")

    print(f"\nDebug information saved to {output_dir} directory")
    print(f"See 'layout_debug.log' for detailed process logging")
|
249
|
+
|
250
|
+
if __name__ == "__main__":
    # Command-line entry point: one required PDF path plus optional page
    # index and output directory, forwarded to debug_detection_coordinates.
    cli = argparse.ArgumentParser(description="Debug layout detection coordinate differences")
    cli.add_argument("pdf_path", help="Path to the PDF file to analyze")
    cli.add_argument("--page", type=int, default=0, help="Page number to analyze (0-indexed)")
    cli.add_argument("--output", default="output", help="Output directory for debug files")
    options = cli.parse_args()

    debug_detection_coordinates(
        options.pdf_path,
        page_num=options.page,
        output_dir=options.output,
    )
|
@@ -0,0 +1,77 @@
|
|
1
|
+
"""
|
2
|
+
Test the improved highlight_all behavior with layout regions.
|
3
|
+
|
4
|
+
This example demonstrates how the updated highlight_all method properly
|
5
|
+
highlights layout regions by model and type.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import argparse
|
10
|
+
|
11
|
+
# Add the parent directory to the Python path
|
12
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
# Resolve the project root (the parent of this script's directory) so the
# default PDF and the output folder do not depend on the working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")
# Images are written here; create it up front so saving cannot fail on a
# fresh checkout where the folder does not exist yet.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Set up argument parser
parser = argparse.ArgumentParser(description="Layout highlighting test")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

print(f"Testing layout highlighting on: {args.pdf_path}")
print(f"Page: {args.page}")

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]


def _save_highlights(filename):
    """Render the page with its current highlights to ``output_dir/filename``.

    Returns the full path that was written.
    """
    target = os.path.join(output_dir, filename)
    page.to_image(path=target, show_labels=True)
    print(f"Saved to {target}")
    return target


# First, let's show the regular highlight_all without layout regions
print("Creating image with standard highlight_all (no layout regions)...")
page.clear_highlights()
page.highlight_all()
output_path = _save_highlights("standard_highlight_all.png")

# Now run layout analysis with YOLO
print("\nRunning YOLO layout analysis...")
page.analyze_layout(model="yolo", confidence=0.2)
print(f"Found {len(page.detected_layout_regions)} YOLO layout regions")

# Create an image with highlight_all including layout regions
print("Creating image with highlight_all including YOLO layout regions...")
page.clear_highlights()
page.highlight_all(include_layout_regions=True)
output_path = _save_highlights("highlight_all_with_yolo.png")

# Now run table structure analysis with TATR and append to existing regions
print("\nRunning TATR table structure analysis...")
page.analyze_layout(model="tatr", confidence=0.3, existing="append")
print(f"Found {len(page.detected_layout_regions)} total layout regions (YOLO + TATR)")

# Create an image with highlight_all including all layout regions
print("Creating image with highlight_all including all layout regions...")
page.clear_highlights()
page.highlight_all(include_layout_regions=True)
output_path = _save_highlights("highlight_all_with_all_layouts.png")

# Compare with the original highlight_layout method
print("\nCreating image with highlight_layout method for comparison...")
page.clear_highlights()
page.highlight_layout()
output_path = _save_highlights("highlight_layout_method.png")

print("\nDone!")
|
@@ -0,0 +1,70 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the logging system in Natural PDF.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
import logging
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add the project directory to the path to import the library
|
10
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11
|
+
from natural_pdf import configure_logging, PDF
|
12
|
+
|
13
|
+
def main():
    """Demonstrate Natural PDF logging at INFO, DEBUG and file-handler levels.

    Loads ``./pdfs/01-practice.pdf`` (or the first ``*.pdf`` found under
    ``./pdfs``), runs OCR text extraction and paddle layout analysis on the
    first page under different logging configurations, and finally routes
    DEBUG logs to ``output/natural_pdf.log``.

    Raises:
        FileNotFoundError: If neither the default PDF nor any ``*.pdf`` under
            ``./pdfs`` exists.
    """
    # Basic setup with INFO level
    print("=== Configuring logging at INFO level ===")
    configure_logging(level=logging.INFO)

    # Create a PDF
    pdf_path = "./pdfs/01-practice.pdf"
    if not os.path.exists(pdf_path):
        # Try another file if the first one doesn't exist; fail with a clear
        # message instead of an IndexError when the folder has no PDFs.
        fallback = next(Path("./pdfs").glob("*.pdf"), None)
        if fallback is None:
            raise FileNotFoundError(
                "No PDF found: expected ./pdfs/01-practice.pdf or any *.pdf under ./pdfs"
            )
        pdf_path = fallback

    print(f"\nLoading PDF with standard logging: {pdf_path}")
    pdf = PDF(pdf_path)

    # OCR with default settings (should log OCR engine initialization)
    print("\nExtracting text with OCR")
    text = pdf.pages[0].extract_text(ocr=True)
    print(f"Extracted {len(text)} characters")

    # Switch to DEBUG level
    print("\n=== Configuring logging at DEBUG level ===")
    configure_logging(level=logging.DEBUG)

    # Try layout detection (generates more detailed logs)
    print("\nRunning layout detection with DEBUG logging")
    regions = pdf.pages[0].analyze_layout(
        model="paddle",
        model_params={"detect_text": True, "verbose": True},
    )
    print(f"Found {len(regions)} regions")

    # Try logging to a file
    print("\n=== Logging to a file ===")
    log_file = os.path.join("output", "natural_pdf.log")
    os.makedirs(os.path.dirname(log_file), exist_ok=True)

    # Create a file handler with custom formatter
    file_handler = logging.FileHandler(log_file, mode='w')  # 'w' mode to overwrite any existing file
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Configure logging with the file handler
    configure_logging(level=logging.DEBUG, handler=file_handler)

    # Force a few log events
    logger = logging.getLogger("natural_pdf")
    logger.debug("This is a debug message written to the log file")
    logger.info("This is an info message written to the log file")
    logger.warning("This is a warning message written to the log file")

    # Run another OCR operation to log to the file; only the log output
    # matters here, so the extracted text is discarded.
    print(f"Running OCR with logging to {log_file}")
    pdf.pages[0].extract_text(ocr=True)

    print(f"\nDone! Check {log_file} for detailed logs.")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,193 @@
|
|
1
|
+
"""
|
2
|
+
Comprehensive OCR example.
|
3
|
+
|
4
|
+
This example demonstrates the full range of OCR capabilities
|
5
|
+
in natural-pdf, including:
|
6
|
+
1. Multiple configuration formats
|
7
|
+
2. Auto mode
|
8
|
+
3. Region-specific OCR
|
9
|
+
4. Selector filtering by source and confidence
|
10
|
+
5. Visualization of OCR results
|
11
|
+
"""
|
12
|
+
import os
|
13
|
+
import sys
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
# Add parent directory to path for imports
|
17
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
18
|
+
|
19
|
+
from natural_pdf import PDF
|
20
|
+
|
21
|
+
# Anchor all paths at the project root (the parent of this script's folder).
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path (replace with a scanned document path for better results)
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
# Output directory
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)


def _confidence_color(confidence):
    """Return the RGBA highlight color (integer channels 0-255) for a confidence."""
    if confidence >= 0.8:
        return (0, 204, 0, 76)  # Green for high confidence
    if confidence >= 0.6:
        return (230, 230, 0, 76)  # Yellow for medium confidence
    return (204, 0, 0, 76)  # Red for low confidence


def _threshold_config(text_threshold):
    """Build the OCR config used for the text_threshold comparison."""
    return {
        "languages": ["en"],
        "detection_params": {
            "text_threshold": text_threshold,
        },
    }


print("Comprehensive OCR Example")
print("========================")

# 1. Demonstrate different OCR configuration formats
print("\n1. Different OCR configuration formats")

# Simple flag
pdf_simple = PDF(default_pdf, ocr=True)
print(" - Simple flag (ocr=True): OCR enabled with defaults")

# Auto mode
pdf_auto = PDF(default_pdf, ocr="auto")
print(" - Auto mode (ocr='auto'): OCR applied only when needed")

# Language list
pdf_langs = PDF(default_pdf, ocr=["en"])
print(" - Language list (ocr=['en']): English language OCR")

# Detailed config with detection parameters
detection_params = {
    # Text detection parameters for CRAFT
    "text_threshold": 0.1,   # Lower threshold to detect more text (default is 0.7)
    "low_text": 0.3,         # Lower threshold for text box filtering (default is 0.4)
    "link_threshold": 0.3,   # Lower threshold for link between text (default is 0.4)
    "canvas_size": 2560,     # Maximum image size
    "mag_ratio": 1.5,        # Image magnification ratio (increase for better detection)
}
recognition_params = {
    # Optional recognition parameters
    "decoder": "greedy",
    "batch_size": 4,
    "contrast_ths": 0.05,    # Lower contrast threshold
}
detailed_config = {
    "enabled": True,
    "engine": "easyocr",
    "languages": ["en"],
    "min_confidence": 0.6,
    "paragraph": False,
    "detection_params": detection_params,
    "recognition_params": recognition_params,
}
pdf_detailed = PDF(default_pdf, ocr=detailed_config)
print(" - Detailed config: Custom parameters including text_threshold=0.1")

# 2. Auto mode demonstration
print("\n2. Auto mode OCR")
pdf = PDF(default_pdf, ocr="auto")
page = pdf.pages[0]

# Extract text with auto OCR
text = page.extract_text()
print(f" - Auto mode extracted {len(text)} characters")
print(f" - First 100 chars: {text[:100]}...")

# 3. Explicit OCR application
print("\n3. Explicit OCR application")
ocr_elements = page.apply_ocr()
print(f" - Found {len(ocr_elements)} OCR elements")
print(" - Sample OCR elements:")
for rank, elem in enumerate(ocr_elements[:3], start=1):
    print(f" {rank}. '{elem.text}' (confidence: {elem.confidence:.2f})")

# 4. OCR with confidence filtering
print("\n4. OCR confidence filtering")
high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
print(f" - Found {len(high_conf)} high-confidence OCR elements")
low_conf = page.find_all('text[source=ocr][confidence<0.6]')
print(f" - Found {len(low_conf)} low-confidence OCR elements")

# 5. Content-based OCR filtering
print("\n5. Content-based OCR filtering")
contains_a = page.find_all('text[source=ocr]:contains("a")')
print(f" - Found {len(contains_a)} OCR elements containing 'a'")

# 6. Region-specific OCR
print("\n6. Region-specific OCR")
# Create a region - adjust coordinates if needed for your document
region = page.create_region(100, 100, 400, 200)
region_elems = region.apply_ocr()
print(f" - Applied OCR to region, found {len(region_elems)} elements")
region_text = region.extract_text()
print(f" - Region text: '{region_text[:50]}...'")

# 7. OCR visualization
print("\n7. OCR visualization")
# Clear any existing highlights
page.clear_highlights()

# Highlight all OCR elements, color-coded by confidence
print(" - Highlighting all OCR elements with confidence scores")
for elem in ocr_elements:
    color = _confidence_color(elem.confidence)
    elem.highlight(label=f"OCR ({elem.confidence:.2f})", color=color)

# Save the image
output_path = os.path.join(output_dir, "ocr_confidence_visualization.png")
page.to_image(path=output_path, show_labels=True)
print(f" - Saved visualization to {output_path}")

# 8. Demonstrate override at extraction time
print("\n8. OCR override at extraction time")
text_default = page.extract_text()
print(f" - Default extraction: {len(text_default)} characters")

override_config = {
    "languages": ["en"],
    "min_confidence": 0.4,  # Lower threshold
}
text_override = page.extract_text(ocr=override_config)
print(f" - Override extraction (min_confidence=0.4): {len(text_override)} characters")

# 9. OCR element properties
print("\n9. OCR element properties")
if ocr_elements:
    elem = ocr_elements[0]
    print(f" - Source: {elem.source}")
    print(f" - Confidence: {elem.confidence:.2f}")
    print(f" - Text: '{elem.text}'")
    print(f" - Bounding box: {elem.bbox}")
    print(f" - Font (default for OCR): {elem.fontname}")

# 10. Compare OCR with lower text_threshold to detect more text
print("\n10. Comparing OCR with different text_threshold values")
page.clear_highlights()

# First with default text_threshold (0.7)
print(" - Running OCR with default text_threshold (0.7)")
default_elements = page.extract_ocr_elements(ocr=_threshold_config(0.7))
print(f" - Found {len(default_elements)} elements with default text_threshold")

# Highlight with blue
for elem in default_elements:
    elem.highlight(label="Default threshold", color=(0, 0, 204, 76))

# Now with lower text_threshold (0.1)
print(" - Running OCR with lower text_threshold (0.1)")
low_threshold_elements = page.extract_ocr_elements(ocr=_threshold_config(0.1))
print(f" - Found {len(low_threshold_elements)} elements with text_threshold=0.1")

# Highlight with red
for elem in low_threshold_elements:
    elem.highlight(label="Lower threshold (0.1)", color=(204, 0, 0, 76))

# Save comparative visualization
output_path = os.path.join(output_dir, "ocr_threshold_comparison.png")
page.to_image(path=output_path, show_labels=True)
print(f" - Saved threshold comparison to {output_path}")

print("\nDone!")
|
@@ -0,0 +1,87 @@
|
|
1
|
+
"""
|
2
|
+
OCR Debug Report Example
|
3
|
+
|
4
|
+
This example demonstrates the OCR debugging feature, which generates an interactive
|
5
|
+
HTML report for analyzing and correcting OCR results.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
from pathlib import Path
|
10
|
+
|
11
|
+
# Get the parent directory (project root)
|
12
|
+
project_root = Path(__file__).absolute().parent.parent
|
13
|
+
sys.path.insert(0, str(project_root))
|
14
|
+
|
15
|
+
from natural_pdf import PDF
|
16
|
+
|
17
|
+
# All generated reports go into <project_root>/output.
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

# Candidate PDFs in order of preference (the default is a scanned document).
pdf_path = os.path.join(project_root, "pdfs", "HARRY ROQUE_redacted.pdf")
fallback_pdf = os.path.join(project_root, "pdfs", "needs-ocr.pdf")

# Use the first candidate that exists; exit with status 1 when none does.
for candidate in (pdf_path, fallback_pdf):
    if os.path.exists(candidate):
        pdf_path = candidate
        print(f"Using PDF: {pdf_path}")
        break
else:
    print("No suitable PDF found. Please provide a scanned PDF.")
    sys.exit(1)

# OCR Debug Example
print("\nOCR Debug Report Example")
print("=======================")

# Load PDF with OCR enabled
print("\n1. Loading PDF with OCR enabled")
ocr_settings = {
    "enabled": True,        # Enable OCR
    "languages": ["en"],
    "min_confidence": 0.3,  # Lower threshold to get more results
}
# The PaddleOCR engine is requested explicitly via ocr_engine.
pdf = PDF(pdf_path, ocr=ocr_settings, ocr_engine="paddleocr")

# Run OCR on the first page
print("\n2. Running OCR on the first page")
page = pdf.pages[0]
ocr_elements = page.extract_ocr_elements()
print(f"Found {len(ocr_elements)} OCR text elements")

# Generate a debug report for a single page
print("\n3. Generating OCR debug report for a single page")
page_report_path = os.path.join(output_dir, "ocr_debug_page.html")
page.debug_ocr(page_report_path)
print(f"Saved page debug report to: {page_report_path}")

# Generate a debug report for multiple pages
print("\n4. Generating OCR debug report for multiple pages")
pdf_report_path = os.path.join(output_dir, "ocr_debug_full.html")
pdf.debug_ocr(pdf_report_path)
print(f"Saved full PDF debug report to: {pdf_report_path}")

# Generate a debug report for a page range (at most the first three pages)
print("\n5. Generating OCR debug report for a page range")
page_count = len(pdf.pages)
if page_count <= 1:
    print("PDF has only one page, skipping page range example")
else:
    page_range_report_path = os.path.join(output_dir, "ocr_debug_range.html")
    page_range = pdf.pages[0:min(3, page_count)]
    page_range.debug_ocr(page_range_report_path)
    print(f"Saved page range debug report to: {page_range_report_path}")

# Closing notes describing what the generated reports offer.
for closing_line in (
    "\nDone! The debug reports have been saved to the output directory.",
    "You can open them in a web browser to interactively review the OCR results.",
    "The reports allow you to:",
    "- Filter results by confidence score",
    "- Search for specific text",
    "- Sort results by different criteria",
    "- Edit/correct OCR text",
    "- Export corrected text as JSON for further processing",
):
    print(closing_line)
|