natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,190 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the optimized exclusion handling for various region types.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
import time
|
9
|
+
|
10
|
+
# Add parent directory to path for imports
|
11
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
12
|
+
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
|
16
|
+
def measure_time(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapper forwards all positional and keyword arguments to *func* and
    returns its result unchanged. After a successful call it prints a line
    of the form ``Time taken: 0.1234 seconds``. If *func* raises, the
    exception propagates and no timing line is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and metadata as *func*.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    import functools

    @functools.wraps(func)  # preserve __name__, __doc__, etc. of the target
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high resolution; time.time() can
        # jump if the system clock is adjusted while the call is running.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Time taken: {elapsed:.4f} seconds")
        return result

    return wrapper
|
25
|
+
|
26
|
+
|
27
|
+
def optimized_exclusion_example(pdf_path):
    """
    Walk through exclusion-aware text extraction on the first page of a PDF.

    Registers three exclusion zones (header, footer, left side panel) on the
    page, highlights three test regions, saves the annotated page as an image
    in ``../output`` relative to this script, and then extracts each test
    region's text with and without ``apply_exclusions`` while printing the
    elapsed time of every extraction.

    Args:
        pdf_path: Path of the PDF file to open.
    """

    @measure_time
    def timed_extract(region, apply_exclusions):
        # Single timed entry point shared by every comparison below.
        return region.extract_text(apply_exclusions=apply_exclusions)

    def compare(region):
        # Extract with exclusions first, then without, and report both lengths.
        print("Extracting text with apply_exclusions=True...")
        with_exclusions = timed_extract(region, True)
        print("Extracting text with apply_exclusions=False (for comparison)...")
        without_exclusions = timed_extract(region, False)
        print(
            f"Text length comparison: with exclusions={len(with_exclusions)}, "
            f"without={len(without_exclusions)}"
        )
        return with_exclusions, without_exclusions

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]
        print(f"Using PDF: {pdf_path}")

        def highlighted_region(x0, top, x1, bottom, label, color):
            # Create a region on the page and immediately highlight it.
            region = page.create_region(x0, top, x1, bottom)
            region.highlight(label=label, color=color)
            return region

        # Images are written next to the examples folder, in ../output.
        examples_dir = os.path.dirname(__file__)
        output_dir = os.path.abspath(os.path.join(examples_dir, '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # --- Step 1: exclusion zones -------------------------------------
        print("\n=== Setting Up Exclusion Zones ===")
        # Header: top 10% of the page.
        page.add_exclusion(
            highlighted_region(
                0, 0, page.width, page.height * 0.1,
                "Header Exclusion", (1, 0, 0, 0.3),
            )
        )
        # Footer: bottom 10% of the page.
        page.add_exclusion(
            highlighted_region(
                0, page.height * 0.9, page.width, page.height,
                "Footer Exclusion", (0, 0, 1, 0.3),
            )
        )
        # Side panel: left 20% of the page, full height.
        page.add_exclusion(
            highlighted_region(
                0, 0, page.width * 0.2, page.height,
                "Side Panel Exclusion", (0, 1, 0, 0.3),
            )
        )
        print(f"Added 3 exclusion zones: header, footer, and side panel")

        # --- Step 2: test regions ----------------------------------------
        print("\n=== Creating Test Regions ===")
        # Centre of the page, clear of every exclusion zone.
        non_intersecting = highlighted_region(
            page.width * 0.3, page.height * 0.3,
            page.width * 0.8, page.height * 0.7,
            "Non-Intersecting Region", (1, 1, 0, 0.3),
        )
        # Whole page, so it overlaps the header and footer zones.
        header_footer_region = highlighted_region(
            0, 0, page.width, page.height,
            "Full Page Region", (1, 0, 1, 0.2),
        )
        # Left half of the middle band, overlapping the side panel.
        complex_region = highlighted_region(
            0, page.height * 0.2,
            page.width * 0.5, page.height * 0.8,
            "Complex Region", (0, 1, 1, 0.3),
        )
        print("Created 3 test regions with different exclusion intersection patterns")

        # Persist the annotated page for visual inspection.
        output_file = os.path.join(output_dir, "exclusion_optimization_regions.png")
        page.save_image(output_file, labels=True)
        print(f"Saved visualization to: {output_file}")

        # --- Step 3: timed extraction comparisons ------------------------
        print("\n=== Testing Text Extraction with Exclusions ===")

        print("\nNon-Intersecting Region:")
        print("This region should use the fast path (no exclusion checking)")
        excluded, unfiltered = compare(non_intersecting)
        print(f"Identical results: {excluded == unfiltered}")

        print("\nFull Page Region (intersecting header/footer):")
        print("This region should use cropping optimization for header/footer exclusions")
        excluded, unfiltered = compare(header_footer_region)
        print(f"Header/footer content excluded: {len(unfiltered) > len(excluded)}")

        print("\nComplex Region (intersecting side panel):")
        print("This region should use filtering with warning")
        compare(complex_region)

        # --- Step 4: summary ---------------------------------------------
        print("\n=== Summary ===")
        print("1. Non-intersecting region: Optimization should skip exclusion checks entirely")
        print("2. Header/footer region: Optimization should use direct cropping")
        print("3. Complex region: Falls back to filtering with warning")
        print("\nCheck the produced warning messages to confirm the behavior.")
|
168
|
+
|
169
|
+
|
170
|
+
def main():
    """Run the exclusion example on a PDF chosen from the CLI or ``../pdfs``.

    The first command-line argument is used as the PDF path when given.
    Otherwise the first ``*.pdf`` found in the ``pdfs`` directory beside the
    examples folder is used; if there is none, a message is printed and the
    process exits with status 1.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # No explicit path: fall back to whatever PDF ships in ../pdfs.
        candidates = list((Path(__file__).parent.parent / "pdfs").glob("*.pdf"))
        if not candidates:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(candidates[0])

    optimized_exclusion_example(pdf_path)


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,128 @@
|
|
1
|
+
"""
|
2
|
+
Test script for the new extract_text implementation that uses pdfplumber's native functionality.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from io import StringIO
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
import time
|
12
|
+
|
13
|
+
def _show_result(text, elapsed):
    """Print the length, elapsed time and a 200-character preview of *text*."""
    print(f"Length: {len(text)} characters, Time: {elapsed:.4f} seconds")
    print(text[:200] + "..." if len(text) > 200 else text)


def main():
    """Exercise ``Region.extract_text`` on several region shapes.

    Uses ``pdfs/01-practice.pdf`` unless a path is given as the first
    command-line argument. On the first page it compares direct pdfplumber
    cropping with region-based extraction, then times extraction for a full
    page with an exclusion, a triangular polygon region, the OCR pathway and
    ``page.extract_text``.
    """
    from natural_pdf.elements.region import Region

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "pdfs/01-practice.pdf"

    # Context manager so the document is closed even if a step below fails.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]
        print(f"Loaded {pdf_path}, processing first page...")

        # Rectangular regions: whole page, top third, bottom third.
        full_region = page.create_region(0, 0, page.width, page.height)
        top_region = page.create_region(0, 0, page.width, page.height / 3)
        bottom_region = page.create_region(0, page.height * 2/3, page.width, page.height)

        # Non-rectangular region: start from a bbox, then override the
        # polygon directly (private attribute) to form a triangle.
        triangle_region = page.create_region(0, 0, page.width, page.height/2)
        triangle_region._polygon = [(0, 0), (page.width, 0), (page.width/2, page.height/2)]

        # The bottom third is excluded for every exclusion-aware extraction.
        page.add_exclusion(bottom_region)

        # 1. Top region, with debugging detours through pdfplumber itself.
        print("\nExtracting text from top region:")
        start = time.time()
        crop_bbox = top_region.bbox
        print(f"Using bbox: {crop_bbox}")

        cropped = page._page.crop(crop_bbox)
        direct_text = cropped.extract_text(keep_blank_chars=True)
        print(f"Direct crop text length: {len(direct_text)}, Text: {direct_text[:100]}")

        # Rebuild the region from plain coordinates to rule out a problem
        # with passing the original instance around.
        print("Converting region to a dictionary and creating a new Region")
        region_dict = {
            'x0': top_region.x0,
            'top': top_region.top,
            'x1': top_region.x1,
            'bottom': top_region.bottom,
        }
        bbox = (region_dict['x0'], region_dict['top'], region_dict['x1'], region_dict['bottom'])
        test_region = Region(page, bbox)
        print(f"New region bbox: {test_region.bbox}")

        print("Testing direct pdfplumber crop and extract:")
        crop_bbox = test_region.bbox
        cropped_page = page._page.crop(crop_bbox)
        print(f"Cropped page dimensions: {cropped_page.width} × {cropped_page.height}")
        print(f"Cropped page characters: {len(cropped_page.chars)}")
        if cropped_page.chars:
            print(f"First few chars: {cropped_page.chars[:3]}")
        direct_crop_text = cropped_page.extract_text(keep_blank_chars=True)
        print(f"Direct pdfplumber extraction: {len(direct_crop_text)} chars")
        print(direct_crop_text[:100])

        # Capture anything extract_text writes to stderr.
        original_stderr = sys.stderr
        string_stderr = StringIO()
        sys.stderr = string_stderr
        try:
            text = test_region.extract_text(keep_blank_chars=True)
        finally:
            # Restore the real stream even when extraction raises; otherwise
            # the traceback itself would vanish into the StringIO buffer.
            sys.stderr = original_stderr
        stderr_output = string_stderr.getvalue()
        print(f"Stderr output from extract_text call:\n{stderr_output}")

        _show_result(text, time.time() - start)

        # 2. Full page with exclusions applied.
        print("\nExtracting text from full page with exclusions:")
        start = time.time()
        text = full_region.extract_text(apply_exclusions=True)
        _show_result(text, time.time() - start)

        # 3. Polygon region (triangle).
        print("\nExtracting text from triangle region:")
        start = time.time()
        text = triangle_region.extract_text()
        _show_result(text, time.time() - start)

        # 4. OCR option, to exercise that pathway.
        print("\nExtracting text with OCR option:")
        start = time.time()
        text = top_region.extract_text(ocr={"enabled": True})
        _show_result(text, time.time() - start)

        # Baseline: the page-level extract_text method.
        print("\nExtraction with page.extract_text for comparison:")
        start = time.time()
        text = page.extract_text(preserve_whitespace=True, apply_exclusions=True)
        _show_result(text, time.time() - start)


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,101 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating font-aware text extraction in Natural PDF.
|
3
|
+
|
4
|
+
This example shows how to use the font_attrs parameter to group text by font properties,
|
5
|
+
which helps preserve the formatting and style of text during extraction.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Add the parent directory to the path so we can import natural_pdf module
|
11
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
12
|
+
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
def main():
    """Compare text grouping under different ``font_attrs`` settings.

    The PDF path comes from the first command-line argument, falling back to
    ``examples/sample.pdf``; if that default does not exist a hint is printed
    and the function returns. The same PDF is opened three times (default
    grouping, ``font_attrs=[]``, and font plus colour) and the first text
    element, full-page extraction and a few raw elements are printed for
    comparison.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Default path - replace with an actual PDF when running locally.
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")

    # Step 1: default behaviour - group by fontname and size.
    print("\n1. Default behavior (group by fontname and size):")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    text_element = page.find("text")
    if text_element:
        print(f"Example text element: {text_element}")
        print(f"Font info: {text_element.font_info()}")

    # Step 2: no font attributes, so grouping is spatial only.
    print("\n2. Disable font-aware grouping (spatial only):")
    pdf_no_font = PDF(pdf_path, font_attrs=[])
    page_no_font = pdf_no_font.pages[0]

    text_element = page_no_font.find("text")
    if text_element:
        print(f"Example text element: {text_element}")

    # Step 3: additionally split groups on fill colour.
    print("\n3. Group by font and color:")
    pdf_with_color = PDF(pdf_path, font_attrs=['fontname', 'size', 'non_stroking_color'])
    page_with_color = pdf_with_color.pages[0]

    text_element = page_with_color.find("text")
    if text_element:
        print(f"Example text element: {text_element}")

    # Step 4: full-page extraction under each grouping.
    print("\n4. Text extraction comparison:")

    text_elements = page.find_all("text")
    if text_elements:
        region = page.create_region(0, 0, page.width, page.height)  # Use the full page

        default_text = region.extract_text()
        spatial_text = page_no_font.create_region(0, 0, page_no_font.width, page_no_font.height).extract_text()
        color_text = page_with_color.create_region(0, 0, page_with_color.width, page_with_color.height).extract_text()

        # Word counts give a quick signal of how grouping changed the output.
        print(f"Default grouping word count: {len(default_text.split())}")
        print(f"Spatial-only grouping word count: {len(spatial_text.split())}")
        print(f"Font+color grouping word count: {len(color_text.split())}")

        print("\nText samples (first 200 chars):")
        print(f"Default: {default_text[:200]}...")
        print(f"Spatial: {spatial_text[:200]}...")
        print(f"Color-aware: {color_text[:200]}...")

    # Step 5: raw element inspection.
    print("\n5. Character-level analysis:")

    # Query once and slice, instead of re-running the selector for the total.
    all_chars = page.find_all('char')
    chars = all_chars[:5]  # First 5 characters
    print(f"Raw character elements ({len(chars)} of {len(all_chars)} total):")
    for char in chars:
        print(f" - {char}")

    all_words = page.find_all("text")
    words = all_words[:3]  # First 3 words
    print(f"\nWord elements ({len(words)} of {len(all_words)} total):")
    for word in words:
        print(f" - {word}")
        print(f" Font info: {word.font_info()}")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,124 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating font variant detection in Natural PDF.
|
3
|
+
|
4
|
+
This example shows how to identify and filter text elements by font variant
|
5
|
+
(the prefix in embedded font names, such as 'AAAAAB+FontName').
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Add the parent directory to the path so we can import natural_pdf module
|
11
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
12
|
+
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
def main():
    """Survey the font variants used on the first page of a PDF.

    The PDF path is the first command-line argument, or
    ``examples/sample.pdf`` when none is given (returning early with a hint
    if that file is missing). The script tallies text elements per
    ``font_variant``, lists elements matching the first variant, and, when
    at least two variants exist, highlights both, saves the page image and
    prints the font info of one element from each.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # Default path - replace with an actual PDF when running locally.
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    document = PDF(pdf_path)
    page = document.pages[0]

    # --- 1. Tally text elements per font variant ---------------------------
    print("\n1. Identifying font variants")

    variants = {}
    for item in page.find_all('text'):
        key = item.font_variant
        if not key:
            continue
        if key not in variants:
            # Remember the first element seen as the representative example.
            variants[key] = {
                'count': 0,
                'example': item.text,
                'fontname': item.fontname,
            }
        variants[key]['count'] += 1

    print(f"Found {len(variants)} font variants on the page:")
    for name, details in variants.items():
        print(f" Variant: '{name}'")
        print(f" Full fontname: {details['fontname']}")
        print(f" Count: {details['count']} elements")
        print(f" Example text: '{details['example']}'")

    # --- 2. Filter by the first variant found ------------------------------
    print("\n2. Filtering by font variant")

    if not variants:
        print("No font variants found to filter by")
        return

    variant_names = list(variants)
    target_variant = variant_names[0]
    print(f"Filtering for variant: '{target_variant}'")

    matches = page.find_all(f'text[font-variant="{target_variant}"]')
    print(f"Found {len(matches)} elements with this variant")

    # Show at most five matches; the slice already caps the loop.
    for position, match in enumerate(matches[:5], start=1):
        print(f" Element {position}: '{match.text}'")

    # --- 3. Side-by-side comparison of two variants ------------------------
    print("\n3. Visual comparison of variants")

    if len(variant_names) < 2:
        return

    variant_1, variant_2 = variant_names[:2]
    print(f"Comparing variant '{variant_1}' with '{variant_2}':")

    elements_1 = page.find_all(f'text[font-variant="{variant_1}"]')
    elements_2 = page.find_all(f'text[font-variant="{variant_2}"]')

    # One colour per variant so they can be told apart in the image.
    if elements_1:
        elements_1.highlight(color=(1, 0, 0), label=f"Variant {variant_1}")
    if elements_2:
        elements_2.highlight(color=(0, 1, 0), label=f"Variant {variant_2}")

    # NOTE(review): the other example scripts call page.save_image(...);
    # confirm that page.save is the intended method here.
    highlight_path = "font_variants_highlight.png"
    page.save(highlight_path, labels=True)
    print(f"Highlighted comparison saved to {highlight_path}")

    if not (elements_1 and elements_2):
        return

    first_1 = elements_1[0]
    first_2 = elements_2[0]

    print("\nDetailed comparison of first elements from each variant:")

    for name, sample in ((variant_1, first_1), (variant_2, first_2)):
        print(f"\nVariant '{name}' font info:")
        for k, v in sample.font_info().items():
            print(f" {k}: {v}")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,124 @@
|
|
1
|
+
"""
|
2
|
+
Test for handling regions that overlap with a footer exclusion zone.
|
3
|
+
This is a focused test for the specific issue where regions that overlap with a footer
|
4
|
+
weren't returning any text.
|
5
|
+
"""
|
6
|
+
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import logging
|
10
|
+
from pathlib import Path
|
11
|
+
|
12
|
+
# Configure logging
|
13
|
+
import logging
|
14
|
+
logging.basicConfig(level=logging.INFO)
|
15
|
+
|
16
|
+
from natural_pdf import PDF
|
17
|
+
|
18
|
+
def main():
    """Check text extraction for a region that overlaps a footer exclusion.

    The PDF path is the first command-line argument, or the first ``*.pdf``
    in the ``pdfs`` directory beside the examples folder (exiting with
    status 1 if none exists). A footer exclusion covering the bottom 10% of
    the first page is added, and a region reaching into it is extracted in
    three ways: default exclusion handling, a manually cropped region, and
    manual element filtering. The annotated page is saved to
    ``output/footer_overlap_test.png`` and a pass/fail summary is printed.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Look for any PDF in the pdfs directory.
        pdfs_dir = Path(__file__).parent.parent / "pdfs"
        pdf_files = list(pdfs_dir.glob("*.pdf"))

        if pdf_files:
            pdf_path = str(pdf_files[0])
        else:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)

    print(f"\nTesting with PDF: {pdf_path}")

    # Context manager so the document is closed even if a step below fails.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Only a footer exclusion: the bottom 10% of the page.
        footer_height = page.height * 0.1
        footer = page.create_region(0, page.height - footer_height, page.width, page.height)
        footer.highlight(label="Footer Exclusion", color=(1, 0, 0, 0.3))
        page.add_exclusion(footer)
        print(f"Added footer exclusion: {footer.bbox}")

        # Test region: middle half of the width, from mid-page to the bottom
        # edge, so its lower part overlaps the footer.
        middle_to_footer = page.create_region(
            page.width * 0.25,
            page.height * 0.5,
            page.width * 0.75,
            page.height,
        )
        middle_to_footer.highlight(label="Middle to Footer", color=(0, 1, 0, 0.3))
        print(f"Created test region: {middle_to_footer.bbox}")

        # 1. Let the library apply the exclusion.
        print("\n=== 1. Using Default Extraction ===")
        text = middle_to_footer.extract_text(apply_exclusions=True, debug=True)
        print(f"Text length: {len(text)}")
        print(f"First 100 chars: {text[:100] if text else 'No text!'}")

        # 2. Shrink the region by hand so it stops at the top of the footer.
        print("\n=== 2. Using Direct Crop Approach ===")
        top_bound = middle_to_footer.top
        bottom_bound = page.height - footer_height  # Top of footer

        cropped_region = page.create_region(
            middle_to_footer.x0,
            top_bound,
            middle_to_footer.x1,
            bottom_bound,
        )
        cropped_region.highlight(label="Cropped Region", color=(0, 0, 1, 0.3))

        # Exclusions are off here because the crop already removed the footer.
        cropped_text = cropped_region.extract_text(apply_exclusions=False)
        print(f"Text length: {len(cropped_text)}")
        print(f"First 100 chars: {cropped_text[:100] if cropped_text else 'No text!'}")

        # 3. Keep elements whose centre lies in the region but not the footer.
        print("\n=== 3. Using Element Filtering Approach ===")
        filtered_elements = [
            element
            for element in page.get_elements()
            if middle_to_footer.x0 <= (element.x0 + element.x1) / 2 <= middle_to_footer.x1
            and middle_to_footer.top <= (element.top + element.bottom) / 2 <= middle_to_footer.bottom
            and not (footer.top <= (element.top + element.bottom) / 2 <= footer.bottom)
        ]

        filtered_text = " ".join(e.text for e in filtered_elements if hasattr(e, 'text'))
        print(f"Text length: {len(filtered_text)}")
        print(f"First 100 chars: {filtered_text[:100] if filtered_text else 'No text!'}")

        # The output folder may not exist on a fresh checkout; create it so
        # saving the image cannot fail on a missing directory.
        os.makedirs("output", exist_ok=True)
        page.save_image("output/footer_overlap_test.png", labels=True)
        print(f"\nTest visualization saved to output/footer_overlap_test.png")

        # An approach "works" when it produced any text at all.
        print("\nTEST SUMMARY:")
        outcomes = (
            (
                text,
                "✅ Default extraction works now with overlapping exclusions!",
                "❌ Default extraction still fails with overlapping exclusions!",
            ),
            (
                cropped_text,
                "✅ Manual cropping approach works!",
                "❌ Manual cropping approach fails!",
            ),
            (
                filtered_text,
                "✅ Element filtering approach works!",
                "❌ Element filtering approach fails!",
            ),
        )
        for extracted, success_message, failure_message in outcomes:
            print(success_message if len(extracted) > 0 else failure_message)


if __name__ == "__main__":
    main()
|