natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,149 @@
|
|
1
|
+
"""
|
2
|
+
Test the improved exclusion handling in Region.extract_text() method.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
import logging
|
8
|
+
from pathlib import Path
|
9
|
+
|
10
|
+
# Add parent directory to path for imports
|
11
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
12
|
+
|
13
|
+
from natural_pdf import PDF, configure_logging
|
14
|
+
|
15
|
+
# Configure logging
|
16
|
+
configure_logging(level=logging.DEBUG)
|
17
|
+
|
18
|
+
|
19
|
+
def test_region_with_exclusions(pdf_path):
    """
    Exercise Region.extract_text() against header, footer and side exclusions.

    Opens ``pdf_path``, registers three exclusion zones on the first page,
    then extracts text from several regions with and without exclusions,
    printing the character counts for comparison. Finally saves the page,
    with every highlighted region, to ``region_exclusion_test.png`` in the
    ``output`` directory next to this script's parent directory.
    """

    def extract_both(region):
        # Exclusion-aware extraction (with debug output) runs first, then
        # the raw extraction.
        filtered = region.extract_text(apply_exclusions=True, debug=True)
        unfiltered = region.extract_text(apply_exclusions=False)
        return filtered, unfiltered

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]
        print(f"\nTesting with PDF: {pdf_path} (page {page.number})")

        print("\n=== 1. Creating Test Exclusion Zones ===")
        # (kind, highlight label, RGBA colour, bounding box)
        exclusion_specs = [
            # Top 15% of the page
            ("header", "Header Exclusion", (1, 0, 0, 0.3),
             (0, 0, page.width, page.height * 0.15)),
            # Bottom 10% of the page
            ("footer", "Footer Exclusion", (0, 0, 1, 0.3),
             (0, page.height * 0.9, page.width, page.height)),
            # Partial-width strip on the left, middle of the page
            ("side", "Side Exclusion", (0, 1, 0, 0.3),
             (0, page.height * 0.4, page.width * 0.3, page.height * 0.6)),
        ]
        for kind, label, colour, bounds in exclusion_specs:
            zone = page.create_region(*bounds)
            zone.highlight(label=label, color=colour)
            page.add_exclusion(zone)
            print(f"Added {kind} exclusion: {zone.bbox}")

        print("\n=== 2. Testing Region That Doesn't Intersect Exclusions ===")
        # This region touches none of the zones, so both extractions
        # are expected to agree.
        non_intersecting = page.create_region(
            page.width * 0.4,
            page.height * 0.5,
            page.width * 0.9,
            page.height * 0.7,
        )
        non_intersecting.highlight(label="Non-Intersecting", color=(1, 1, 0, 0.3))
        kept, everything = extract_both(non_intersecting)
        print("Non-intersecting region text length:")
        print(f" - With exclusions: {len(kept)} chars")
        print(f" - Without exclusions: {len(everything)} chars")
        print(f" - Same result: {kept == everything}")

        print("\n=== 3. Testing Region With Header/Footer Intersection ===")
        # Spans the full page height, so it overlaps header and footer.
        full_height = page.create_region(
            page.width * 0.3,
            0,
            page.width * 0.8,
            page.height,
        )
        full_height.highlight(label="Full Height Region", color=(1, 0, 1, 0.3))
        kept, everything = extract_both(full_height)
        print("Full height region text length:")
        print(f" - With exclusions: {len(kept)} chars")
        print(f" - Without exclusions: {len(everything)} chars")
        print(f" - Exclusions removed {len(everything) - len(kept)} chars")

        # The specific case that was causing issues: a region starting
        # mid-page and running to the bottom, overlapping the footer.
        middle_to_footer = page.create_region(
            page.width * 0.3,
            page.height * 0.4,
            page.width * 0.8,
            page.height,
        )
        middle_to_footer.highlight(label="Middle to Footer", color=(0.5, 0.5, 0, 0.3))
        kept, everything = extract_both(middle_to_footer)
        print("\nMiddle-to-footer region text length:")
        print(f" - With exclusions: {len(kept)} chars")
        print(f" - Without exclusions: {len(everything)} chars")
        verdict = (
            " - Working correctly! Content found with exclusions applied"
            if len(kept) > 0
            else " - Still failing! No content found with exclusions applied"
        )
        print(verdict)

        print("\n=== 4. Testing Region With Complex Exclusion Intersection ===")
        # Overlaps the side exclusion only partially.
        complex_region = page.create_region(
            page.width * 0.1,
            page.height * 0.3,
            page.width * 0.5,
            page.height * 0.7,
        )
        complex_region.highlight(label="Complex Region", color=(0, 1, 1, 0.3))
        kept, everything = extract_both(complex_region)
        print("Complex region text length:")
        print(f" - With exclusions: {len(kept)} chars")
        print(f" - Without exclusions: {len(everything)} chars")
        print(f" - Exclusions removed {len(everything) - len(kept)} chars")

        # Save one image showing every region and exclusion highlighted.
        print("\n=== 5. Saving Visual Test Image ===")
        script_dir = os.path.dirname(__file__)
        output_dir = os.path.abspath(os.path.join(script_dir, '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, "region_exclusion_test.png")
        page.save_image(output_file, labels=True)
        print(f"Saved test visualization to: {output_file}")
|
127
|
+
|
128
|
+
|
129
|
+
def main():
    """Main entry point.

    Uses the first command-line argument as the PDF path. Without one, it
    falls back to the alphabetically first ``*.pdf`` in the ``pdfs``
    directory next to this script's parent directory. If no PDF is found
    there, it prints a message and exits with status 1.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Look for any PDF in the pdfs directory
        pdfs_dir = Path(__file__).parent.parent / "pdfs"
        # Path.glob yields entries in no guaranteed order; sort so the
        # default input is the same on every run and platform.
        pdf_files = sorted(pdfs_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(pdf_files[0])

    test_region_with_exclusions(pdf_path)


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,109 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the region.expand() method in Natural PDF.
|
3
|
+
|
4
|
+
This example shows how to expand or shrink regions in various ways.
|
5
|
+
"""
|
6
|
+
import os
|
7
|
+
import sys
|
8
|
+
|
9
|
+
# Add the parent directory to the path so we can import natural_pdf module
|
10
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
def main():
    """Demonstrate Region.expand() on the first page of a PDF.

    The PDF path is the first command-line argument, or a bundled sample
    path (relative to the current working directory) when none is given.
    Prints the bounding boxes produced by the different expansion styles
    and writes ``region_expand_highlight.png`` to the working directory.
    """
    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = "pdfs/Atlanta_Public_Schools_GA_sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    # Use the PDF as a context manager so the file is released on every
    # exit path, including the early return below.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Example 1: Basic expansion in different directions
        print("\n1. Basic region expansion")

        # Find a text element to start with
        text = page.find('text')
        if not text:
            print("No text found on page")
            return

        # Create a region from the text element (its bounding box)
        region = page.create_region(text.x0, text.top, text.x1, text.bottom)
        print(f"Original region: {region.bbox}")

        # Expand the region in different directions
        expanded_right = region.expand(right=50)
        print(f"Expanded right by 50: {expanded_right.bbox}")

        expanded_all = region.expand(left=10, right=20, top_expand=15, bottom_expand=25)
        print(f"Expanded in all directions: {expanded_all.bbox}")

        # Shrink the region with negative values
        shrunk = region.expand(left=-5, right=-5, top_expand=-2, bottom_expand=-2)
        print(f"Shrunk with negative values: {shrunk.bbox}")

        # Example 2: Using expansion factors
        print("\n2. Expansion with factors")

        # Double the width
        double_width = region.expand(width_factor=2.0)
        print(f"Double width (width_factor=2.0): {double_width.bbox}")

        # Increase height by 50%
        taller = region.expand(height_factor=1.5)
        print(f"50% taller (height_factor=1.5): {taller.bbox}")

        # Both width and height factors
        bigger = region.expand(width_factor=1.5, height_factor=1.25)
        print(f"Wider and taller: {bigger.bbox}")

        # Example 3: Combining with spatial navigation
        print("\n3. Combining with spatial navigation")

        # Find a heading (assuming it's bold or larger text)
        heading = page.find('text[size>=12]')
        if heading:
            print(f"Found heading: '{heading.text}'")

            # Create a region below the heading and expand it
            content_region = heading.below(height=100, full_width=False)
            print(f"Region below heading: {content_region.bbox}")

            # Expand the region to include more content
            expanded_region = content_region.expand(right=100, bottom_expand=50)
            print(f"Expanded region: {expanded_region.bbox}")

            # Extract text from the expanded region. A separate name keeps
            # the `text` element found above from being shadowed.
            region_text = expanded_region.extract_text()
            print(f"Text in expanded region: {region_text[:100]}...")

        # Example 4: Visual demonstration with highlighting
        print("\n4. Visual demonstration with highlighting")

        # Choose a region to work with
        demo_region = page.create_region(100, 100, 300, 200)

        # Highlight the original region
        demo_region.highlight(color=(1, 0, 0), label="Original")

        # Highlight expanded versions with different colors
        demo_region.expand(left=20, right=20).highlight(color=(0, 1, 0), label="Wider")
        demo_region.expand(top_expand=20, bottom_expand=20).highlight(color=(0, 0, 1), label="Taller")
        demo_region.expand(width_factor=1.5, height_factor=1.5).highlight(color=(1, 0.5, 0), label="1.5x Larger")

        # Save the highlighted page
        highlight_path = "region_expand_highlight.png"
        page.to_image(path=highlight_path, show_labels=True)
        print(f"Highlighted regions saved to {highlight_path}")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,116 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the new region.to_image() and region.save_image() functionality.
|
3
|
+
|
4
|
+
This example shows how to:
|
5
|
+
1. Create regions in various ways
|
6
|
+
2. Generate images of just the region
|
7
|
+
3. Save region images to files
|
8
|
+
4. Compare different rendering options
|
9
|
+
"""
|
10
|
+
|
11
|
+
import os
|
12
|
+
import sys
|
13
|
+
import argparse
|
14
|
+
|
15
|
+
# Add parent directory to path to run without installing
|
16
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
17
|
+
|
18
|
+
from natural_pdf import PDF
|
19
|
+
|
20
|
+
def main():
    """Render region images from the first page of a PDF into ``output/``.

    The PDF path is an optional positional command-line argument with a
    bundled sample as the default. Three kinds of region are built: one
    below a title element, one centred on the page, and those returned by
    layout analysis. Each is saved with different rendering options.
    """
    parser = argparse.ArgumentParser(description="Region Image Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    args = parser.parse_args()

    print(f"Opening PDF: {args.pdf_path}")

    # Open the PDF as a context manager so it is closed on every exit path.
    with PDF(args.pdf_path) as pdf:
        page = pdf.pages[0]

        # Create output directory
        os.makedirs("output", exist_ok=True)

        # Method 1: Find a text element and create a region below it
        print("Creating regions...")
        title = page.find('text:bold')
        if not title:
            title = page.find('text')
        if not title:
            # With no text on the page there is nothing to anchor the
            # first region to; stop here rather than crash on `.below`.
            print("No text found on page")
            return

        region_below = title.below(height=100, width="element")

        # Method 2: Create a region from a specific part of the page
        page_width, page_height = page.width, page.height
        center_region = page.create_region(
            page_width / 4,       # Left edge at one quarter of the width
            page_height / 4,      # Top edge at one quarter of the height
            page_width * 3 / 4,   # Right edge at three quarters
            page_height * 3 / 4,  # Bottom edge at three quarters
        )

        # Method 3: Use layout detection to find regions
        page.analyze_layout(confidence=0.3)
        layout_regions = page.find_all('region')

        # Generate and save images for each region
        print("Generating region images...")

        # Example 1: Basic region image with default settings
        region_below.save_image("output/region_below.png")
        print("Saved basic region image to output/region_below.png")

        # Example 2: Region image with highlighted content
        # First highlight some elements in the region
        elements = region_below.find_all('text')
        if elements:
            elements[0].highlight(color=(1, 0, 0, 0.3), label="First Element")

        # Save with highlights included
        region_below.save_image(
            "output/region_with_highlights.png",
            include_highlights=True
        )
        print("Saved region with highlights to output/region_with_highlights.png")

        # Save without highlights
        region_below.save_image(
            "output/region_without_highlights.png",
            include_highlights=False
        )
        print("Saved region without highlights to output/region_without_highlights.png")

        # Example 3: Region image with and without border
        center_region.save_image(
            "output/center_region_with_border.png"
        )
        print("Saved center region with border to output/center_region_with_border.png")

        # crop_only=True is the option this example uses for the
        # border-less variant.
        center_region.save_image(
            "output/center_region_without_border.png",
            crop_only=True
        )
        print("Saved center region without border to output/center_region_without_border.png")

        # Example 4: High-resolution region image
        if layout_regions:
            first_layout = layout_regions[0]
            first_layout.highlight(label=f"Region Type: {first_layout.region_type}")

            # Save at different resolutions
            first_layout.save_image(
                "output/layout_region_low_res.png",
                resolution=72
            )
            print("Saved layout region at 72 DPI to output/layout_region_low_res.png")

            first_layout.save_image(
                "output/layout_region_high_res.png",
                resolution=300
            )
            print("Saved layout region at 300 DPI to output/layout_region_high_res.png")

        print("\nDone! Check the output directory for the generated images.")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,119 @@
|
|
1
|
+
"""
|
2
|
+
Test to identify and fix issues with region-specific OCR.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
from PIL import Image, ImageDraw
|
12
|
+
|
13
|
+
def test_region_ocr():
    """Test OCR applied to specific regions.

    Looks for one of two sample PDFs in the ``pdfs`` directory next to this
    script's parent directory and returns early when neither exists.
    Otherwise it builds a region centred on the first page, prints the
    region's text with and without OCR, and writes four debug images to the
    ``output`` directory.
    """
    base_dir = os.path.dirname(__file__)

    # Use a PDF that may work well with OCR
    pdf_path = os.path.abspath(os.path.join(
        base_dir, '..', 'pdfs', 'Nigeria 2021_MICS_SFR_English.pdf'))

    if not os.path.exists(pdf_path):
        # Fall back to another PDF
        pdf_path = os.path.abspath(os.path.join(
            base_dir, '..', 'pdfs', '0500000US42001.pdf'))

    if not os.path.exists(pdf_path):
        print("No suitable PDF file found for region OCR testing.")
        return

    print(f"Testing with PDF: {pdf_path}")

    # Output directory
    output_dir = os.path.abspath(os.path.join(base_dir, '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    with PDF(pdf_path) as pdf:
        # Get the first page
        page = pdf.pages[0]

        # Save the entire page image for reference
        page.to_image(path=os.path.join(output_dir, "region_ocr_full_page.png"))

        # Create a region in the middle of the page
        half_width = page.width / 2
        half_height = page.height / 2
        region_width = page.width / 3
        region_height = page.height / 3

        region = page.create_region(
            half_width - region_width / 2,
            half_height - region_height / 2,
            half_width + region_width / 2,
            half_height + region_height / 2
        )

        # Highlight the region
        region.highlight(label="OCR Test Region")
        page.to_image(path=os.path.join(output_dir, "region_ocr_highlighted.png"), show_labels=True)

        # Extract text from the region with and without OCR
        text_no_ocr = region.extract_text()
        text_with_ocr = region.extract_text(ocr=True)

        # Print results
        print("\nRegion Text WITHOUT OCR:")
        print("-" * 40)
        print(text_no_ocr)

        print("\nRegion Text WITH OCR:")
        print("-" * 40)
        print(text_with_ocr)

        # Apply OCR to the region and visualize the results
        ocr_elements = region.apply_ocr(enabled=True)

        print(f"\nFound {len(ocr_elements)} OCR elements in the region")

        # Get the page image the region will be cropped from
        page_img = page.to_image()

        # Region and element coordinates are in page units, while crop()
        # and ImageDraw work in pixels. Derive the factor from the actual
        # image size so the boxes line up at whatever scale to_image()
        # renders. At scale 1 this is the identity.
        # NOTE(review): assumes to_image() renders the whole page - confirm.
        scale_x = page_img.width / page.width
        scale_y = page_img.height / page.height

        def to_pixels(x0, top, x1, bottom):
            return (x0 * scale_x, top * scale_y, x1 * scale_x, bottom * scale_y)

        region_box = to_pixels(region.x0, region.top, region.x1, region.bottom)
        region_img = page_img.crop(region_box)

        # Save region image for reference
        region_img.save(os.path.join(output_dir, "region_ocr_cropped.png"))

        # Create debug image showing OCR bounding boxes
        debug_img = page.to_image()
        draw = ImageDraw.Draw(debug_img)

        # Draw region rectangle
        draw.rectangle(region_box, outline=(255, 0, 0), width=3)

        # Draw OCR element bounding boxes
        for elem in ocr_elements:
            elem_box = to_pixels(elem.x0, elem.top, elem.x1, elem.bottom)
            draw.rectangle(elem_box, outline=(0, 255, 0), width=2)

            # Draw the first characters of the text just above the box;
            # tolerate elements whose text is missing.
            draw.text(
                (elem_box[0], elem_box[1] - 10),
                (elem.text or "")[:10],
                fill=(0, 0, 255)
            )

        # Save debug image
        debug_img.save(os.path.join(output_dir, "region_ocr_debug.png"))

        print(f"\nCreated debug images in: {output_dir}")
        print("- region_ocr_full_page.png: Original page")
        print("- region_ocr_highlighted.png: Page with region highlighted")
        print("- region_ocr_cropped.png: Cropped region image")
        print("- region_ocr_debug.png: Page with OCR text bounding boxes")


if __name__ == "__main__":
    test_region_ocr()
|
@@ -0,0 +1,115 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the get_sections() method on regions in Natural PDF.
|
3
|
+
|
4
|
+
This example shows how to extract logical sections from regions
|
5
|
+
using various types of boundary elements.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Add the parent directory to the path so we can import natural_pdf module
|
11
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
12
|
+
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
def main():
    """Demonstrate Region.get_sections() on the first page of a PDF.

    The PDF path is the first command-line argument, or
    ``examples/sample.pdf`` (relative to the current working directory)
    when none is given. Prints the sections found in the top half, the
    bottom half and an inset region of the page.
    """

    def snippet(text, limit=50):
        # Shorten long section text for display.
        return text[:limit] + "..." if len(text) > limit else text

    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Use a default PDF path - you'll need to replace this with an actual PDF path
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    # Use the PDF as a context manager so the file is closed when done.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Example 1: Get sections within a region using separators
        print("\n1. Get sections within a region using separators")

        # First, create a region from the top half of the page
        top_half = page.create_region(0, 0, page.width, page.height / 2)
        print(f"Created region: {top_half.bbox}")

        # Method 1: Find elements first, then pass them to get_sections
        lines = top_half.find_all('line')
        print(f"Found {len(lines)} line elements in the region")

        # Extract sections using lines as start elements
        sections1 = top_half.get_sections(start_elements=lines)
        print(f"Found {len(sections1)} sections using explicit elements")

        # Method 2: Pass selector directly to start_elements
        sections2 = top_half.get_sections(start_elements='line')
        print(f"Found {len(sections2)} sections using selector string")

        # Display section details
        for i, section in enumerate(sections2):
            print(f" Section {i+1}: {section.bbox}, Text: {snippet(section.extract_text())}")

        # Example 2: Get sections within a region using start/end elements
        print("\n2. Get sections within a region using start/end elements")

        # Create a region from the bottom half of the page
        bottom_half = page.create_region(0, page.height / 2, page.width, page.height)
        print(f"Created region: {bottom_half.bbox}")

        # Method 1: Find heading elements first, then pass them to get_sections (old way)
        headings = bottom_half.find_all('text[size>=12]')
        print(f"Found {len(headings)} potential headings in the region")

        # Use headings as start elements and extract sections (old way)
        sections1 = bottom_half.get_sections(start_elements=headings)
        print(f"Found {len(sections1)} sections using explicit elements")

        # Method 2: Pass selector directly to start_elements (new way)
        sections2 = bottom_half.get_sections(start_elements='text[size>=12]')
        print(f"Found {len(sections2)} sections using selector string")

        # Display section details
        for i, section in enumerate(sections2):
            start_element = section.start_element
            start_text = start_element.text if start_element else "None"

            print(f" Section {i+1} (starts with '{start_text}'): {snippet(section.extract_text())}")

        # Example 3: Use selectors within a region
        print("\n3. Get sections using selectors within a region")

        # Create a region inset 50 units from every page edge
        center = page.create_region(50, 50, page.width - 50, page.height - 50)

        # Get sections with start elements
        sections1 = center.get_sections(
            start_elements='text[size>=12]'  # Large text as section starts
        )

        # Get sections with both start and end elements
        sections2 = center.get_sections(
            start_elements='text[size>=12]',  # Large text as section starts
            end_elements='line[width>=1]'  # Thick lines as section ends
        )

        print(f"Found {len(sections1)} sections using traditional selectors")
        print(f"Found {len(sections2)} sections using direct selector strings")

        # Compare the section counts. The second call also passes
        # end_elements, so a mismatch here is not necessarily an error.
        print(f"Both approaches match: {len(sections1) == len(sections2)}")

        # Display section details for the start+end variant
        for i, section in enumerate(sections2):
            print(f" Section {i+1}: {snippet(section.extract_text())}")


if __name__ == "__main__":
    main()
|
examples/school_books.py
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating section extraction with the get_sections method.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
def _value_region(section, label_selector, height, right):
    """Return the region just below a label element, widened to the right.

    Returns None when ``label_selector`` matches nothing in ``section``, so
    one book entry with a missing label does not abort the whole run.
    """
    label = section.find(label_selector)
    if not label:
        return None
    return label.below(height=height, width="element").expand(right=right)


# Use the PDF as a context manager so the file is closed once the
# highlighted page has been written.
with PDF("./pdfs/Atlanta_Public_Schools_GA_sample.pdf") as pdf:

    page = pdf.pages[0]
    # Each thick horizontal line starts a new day section.
    day_sections = page.get_sections(start_elements='line[width>=2]')

    for day in day_sections:
        # The first text element of a day section is used as its date.
        first_text = day.find('text')
        date = first_text.text if first_text else "unknown"

        book_sections = day.get_sections(start_elements='text:contains("(Removed:")')
        for j, book in enumerate(book_sections):
            print("-----")
            # Sections shorter than 30 units are treated as non-book noise.
            if book.height < 30:
                print("Not a book, skipping")
                continue
            book.highlight(label=f"Day {date} section {j}")

            title = book.find_all('text[font_variant="AAAAAB"][size>=10]')
            title.highlight(label='Title')

            price = _value_region(book, 'text:contains("Price")', height=15, right=30)
            if price is not None:
                price.highlight(label='Price')

            acquired = _value_region(book, 'text:contains("Acquired")', height=15, right=30)
            if acquired is not None:
                acquired.highlight(label='Acquired')

            removed_by = _value_region(book, 'text[size<10]:contains("Removed")', height=17, right=60)
            if removed_by is not None:
                removed_by.highlight(label='Removed By')

            # Fields whose label was not found are reported as None.
            data = {
                'Title': title.extract_text(),
                'Price': price.extract_text() if price is not None else None,
                'Acquired': acquired.extract_text() if acquired is not None else None,
                'Removed By': removed_by.extract_text() if removed_by is not None else None
            }
            print(data)

    # NOTE(review): confirm that Page.save() exists and accepts
    # show_labels; left unchanged here.
    page.save("highlight.png", show_labels=True)
|