natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
examples/__init__.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating how to use exclusion zones in Natural PDF.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
# Open the sample document; the path is relative to the current working directory.
pdf = PDF('pdfs/Atlanta_Public_Schools_GA_sample.pdf')

# Register two exclusion callbacks. Each one is handed an object on which it
# looks up line elements (presumably a page -- confirm against add_exclusion).
# The first uses the first 'line' match, the second the last one.
pdf.add_exclusion(lambda pg: pg.find('line').above())
pdf.add_exclusion(lambda pg: pg.find_all('line')[-1].below())

# Highlight every text element on the page at index 2 and write the image out.
page = pdf.pages[2]
text_elements = page.find_all('text')
text_elements.highlight()
page.save('test.png', labels=True)
|
20
|
+
|
examples/basic_usage.py
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
"""
|
2
|
+
Basic usage examples for natural-pdf.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def basic_example(pdf_path):
    """Print a numbered tour of natural-pdf features for one document.

    Opens ``pdf_path`` with ``reading_order=True`` and writes to stdout:
    plain and layout text extraction of the first page, element counts,
    attribute-based lookups, statute-code matching, a region query below a
    thick line, a "Critical" word search, and finally three illustrative
    snippets of the intended fluent API (printed as text, not executed).

    Args:
        pdf_path: Path of the PDF file to open.
    """
    # Imported locally, as the original body did; placed first so the
    # statute pattern is compiled once instead of re-matched from a string
    # inside nested loops.
    import re

    statute_pattern = re.compile(r'\d+\.\d+\.\d+')

    with PDF(pdf_path, reading_order=True) as pdf:
        print(f"PDF has {len(pdf)} pages")

        # Plain text extraction first, then the layout-preserving variant.
        print("\nBASIC TEXT EXTRACTION:")
        page_text = pdf.pages[0].extract_text()
        print(page_text[:500] + "...")

        print("\nWITH LAYOUT: ")
        page_text = pdf.pages[0].extract_text(layout=True)
        print(page_text[:2000] + "...")

        print("\nDEMONSTRATING NATURAL PDF FEATURES:")

        page = pdf.pages[0]

        # 1. Element counts.
        print("\n1. DOCUMENT STRUCTURE:")
        # Read the word list once; every scan below reuses it.
        words = page.words
        print(f" - {len(words)} words")
        print(f" - {len(page.lines)} lines")
        print(f" - {len(page.rects)} rectangles")

        # 2. Text length for the whole document versus the first page.
        print("\n2. EXTRACT TEXT FROM DOCUMENT:")
        print(f" Full document: {len(pdf.extract_text())} characters")
        print(f" First page: {len(page.extract_text())} characters")

        # 3. Attribute-based lookups.
        print("\n3. FIND ELEMENTS WITH SPECIFIC PROPERTIES:")

        thick_lines = pdf.find_all('line[width>=2]')
        if thick_lines:
            print(f" Found thick line: {thick_lines[0].bbox}")

        site_text = [w for w in words if w.text.startswith("Site:")]
        if site_text:
            print(f" Site info: {site_text[0].text}")

        # 4. A handful of sample words.
        print("\n4. SAMPLE WORDS:")
        for i, word in enumerate(words[:5]):
            print(f" - Word {i}: '{word.text}'")

        # 5. Words that start with a dotted three-part number.
        print("\n5. FIND STATUTE CODES:")
        statute_codes = [w.text for w in words if statute_pattern.match(w.text)]

        print(f" Found {len(statute_codes)} statute codes:")
        for code in statute_codes[:3]:
            print(f" - {code}")

        # 6. Region below the first thick line.
        print("\n6. SPATIAL RELATIONSHIPS WITH FLUENT API:")

        thick_line = pdf.find('line[width>=2]')
        if thick_line:
            print(f" Found thick line at y={thick_line.top}")

            # width="full" asks for the region to span the full page width.
            below_region = thick_line.below(height=50, width="full")
            region_text = below_region.extract_text(preserve_whitespace=True)
            print(f" Text from region below line: {region_text[:30]}...")

            # The region itself can be queried for the elements it contains.
            words_below = below_region.find_all('text')
            if words_below:
                print(f" Found {len(words_below)} text elements below the line")
                first_few = [w.text for w in words_below[:3]]
                print(f" First few words: {' '.join(first_few)}")

        # 7. Words containing "Critical" and what sits to their left.
        print("\n7. FIND CRITICAL VIOLATIONS:")

        critical_words = [w for w in words if "Critical" in w.text]

        if critical_words:
            print(f" Found {len(critical_words)} critical items")

            critical_statutes = []
            for critical in critical_words:
                # Words on (roughly) the same baseline, left of the marker.
                same_line_left = [
                    w for w in words
                    if abs(w.top - critical.top) < 5 and w.x0 < critical.x0
                ]

                if same_line_left:
                    # NOTE(review): this picks the LEFTMOST word on the line,
                    # matching the original sort-then-[0]; the original
                    # comment spoke of "the closest one" -- confirm intent.
                    leftmost = min(same_line_left, key=lambda w: w.x0)
                    print(f" - {leftmost.text}")

                # First statute-looking word on that line, in page order.
                statute = next(
                    (w.text for w in same_line_left if statute_pattern.match(w.text)),
                    None,
                )
                if statute is not None:
                    critical_statutes.append(statute)

            if critical_statutes:
                print(f" Critical violations for statutes: {', '.join(critical_statutes)}")

        # 8. Illustrative snippets only: these are printed, never executed.
        print("\n8. FLUENT API EXAMPLES (HOW THE LIBRARY IS INTENDED TO BE USED):")

        print(" Example 1: Find thick lines and extract text below them")
        print(" ```python")
        print(" thick_line = pdf.find('line[width>=2]')")
        print(" text_below = thick_line.below(height=50, width='full').find_all('text')")
        print(" for text in text_below[:3]:")
        print(" print(text.text)")
        print(" ```")

        print("\n Example 2: Find critical violations and their codes")
        print(" ```python")
        print(" critical_items = pdf.find_all('text:contains(\"Critical\")')")
        print(" for item in critical_items:")
        print(" # Find codes on the same line")
        print(" codes = pdf.find_all(f'text:matches(\"\\d+\\.\\d+\\.\\d+\")[top~={item.top}]')")
        print(" if codes:")
        print(" print(f\"Critical violation: {codes[0].text}\")")
        print(" ```")

        print("\n Example 3: Extract a table")
        print(" ```python")
        print(" # Find the table header")
        print(" header = pdf.find('text:contains(\"Statute\")')")
        print(" # Select the entire table region")
        print(" table_region = header.until('text:contains(\"Jungle Health\")')")
        print(" # Extract the table as data")
        print(" table_data = table_region.extract_tables()[0]")
        print(" ```")
|
173
|
+
|
174
|
+
if __name__ == "__main__":
    # An explicit command-line path wins; otherwise fall back to
    # ../pdfs/01-practice.pdf relative to this file's directory.
    if len(sys.argv) >= 2:
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        examples_dir = os.path.dirname(__file__)
        pdf_path = os.path.abspath(
            os.path.join(examples_dir, '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python basic_usage.py [path/to/file.pdf]")
            sys.exit(1)

    basic_example(pdf_path)
|
@@ -0,0 +1,137 @@
|
|
1
|
+
"""
|
2
|
+
Test for boundary element exclusion with real PDFs.
|
3
|
+
This test focuses on the boundary_inclusion parameter of get_sections.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import os
|
7
|
+
import sys
|
8
|
+
from natural_pdf import PDF
|
9
|
+
|
10
|
+
def main():
    """Check how ``boundary_inclusion`` affects sections on a PDF's first page.

    The PDF path is taken from ``sys.argv[1]`` when given; otherwise the
    alphabetically first ``*.pdf`` in the sibling ``pdfs`` directory is used.
    Candidate boundary elements are picked from the first page, sections are
    built with ``boundary_inclusion`` set to 'none', 'start' and 'both', and a
    PASS/FAIL summary is printed. Exits with status 1 when no usable PDF is
    found.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"Error: File {pdf_path} not found")
            sys.exit(1)
    else:
        pdfs_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "pdfs"
        )
        # os.listdir() returns names in arbitrary order; sort so the default
        # document is the same on every run and platform.
        pdf_files = sorted(f for f in os.listdir(pdfs_dir) if f.endswith('.pdf'))
        if not pdf_files:
            print("No PDF files found in the pdfs directory")
            sys.exit(1)

        pdf_path = os.path.join(pdfs_dir, pdf_files[0])

    print(f"Using PDF: {pdf_path}")

    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Try progressively looser selectors until at least three candidates
    # turn up; as a last resort take the first five text elements.
    for selector in ('text[size>=14]', 'text[size>=12]', 'text:bold'):
        headings = page.find_all(selector)
        if len(headings) >= 3:
            break
    else:
        headings = page.find_all('text')[:5]

    print(f"Found {len(headings)} potential section boundaries")
    for i, h in enumerate(headings[:5]):
        print(f"Boundary {i+1}: {h.text}")

    # One section list per boundary_inclusion mode, built in a fixed order.
    modes = (('None', 'none'), ('Start', 'start'), ('Both', 'both'))
    sections_by_mode = {
        mode: page.get_sections(start_elements=headings, boundary_inclusion=mode)
        for _, mode in modes
    }
    none_sections = sections_by_mode['none']
    start_sections = sections_by_mode['start']
    both_sections = sections_by_mode['both']

    print("\nTesting boundary element inclusion/exclusion:")

    # For the first three sections of each mode, report whether the i-th
    # boundary element lies inside the i-th section.
    for label, mode in modes:
        for i, section in enumerate(sections_by_mode[mode][:3]):
            if i >= len(headings):
                break
            found = section._is_element_in_region(headings[i])
            print(f"{label} Section {i+1}: Contains boundary element: {found}")

    print("\nElement counts in sections:")
    for label, mode in modes:
        for i, section in enumerate(sections_by_mode[mode][:3]):
            elements = section.get_elements()
            print(f"{label} Section {i+1}: {len(elements)} elements")

    def includes_start_element(sections):
        """Return True if every non-empty section among the first three
        either has no start element or contains it."""
        return all(
            (section.start_element is None)
            or section._is_element_in_region(section.start_element)
            for section in sections[:3]
            if section.get_elements()  # Skip empty sections
        )

    # 'none' must keep each boundary element out of its own section.
    none_correct = all(
        not section._is_element_in_region(headings[i])
        for i, section in enumerate(none_sections[:3])
        if i < len(headings)
    )
    start_correct = includes_start_element(start_sections)
    both_correct = includes_start_element(both_sections)

    print("\nTest Results Summary:")
    print(f"- 'none' excludes boundary elements: {'PASS' if none_correct else 'FAIL'}")
    print(f"- 'start' includes boundary elements: {'PASS' if start_correct else 'FAIL'}")
    print(f"- 'both' includes boundary elements: {'PASS' if both_correct else 'FAIL'}")

    if none_correct and start_correct and both_correct:
        print("\n✅ All tests PASSED!")
    else:
        print("\n❌ Some tests FAILED!")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,157 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the fixed boundary inclusion behavior in the get_sections method.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from natural_pdf import PDF
|
8
|
+
|
9
|
+
def main():
    """Report ``get_sections`` boundary-inclusion behavior on one page.

    The PDF path is taken from ``sys.argv[1]`` when given; otherwise
    ``pdfs/2019 Statistics.pdf`` next to the examples directory is used.
    Sections are built from first-page text elements matching
    ``text[size>=12]`` with ``boundary_inclusion`` set to 'none', 'start'
    and 'both'; the first three sections of each are described and a
    Success/Failure summary is printed. An image of all highlighted elements
    is written to ``output/all_elements.png``. Exits with status 1 when the
    PDF is missing or fewer than three headings are found.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"Error: File {pdf_path} not found")
            sys.exit(1)
    else:
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        pdf_path = os.path.join(parent_dir, "pdfs", "2019 Statistics.pdf")
        if not os.path.exists(pdf_path):
            print(f"Error: Default file {pdf_path} not found")
            sys.exit(1)

    pdf = PDF(pdf_path)
    page = pdf.pages[0]  # Use the first page

    headings = page.find_all('text[size>=12]')

    if len(headings) < 3:
        print(f"Not enough headings found on the first page. Found: {len(headings)}")
        sys.exit(1)

    print(f"Found {len(headings)} headings")
    for i, heading in enumerate(headings[:5]):  # Show first 5 headings
        print(f"Heading {i+1}: {heading.text}")

    sections_none = page.get_sections(
        start_elements=headings,
        boundary_inclusion='none'
    )
    sections_start = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start'
    )
    sections_both = page.get_sections(
        start_elements=headings,
        boundary_inclusion='both'
    )

    def first_text(elements):
        """Text of the first element, or its str() when it has no ``text``."""
        first = elements[0]
        return first.text if hasattr(first, 'text') else str(first)

    def report_start_sections(mode, sections):
        """Describe the first three sections built with ``mode`` inclusion."""
        print(f"\n=== Sections with boundary_inclusion='{mode}' ===")
        for i, section in enumerate(sections[:3]):
            elements = section.get_elements()
            if not elements:
                print(f"Section {i+1} is empty (has no elements)")
                continue

            # The i-th heading is the one expected to open the i-th section.
            original_heading = headings[i] if i < len(headings) else None
            heading_found = False
            if original_heading:
                heading_found = section._is_element_in_region(original_heading)

            start_text = section.start_element.text if section.start_element else 'None'
            print(f"Section {i+1} contains start heading: {heading_found}")
            print(f" Start element: {start_text}")
            print(f" Element count: {len(elements)}")
            print(f" First element: {first_text(elements)}")

    def count_non_empty(sections):
        """Number of sections that contain at least one element."""
        return len([s for s in sections if s.get_elements()])

    def starts_included(sections):
        """True if each of the first three sections with a start element
        took it from ``headings`` and contains it."""
        return all(
            s.start_element in headings and s._is_element_in_region(s.start_element)
            for s in sections[:3]
            if s.start_element
        )

    print("\nTesting if headings are correctly included/excluded:")

    print("\n=== Sections with boundary_inclusion='none' ===")
    for i, section in enumerate(sections_none[:3]):  # Check first 3 sections
        elements = section.get_elements()
        if not elements:
            print(f"Section {i+1} is empty (has no elements)")
            continue

        first_element_text = first_text(elements)
        # With 'none', no heading at all should fall inside the section.
        heading_found = any(section._is_element_in_region(h) for h in headings)

        print(f"Section {i+1} contains heading: {heading_found}")
        print(f" First element: {first_element_text}")
        print(f" Element count: {len(elements)}")

    report_start_sections('start', sections_start)
    report_start_sections('both', sections_both)

    # Save an image for visual verification. Whether save_image creates a
    # missing target directory is not visible here, so make sure it exists
    # rather than depend on it.
    page.highlight_all()
    os.makedirs("output", exist_ok=True)
    page.save_image("output/all_elements.png")

    print("\nResults of the test:")
    print(f"- 'none' inclusion: Sections have {count_non_empty(sections_none)} non-empty out of {len(sections_none)} total")
    print(f"- 'start' inclusion: Sections have {count_non_empty(sections_start)} non-empty out of {len(sections_start)} total")
    print(f"- 'both' inclusion: Sections have {count_non_empty(sections_both)} non-empty out of {len(sections_both)} total")

    # Success criteria checked below:
    # 1. 'none': each of the first three sections is empty or holds no heading.
    # 2. 'start' / 'both': each section's start element is a heading and lies
    #    inside the section (end headings are not checked here).
    none_success = all(
        len(s.get_elements()) == 0
        or not any(s._is_element_in_region(h) for h in headings)
        for s in sections_none[:3]
    )
    start_success = starts_included(sections_start)
    both_success = starts_included(sections_both)

    print("\nTest Results:")
    print(f"- 'none' excludes headings: {'Success' if none_success else 'Failure'}")
    print(f"- 'start' includes start headings: {'Success' if start_success else 'Failure'}")
    print(f"- 'both' includes start headings: {'Success' if both_success else 'Failure'}")

    if none_success and start_success and both_success:
        print("\n✅ Fix was successful!")
    else:
        print("\n❌ Fix needs more work.")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,70 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the chainable analyze_layout method.
|
3
|
+
|
4
|
+
This example shows how to use the chainable analyze_layout method
|
5
|
+
to create more concise code by chaining method calls together.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import argparse
|
10
|
+
|
11
|
+
# Add the parent directory to the Python path
|
12
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
# Get the current directory of this script
|
16
|
+
script_dir = os.path.dirname(os.path.realpath(__file__))
# The project root is the parent of the script directory.
root_dir = os.path.dirname(script_dir)
# PDF used when no path is given on the command line.
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Command-line interface: optional PDF path plus page / confidence / output.
parser = argparse.ArgumentParser(description="Chainable layout analysis example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
args = parser.parse_args()

print(f"Analyzing PDF: {args.pdf_path}")
print(f"Page: {args.page}")
print(f"Confidence threshold: {args.conf}")

# Load the document and select the requested page.
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

print("Running document layout analysis with method chaining...")

# Example 1: analyze the layout, then highlight everything in one chain.
(
    page.analyze_layout(confidence=args.conf)
    .highlight_all(include_layout_regions=True)
)

print(f"Found {len(page.detected_layout_regions)} regions with confidence >= {args.conf}")

# Example 2: write a labelled, highlighted image. A falsy --output falls back
# to output/chainable_layout.png under the project root.
if args.output:
    output_path = args.output
else:
    output_path = os.path.join(root_dir, "output", "chainable_layout.png")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Full sequence in one chain: clear, analyze, highlight, render to file.
(
    page.clear_highlights()
    .analyze_layout(model="yolo", confidence=args.conf)
    .highlight_all(include_layout_regions=True)
    .to_image(path=output_path, show_labels=True)
)

print(f"Saved highlighted image to {output_path}")

# Example 3: highlight only title regions, when any were detected.
if page.find_all('region[type=title]'):
    result_path = os.path.join(os.path.dirname(output_path), "titles_only.png")

    (
        page.clear_highlights()
        .analyze_layout(confidence=args.conf)
        .find_all('region[type=title]')
        .highlight(label="Document Titles", color=(1, 0, 0, 0.4))
    )

    page.to_image(path=result_path, show_labels=True)
    print(f"Saved titles-only highlighted image to {result_path}")

print("Done!")
|
@@ -0,0 +1,49 @@
|
|
1
|
+
"""
|
2
|
+
Simple test for color conversion.
|
3
|
+
"""
|
4
|
+
|
5
|
+
# Test color conversion
|
6
|
+
def normalize_color(color):
    """Normalize a color tuple to RGBA-style components.

    Each component is converted independently:

    * a ``float`` less than or equal to 1.0 is scaled by 255 and truncated
      with ``int()`` (so ``0.5`` becomes ``127``);
    * a ``float`` greater than 1.0 is truncated with ``int()``;
    * any non-float component is passed through unchanged.

    The same rule applies to the alpha component. When the input has exactly
    three components, an alpha of ``100`` is appended.

    Args:
        color: Tuple of color components. Anything that is not a ``tuple``
            (including a ``list``) is treated as invalid.

    Returns:
        A tuple of converted components, or ``(255, 255, 0, 100)`` (yellow,
        semi-transparent) when ``color`` is not a tuple.
    """
    if not isinstance(color, tuple):
        # Default if an invalid color is provided.
        return (255, 255, 0, 100)

    def to_channel(value):
        """Convert one component according to the rules above."""
        if not isinstance(value, float):
            return value
        if value <= 1.0:
            return int(value * 255)
        return int(value)

    channels = [to_channel(component) for component in color]

    # Only a plain RGB triple gets the default alpha.
    if len(channels) == 3:
        channels.append(100)

    return tuple(channels)
|
30
|
+
|
31
|
+
# Banner for the manual test run.
print("Testing color conversion:")
print("-----------------------")

# (input color, description) pairs: integer and float components, with and
# without an alpha component, plus mixed and out-of-unit-range floats.
test_cases = [
    ((255, 0, 0, 128), "Integer RGB with alpha"),
    ((255, 0, 0), "Integer RGB without alpha"),
    ((0.0, 1.0, 0.0, 0.5), "Float RGB with alpha (0-1)"),
    ((0.0, 1.0, 0.0), "Float RGB without alpha (0-1)"),
    ((0.5, 0.5, 255, 0.7), "Mixed float and integer"),
    ((0.5, 0.5, 255), "Mixed without alpha"),
    ((128.5, 64.3, 200.7, 50.9), "Float values > 1"),
]

# Print each conversion; nothing is asserted, the output is read by eye.
for case in test_cases:
    color, desc = case
    result = normalize_color(color)
    print(f"{desc}: {color} -> {result}")

print("\nAll tests completed!")
|