natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating section extraction with the get_sections method.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
pdf = PDF("./pdfs/Atlanta_Public_Schools_GA_sample.pdf")

# Every thick rule (line width >= 2) starts a new per-day section.
day_sections = pdf.pages.get_sections(start_elements='line[width>=2]')


def _value_region(section, label_selector, height, right):
    """
    Return the region directly below a label inside ``section``.

    The region is ``height`` points tall, as wide as the label, then expanded
    ``right`` points to the right. Returns None when the label is not found.
    NOTE(review): assumes ``find`` returns None on no match - confirm.
    """
    label = section.find(label_selector)
    if label is None:
        return None
    return label.below(height=height, width="element").expand(right=right)


rows = []
for day in day_sections:
    # The first text element of a day section is used as its date.
    first_text = day.find('text')
    if first_text is None:
        print("No text in section, skipping")
        continue
    date = first_text.text
    book_sections = day.get_sections(start_elements='text:contains("(Removed:")')

    for book in book_sections:
        if book.height < 30:
            print("Not a book, skipping")
            continue

        # Bold big text is the title
        title = book.find_all('text[font_variant="AAAAAB"][size>=10]')
        price = _value_region(book, 'text:contains("Price")', height=15, right=30)
        acquired = _value_region(book, 'text:contains("Acquired")', height=15, right=30)
        removed_by = _value_region(book, 'text[size<10]:contains("Removed")', height=17, right=60)
        if price is None or acquired is None or removed_by is None:
            # Without all three labels the section is not a complete book entry.
            print("Missing a field label, skipping")
            continue

        # Extract the title once; it is both the highlight label and a row value.
        title_text = title.extract_text()

        # Highlight them
        book.highlight(label=title_text)
        title.highlight(label='title')
        price.highlight(label='price')
        acquired.highlight(label='acquired')
        removed_by.highlight(label='removed')

        # Save them (the day's date is stored so each row is self-contained)
        rows.append({
            'Date': date,
            'Title': title_text,
            'Price': price.extract_text(),
            'Acquired': acquired.extract_text(),
            'Removed By': removed_by.extract_text(),
        })

# Save up to the first three pages; shorter PDFs no longer raise IndexError.
for index in range(min(3, len(pdf.pages))):
    pdf.pages[index].save(f"highlight-{index + 1}.png", show_labels=True)
|
examples/scouring.py
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating a PDF-wide footer exclusion and chained below()/find() calls to highlight a table area.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
pdf = PDF("./pdfs/Nigeria 2021_MICS_SFR_English.pdf")


def _footer_region(page):
    """Return the lowest text element on ``page`` together with everything below it."""
    return page.find_all('text').lowest().below(include_element=True)


# Exclude "Page | 123" footer from all queries
pdf.add_exclusion(_footer_region)

# There's a bold header for 'EQ.4.1W' on a few of these pages
candidate_pages = pdf.pages[460:470]
header = candidate_pages.find('text:contains("EQ.4.1W"):bold')
header.highlight(label='table header')

# From the header, find the bold "Total" label below it, then take everything
# from that label (inclusive) down to, but not including, the "MICS" text.
below_header = header.below()
total_label = below_header.find('text:contains("Total"):bold')
table_area = total_label.below(
    until='text:contains("MICS")',
    include_element=True,
    include_until=False,
)
table_area.highlight(label='table area')

header.page.to_image(path="output.png", show_labels=True)
|
@@ -0,0 +1,232 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating section extraction with the get_sections method.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def _highlight_and_save_sections(page, sections, image_path):
    """Highlight every section with a cycling color and save the page image to ``image_path``."""
    page.clear_highlights()
    for number, section in enumerate(sections, start=1):
        section.highlight(label=f"Section {number}", use_color_cycling=True)
    page.to_image(path=str(image_path), show_labels=True)


def example_sections_between_headings(pdf_path):
    """
    Example showing how to extract sections between headings.

    Opens the first page of ``pdf_path``, treats every text element with
    size >= 12 as a heading and calls ``page.get_sections`` twice, once with
    ``y_threshold=0`` and once with ``y_threshold=5``. Overview images and one
    image per section are written to a ``section_output`` directory next to
    this script.

    Args:
        pdf_path: Path of the PDF to open.
    """
    print("\n=== SECTIONS BETWEEN HEADINGS ===")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Exclude the "November" text and everything above it when it exists.
    # main() may hand us any PDF, so the text is not guaranteed to be present.
    # NOTE(review): assumes find() returns None on no match - confirm.
    banner = page.find('text:contains("November")')
    if banner is not None:
        page.add_exclusion(banner.above(include_element=True))

    # Create an output directory
    output_dir = Path(__file__).parent / "section_output"
    output_dir.mkdir(exist_ok=True)

    # Find all the headings on the page
    headings = page.find_all('text[size>=12]')
    print(f"Found {len(headings)} heading elements")

    # Create a highlighted visualization to see what we found
    page.clear_highlights()
    headings.highlight(label="Headings", color=(255, 100, 0, 100))
    page.to_image(path=str(output_dir / "headings.png"), show_labels=True)

    # First try without line grouping
    print("Extracting sections WITHOUT line grouping:")
    sections_no_grouping = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start',
        y_threshold=0,  # Disable line grouping
    )
    print(f"Found {len(sections_no_grouping)} sections without line grouping")

    # Now with line grouping
    print("\nExtracting sections WITH line grouping:")
    sections = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start',  # Include heading with its section
        y_threshold=5,  # Group elements within 5 points vertically
    )
    print(f"Found {len(sections)} sections with line grouping")

    # Create visualizations showing the difference
    _highlight_and_save_sections(page, sections_no_grouping, output_dir / "sections_no_grouping.png")
    _highlight_and_save_sections(page, sections, output_dir / "sections_with_grouping.png")

    # Process each section from the grouped version
    for number, section in enumerate(sections, start=1):
        # A boundary element may be missing or None; treat both the same way.
        start_element = getattr(section, 'start_element', None)
        end_element = getattr(section, 'end_element', None)

        heading_text = start_element.extract_text() if start_element else "No heading"

        # Get section content (limited to first 50 chars for display)
        content = section.extract_text()
        content_preview = content[:50] + "..." if len(content) > 50 else content

        print(f"Section {number}: '{heading_text}'")
        print(f" Content: {content_preview}")

        # Create visualization
        page.clear_highlights()
        section.highlight(label=f"Section {number}")
        if start_element:
            start_element.highlight(label="Heading", color=(255, 0, 0, 100))
        if end_element:
            end_element.highlight(label="End", color=(0, 0, 255, 100))

        page.to_image(path=str(output_dir / f"section_{number}.png"), show_labels=True)
|
90
|
+
|
91
|
+
|
92
|
+
def example_sections_with_separators(pdf_path):
    """
    Example showing how to extract sections with separators.

    Opens the first page of ``pdf_path``, uses every line with width >= 2 as
    a separator and calls ``page.get_sections`` once per ``boundary_inclusion``
    value ('none', 'start', 'end', 'both'). One image per value is written to
    a ``separator_output`` directory next to this script.

    Args:
        pdf_path: Path of the PDF to open.
    """
    print("\n=== SECTIONS WITH SEPARATORS ===")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Create an output directory
    output_dir = Path(__file__).parent / "separator_output"
    output_dir.mkdir(exist_ok=True)

    # Find all horizontal lines that could be separators
    separators = page.find_all('line[width>=2]')
    print(f"Found {len(separators)} separator lines")

    # Create a highlighted visualization to see what we found
    page.clear_highlights()
    separators.highlight(label="Separators", color=(0, 0, 255, 100))
    page.to_image(path=str(output_dir / "separators.png"), show_labels=True)

    # Try different boundary inclusions
    for inclusion in ('none', 'start', 'end', 'both'):
        print(f"\nSections with boundary_inclusion='{inclusion}':")
        sections = page.get_sections(
            start_elements=separators,
            boundary_inclusion=inclusion,
        )
        print(f"Found {len(sections)} sections")

        # Create visualization for all sections
        page.clear_highlights()
        for number, section in enumerate(sections, start=1):
            # use_color_cycling picks the color; no explicit color is passed.
            section.highlight(label=f"Section {number}", use_color_cycling=True)

            # Section info (first 30 characters only)
            content = section.extract_text()
            content_preview = content[:30] + "..." if len(content) > 30 else content
            print(f" Section {number}: {content_preview}")

        # Save the visualization, then leave the page clean for the next pass
        page.to_image(path=str(output_dir / f"sections_{inclusion}.png"), show_labels=True)
        page.clear_highlights()
|
141
|
+
|
142
|
+
|
143
|
+
def example_start_end_sections(pdf_path):
    """
    Example showing how to extract sections between start and end elements.

    Sections run from each text element with size >= 14 to a line with
    width >= 2 on the first page of ``pdf_path``. Images are written to a
    ``start_end_output`` directory next to this script.
    """
    print("\n=== SECTIONS BETWEEN START AND END ELEMENTS ===")
    document = PDF(pdf_path)
    page = document.pages[0]

    # Create an output directory
    target_dir = Path(__file__).parent / "start_end_output"
    target_dir.mkdir(exist_ok=True)

    # Find headings and lines
    headings = page.find_all('text[size>=14]')
    lines = page.find_all('line[width>=2]')
    print(f"Found {len(headings)} headings and {len(lines)} lines")

    # Create a visualization to show both
    page.clear_highlights()
    headings.highlight(label="Headings", color=(255, 100, 0, 100))
    lines.highlight(label="Lines", color=(0, 0, 255, 100))
    page.to_image(path=str(target_dir / "elements.png"), show_labels=True)

    # Get sections from heading to next line; include heading but not the line
    sections = page.get_sections(
        start_elements=headings,
        end_elements=lines,
        boundary_inclusion='start',
    )
    print(f"Found {len(sections)} sections from headings to lines")

    # Process each section
    for position, part in enumerate(sections):
        label_number = position + 1

        # Get the heading text
        heading_text = "No heading"
        if getattr(part, 'start_element', None):
            heading_text = part.start_element.extract_text()

        # Get section content (first 50 characters for display)
        body = part.extract_text()
        if len(body) > 50:
            content_preview = body[:50] + "..."
        else:
            content_preview = body

        print(f"Section {label_number}: '{heading_text}'")
        print(f" Content: {content_preview}")

        # Create visualization
        page.clear_highlights()
        part.highlight(label=f"Section {label_number}")
        if getattr(part, 'start_element', None):
            part.start_element.highlight(label="Heading", color=(255, 0, 0, 100))
        if getattr(part, 'end_element', None):
            part.end_element.highlight(label="Line", color=(0, 0, 255, 100))

        page.to_image(path=str(target_dir / f"section_{label_number}.png"), show_labels=True)
|
200
|
+
|
201
|
+
|
202
|
+
def _find_default_pdf():
    """
    Return the path of a default PDF as a string, or None if there is none.

    Looks in this script's directory first, then in the sibling ``pdfs``
    directory. Candidates are sorted so the choice does not depend on the
    filesystem's listing order.
    """
    example_dir = Path(__file__).parent
    for directory in (example_dir, example_dir.parent / "pdfs"):
        if not directory.exists():
            continue
        candidates = sorted(directory.glob("*.pdf"))
        if candidates:
            return str(candidates[0])
    return None


def main():
    """
    Main entry point.

    Uses the first command-line argument as the PDF path when given; otherwise
    falls back to a PDF found near this script. Exits with status 1 when no
    PDF is available, then runs the three section-extraction examples.
    """
    # Get the PDF path from command line or use a default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = _find_default_pdf()
        if pdf_path is None:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)

    print(f"Using PDF: {pdf_path}")

    # Run the examples
    example_sections_between_headings(pdf_path)
    example_sections_with_separators(pdf_path)
    example_start_end_sections(pdf_path)
|
229
|
+
|
230
|
+
|
231
|
+
if __name__ == "__main__":
|
232
|
+
main()
|
@@ -0,0 +1,97 @@
|
|
1
|
+
"""
|
2
|
+
Simple demonstration of document QA functionality in Natural PDF.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
import argparse
|
8
|
+
|
9
|
+
# Add parent directory to path to run without installing
|
10
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11
|
+
|
12
|
+
from natural_pdf import PDF, configure_logging
|
13
|
+
import logging
|
14
|
+
|
15
|
+
def main():
    """
    Ask one question of a PDF at document, page and region level.

    Parses ``pdf_path``, ``question`` and ``--debug`` from the command line,
    prints the answers, and writes visualizations under ``output/``.
    """
    # Set up logging
    configure_logging(level=logging.INFO)

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Simple Document QA Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    parser.add_argument("question", nargs="?", default="How many votes for Harris and Walz?",
                        help="Question to ask about the document")
    parser.add_argument("--debug", action="store_true",
                        help="Save debug information for troubleshooting")
    args = parser.parse_args()

    # Open the PDF
    pdf = PDF(args.pdf_path)
    print(f"Loaded PDF: {args.pdf_path} ({len(pdf.pages)} pages)")

    # Images are written to output/ on more than one code path (the layout
    # image is saved even when no answer is found), so create it once here.
    os.makedirs("output", exist_ok=True)

    # Get the first page
    page = pdf.pages[0]

    # Ask a question to the document
    print(f"\nAsking document: {args.question}")
    doc_result = pdf.ask(args.question, debug=args.debug)

    if doc_result.get("found", False):
        print(f"Document answer: {doc_result['answer']}")
        print(f"Confidence: {doc_result['confidence']:.2f}")
        print(f"Page: {doc_result.get('page_num', 0)}")
    else:
        print(f"No answer found in document: {doc_result.get('message', '')}")

    # Ask the same question to the page
    print(f"\nAsking page 0: {args.question}")
    page_result = page.ask(args.question, debug=args.debug)

    if page_result.get("found", False):
        print(f"Page answer: {page_result['answer']}")
        print(f"Confidence: {page_result['confidence']:.2f}")

        # Highlight the answer elements if available
        # NOTE(review): the flattened source does not show whether the save
        # below sat inside this `if`; it is kept inside so the "highlighted
        # answer" image is only written when something was highlighted.
        if page_result.get("source_elements"):
            for element in page_result["source_elements"]:
                element.highlight(color=(1, 0.5, 0, 0.5))  # Orange highlight

            # Save the highlighted image
            page.save_image("output/simple_qa_answer.png")
            print("Saved highlighted answer to output/simple_qa_answer.png")
    else:
        print(f"No answer found on page: {page_result.get('message', '')}")

    # Optional: Analyze layout and ask questions to specific regions
    print("\nDetecting document layout...")
    page.analyze_layout(confidence=0.3)
    regions = page.find_all('region[type=title], region[type=plain-text], region[type=table]')
    print(f"Found {len(regions)} relevant regions")

    # Save layout visualization
    page.highlight_layout()
    page.save_image("output/simple_qa_regions.png")
    print("Saved layout visualization to output/simple_qa_regions.png")

    # Ask questions to each region, keeping the most confident answer
    best_region_result = None
    best_confidence = 0

    for region in regions:
        region_result = region.ask(args.question, debug=args.debug)

        if region_result.get("found", False) and region_result.get("confidence", 0) > best_confidence:
            best_region_result = region_result
            best_confidence = region_result["confidence"]

    if best_region_result:
        region_type = best_region_result["region"].region_type
        print(f"\nBest region answer ({region_type}): {best_region_result['answer']}")
        print(f"Confidence: {best_region_result['confidence']:.2f}")
    else:
        print("\nNo answer found in any region")
|
95
|
+
|
96
|
+
if __name__ == "__main__":
|
97
|
+
main()
|
@@ -0,0 +1,108 @@
|
|
1
|
+
"""
|
2
|
+
Spatial navigation example.
|
3
|
+
|
4
|
+
This example demonstrates how to navigate between elements using
|
5
|
+
spatial navigation methods: next(), prev(), and nearest().
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
|
10
|
+
# Add the parent directory to the Python path
|
11
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
# Output directory
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Get PDF path from command line or use default
pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
print(f"Using PDF: {pdf_path}")

# Load the PDF
pdf = PDF(pdf_path)
page = pdf.pages[0]

print("\n=== Spatial Navigation Examples ===")

# First, find a heading or title to start with
title = page.find('text[size>=12]')
if title:
    print(f"\nStarting with: '{title.text}'")

    # 1. Find the next element in reading order
    print("\n--- Next Element ---")
    next_element = title.next()
    if next_element:
        print(f"Next element: '{next_element.text if hasattr(next_element, 'text') else next_element.type}'")

    # 2. Find the next element matching a selector
    print("\n--- Next Matching Element ---")
    next_bold = title.next('text:bold', limit=20)
    if next_bold:
        print(f"Next bold text: '{next_bold.text}'")

    # 3. Find the previous element in reading order
    print("\n--- Previous Element ---")
    prev_element = title.prev()
    if prev_element:
        print(f"Previous element: '{prev_element.text if hasattr(prev_element, 'text') else prev_element.type}'")

    # 4. Find the previous element matching a selector
    print("\n--- Previous Matching Element ---")
    # Find an element further down first (query the text elements only once)
    all_text = page.find_all('text')
    middle_element = all_text[len(all_text) // 2]
    if middle_element:
        print(f"Middle element: '{middle_element.text}'")
        prev_large = middle_element.prev('text[size>=12]')
        if prev_large:
            print(f"Previous large text: '{prev_large.text}'")

    # 5. Find the nearest element matching a selector
    print("\n--- Nearest Element ---")
    nearest_rect = title.nearest('rect')
    if nearest_rect:
        print(f"Nearest rectangle: {nearest_rect.bbox}")

    # 6. Find the nearest element with max distance
    print("\n--- Nearest Element with Max Distance ---")
    nearest_small = title.nearest('text[size<10]', max_distance=100)
    if nearest_small:
        print(f"Nearest small text within 100 points: '{nearest_small.text}'")
    else:
        print("No small text within 100 points")

    # Visualize the navigation
    print("\n--- Visualizing Navigation ---")
    page.clear_highlights()

    # Highlight the starting element
    title.highlight(label="Starting Element")

    # Find and highlight the next few elements
    current = title
    for step in range(1, 6):
        next_elem = current.next()
        if not next_elem:
            break
        next_elem.highlight(label=f"Next {step}")
        current = next_elem

    # Find and highlight the nearest elements. The page may contain no
    # rectangle or line, so each lookup is checked before highlighting,
    # matching the guard used in step 5.
    for selector, label in (('rect', "Nearest Rectangle"), ('line', "Nearest Line")):
        nearest = title.nearest(selector)
        if nearest:
            nearest.highlight(label=label)

    # Save the visualization
    output_path = os.path.join(output_dir, "spatial_navigation.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Saved visualization to {output_path}")

else:
    print("Could not find a title to start with.")
|
@@ -0,0 +1,135 @@
|
|
1
|
+
"""
|
2
|
+
Table extraction example using both TATR and pdfplumber methods.
|
3
|
+
|
4
|
+
This example demonstrates how to extract tables from PDF documents
|
5
|
+
using both the Table Transformer (TATR) structure detection and
|
6
|
+
pdfplumber's table extraction methods.
|
7
|
+
|
8
|
+
Note: This example requires additional dependencies:
|
9
|
+
- torch
|
10
|
+
- torchvision
|
11
|
+
- transformers
|
12
|
+
|
13
|
+
These will be automatically installed when you install natural-pdf.
|
14
|
+
"""
|
15
|
+
import os
|
16
|
+
from natural_pdf import PDF
|
17
|
+
import pprint
|
18
|
+
|
19
|
+
# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Setup paths
pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print(f"Extracting tables from: {pdf_path}")


def _table_shape(table_data):
    """Return (row count, column count of the first row); (0, 0) for empty or None data."""
    if not table_data:
        return 0, 0
    return len(table_data), len(table_data[0])


# Load the PDF
pdf = PDF(pdf_path)
page = pdf.pages[0]  # This PDF has a single page with a table

# First, let's try the traditional pdfplumber method
print("\n== TRADITIONAL TABLE EXTRACTION ==")
table_plumber = page.extract_table()  # Uses pdfplumber's table extraction
print("PDFPlumber extracted table:")
pprint.pprint(table_plumber)

# Now, let's detect and extract using TATR
print("\n== TABLE TRANSFORMER (TATR) EXTRACTION ==")

# Run table structure detection
print("Running Table Transformer detection...")
tatr_regions = page.analyze_layout(
    model="tatr",
    confidence=0.4,  # Table detection confidence threshold
)

# Find the detected table
tables = page.find_all('region[type=table][model=tatr]')

if tables:
    # Imported once here because both cell-building steps below need it.
    from natural_pdf.elements.region import Region

    print(f"Found {len(tables)} tables")

    # Get the first table
    table = tables[0]

    # Now extract the table using TATR structure (auto-detected)
    tatr_table_data = table.extract_table()  # Automatically uses TATR because it's a TATR region
    print("\nExtracted table data (TATR auto-detection):")
    pprint.pprint(tatr_table_data)

    # You can also explicitly specify which method to use
    plumber_table_data = table.extract_table(method='plumber')
    print("\nExtracted table data (explicit pdfplumber method):")
    pprint.pprint(plumber_table_data)

    # Compare the results; _table_shape guards the row count the same way the
    # column count was already guarded, so empty results print 0 instead of raising.
    print("\n== EXTRACTION METHOD COMPARISON ==")
    tatr_rows, tatr_cols = _table_shape(tatr_table_data)
    plumber_rows, plumber_cols = _table_shape(plumber_table_data)
    print(f"TATR rows: {tatr_rows}, cols in first row: {tatr_cols}")
    print(f"Plumber rows: {plumber_rows}, cols in first row: {plumber_cols}")

    # Visualize the table structure
    page.clear_highlights()

    # First highlight the table
    table.highlight(label="Table", color=(1, 0, 0, 0.3))

    # Then highlight the structure elements
    rows = page.find_all('region[type=table-row][model=tatr]')
    columns = page.find_all('region[type=table-column][model=tatr]')
    headers = page.find_all('region[type=table-column-header][model=tatr]')

    for row in rows:
        row.highlight(label="Row", color=(0, 1, 0, 0.3))
    for column in columns:
        column.highlight(label="Column", color=(0, 0, 1, 0.3))
    for header in headers:
        header.highlight(label="Header", color=(0, 1, 1, 0.3))

    # Save the highlighted table structure
    output_path = os.path.join(output_dir, "table_extraction.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"\nSaved table structure visualization to {output_path}")

    # Demonstrate working with individual cells
    if rows and columns:
        print("\n== EXTRACTING INDIVIDUAL CELLS ==")
        # Create a cell at the intersection of first row and first column
        row = rows[0]
        col = columns[0]

        cell_bbox = (col.x0, row.top, col.x1, row.bottom)
        cell = Region(page, cell_bbox)

        cell_text = cell.extract_text().strip()
        print(f"Text in first cell: '{cell_text}'")

    # When working with tables with headers, you might want to create a dictionary
    if headers and rows and columns:
        print("\n== CREATING A DICTIONARY FROM TABLE ==")
        header_texts = [header.extract_text().strip() for header in headers]

        table_dict = []
        for row in rows:
            row_dict = {}
            # zip stops at the shorter sequence, so columns without a header are skipped.
            # NOTE(review): pairs headers with columns by position - confirm both share an order.
            for header_text, col in zip(header_texts, columns):
                # Create cell region
                cell_bbox = (col.x0, row.top, col.x1, row.bottom)
                cell = Region(page, cell_bbox)

                # Extract text and add to dictionary
                row_dict[header_text] = cell.extract_text().strip()

            if row_dict:
                table_dict.append(row_dict)

        print("Table as dictionary:")
        pprint.pprint(table_dict)
else:
    print("No tables detected with TATR.")
|