natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,52 @@
"""
Example demonstrating nested section extraction with the get_sections method.

The document is first split into one section per day (a new section starts
at every line with width >= 2), then every day is split into one section per
book (a new section starts at every "(Removed:" text).  The labelled fields of
each book are highlighted and collected into ``rows``.
"""

import os
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF


def value_below(section, selector, height, right):
    """Return the region directly below the label matching *selector*.

    Returns None when the section has no matching label, so that the caller
    can skip the section instead of failing on a missing element.
    """
    label = section.find(selector)
    if not label:
        return None
    return label.below(height=height, width="element").expand(right=right)


pdf = PDF("./pdfs/Atlanta_Public_Schools_GA_sample.pdf")

day_sections = pdf.pages.get_sections(start_elements='line[width>=2]')

rows = []
for day in day_sections:
    # The first text element of a day section is presumably its date -- TODO confirm
    date_element = day.find('text')
    date = date_element.text if date_element else None
    book_sections = day.get_sections(start_elements='text:contains("(Removed:")')

    for book in book_sections:
        if book.height < 30:
            print("Not a book, skipping")
            continue

        # Bold big text is the title
        title = book.find_all('text[font_variant="AAAAAB"][size>=10]')
        price = value_below(book, 'text:contains("Price")', height=15, right=30)
        acquired = value_below(book, 'text:contains("Acquired")', height=15, right=30)
        removed_by = value_below(book, 'text[size<10]:contains("Removed")', height=17, right=60)

        # A section lacking any of the labels cannot be parsed as a book record
        if price is None or acquired is None or removed_by is None:
            print("Missing a Price/Acquired/Removed label, skipping")
            continue

        # Extract the title once; it is used both as a label and as data
        title_text = title.extract_text()

        # Highlight them
        book.highlight(label=title_text)
        title.highlight(label='title')
        price.highlight(label='price')
        acquired.highlight(label='acquired')
        removed_by.highlight(label='removed')

        # Save them
        data = {
            'Date': date,
            'Title': title_text,
            'Price': price.extract_text(),
            'Acquired': acquired.extract_text(),
            'Removed By': removed_by.extract_text(),
        }
        rows.append(data)

# Render the first three pages with every highlight added above
for page_index in range(3):
    pdf.pages[page_index].save(f"highlight-{page_index + 1}.png", show_labels=True)
examples/scouring.py ADDED
@@ -0,0 +1,36 @@
"""
Example demonstrating how to locate a table area by chaining spatial queries.

A bold header is searched for in a range of pages, then the area below the
bold "Total" text that follows it (up to the next "MICS" text) is highlighted.
"""

import os
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF

pdf = PDF("./pdfs/Nigeria 2021_MICS_SFR_English.pdf")

# Exclude "Page | 123" footer from all queries
pdf.add_exclusion(lambda page: page.find_all('text').lowest().below(include_element=True))

# There's a bold header for 'EQ.4.1W' on a few of these pages
header = pdf.pages[460:470].find('text:contains("EQ.4.1W"):bold')
if not header:
    # Without the header there is nothing to anchor the remaining queries on
    sys.exit('Could not find a bold "EQ.4.1W" header in pages[460:470]')

header.highlight(label='table header')

total = header.below().find('text:contains("Total"):bold')
if not total:
    sys.exit('Could not find a bold "Total" text below the header')

# The table runs from the "Total" text (included) down to the next "MICS" text (excluded)
table_area = total.below(
    until='text:contains("MICS")',
    include_element=True,
    include_until=False
)
table_area.highlight(label='table area')

header.page.to_image(path="output.png", show_labels=True)
@@ -0,0 +1,232 @@
1
+ """
2
+ Example demonstrating section extraction with the get_sections method.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def example_sections_between_headings(pdf_path):
    """
    Example showing how to extract sections between headings.

    Sections are extracted from the first page twice (with ``y_threshold=0``
    and with ``y_threshold=5``) and visualizations of both results, plus one
    image per section, are written to a ``section_output`` directory next to
    this script.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    print("\n=== SECTIONS BETWEEN HEADINGS ===")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Exclude the "November" text and what lies above it, but only when the
    # page actually has such a text: this example may be run on any PDF.
    banner = page.find('text:contains("November")')
    if banner:
        page.add_exclusion(banner.above(include_element=True))

    # Create an output directory
    output_dir = Path(__file__).parent / "section_output"
    output_dir.mkdir(exist_ok=True)

    # Find all the headings on the page
    headings = page.find_all('text[size>=12]')
    print(f"Found {len(headings)} heading elements")

    # Create a highlighted visualization to see what we found
    page.clear_highlights()
    headings.highlight(label="Headings", color=(255, 100, 0, 100))
    page.to_image(path=str(output_dir / "headings.png"), show_labels=True)

    # First try without line grouping
    print("Extracting sections WITHOUT line grouping:")
    # Set y_threshold to 0 to disable line grouping
    sections_no_grouping = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start',
        y_threshold=0  # Disable line grouping
    )
    print(f"Found {len(sections_no_grouping)} sections without line grouping")

    # Now with line grouping
    print("\nExtracting sections WITH line grouping:")
    sections = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start',  # Include heading with its section
        y_threshold=5  # Group elements within 5 points vertically
    )
    print(f"Found {len(sections)} sections with line grouping")

    # Create visualizations showing the difference
    # Highlight sections without grouping
    page.clear_highlights()
    for i, section in enumerate(sections_no_grouping):
        section.highlight(label=f"Section {i+1}", use_color_cycling=True)
    page.to_image(path=str(output_dir / "sections_no_grouping.png"), show_labels=True)

    # Highlight sections with grouping
    page.clear_highlights()
    for i, section in enumerate(sections):
        section.highlight(label=f"Section {i+1}", use_color_cycling=True)
    page.to_image(path=str(output_dir / "sections_with_grouping.png"), show_labels=True)

    # Process each section from the grouped version
    for i, section in enumerate(sections):
        # A section may lack either boundary attribute, or carry None for it
        start_element = getattr(section, 'start_element', None)
        end_element = getattr(section, 'end_element', None)

        # Get the heading text
        heading_text = start_element.extract_text() if start_element else "No heading"

        # Get section content (limited to first 50 chars for display)
        content = section.extract_text()
        content_preview = content[:50] + "..." if len(content) > 50 else content

        print(f"Section {i+1}: '{heading_text}'")
        print(f" Content: {content_preview}")

        # Create visualization
        page.clear_highlights()
        section.highlight(label=f"Section {i+1}")
        if start_element:
            start_element.highlight(label="Heading", color=(255, 0, 0, 100))
        if end_element:
            end_element.highlight(label="End", color=(0, 0, 255, 100))

        page.to_image(path=str(output_dir / f"section_{i+1}.png"), show_labels=True)
90
+
91
+
92
def example_sections_with_separators(pdf_path):
    """
    Example showing how to extract sections with separators.

    Lines with width >= 2 on the first page are used as section starts, and
    the extraction is repeated once for every ``boundary_inclusion`` option.
    One image per option is written to a ``separator_output`` directory next
    to this script.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    print("\n=== SECTIONS WITH SEPARATORS ===")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Create an output directory
    output_dir = Path(__file__).parent / "separator_output"
    output_dir.mkdir(exist_ok=True)

    # Find all horizontal lines that could be separators
    separators = page.find_all('line[width>=2]')
    print(f"Found {len(separators)} separator lines")

    # Create a highlighted visualization to see what we found
    page.clear_highlights()
    separators.highlight(label="Separators", color=(0, 0, 255, 100))
    page.to_image(path=str(output_dir / "separators.png"), show_labels=True)

    # Try different boundary inclusions
    inclusion_options = ['none', 'start', 'end', 'both']

    for inclusion in inclusion_options:
        print(f"\nSections with boundary_inclusion='{inclusion}':")
        sections = page.get_sections(
            start_elements=separators,
            boundary_inclusion=inclusion
        )

        print(f"Found {len(sections)} sections")

        # Create visualization for all sections
        page.clear_highlights()

        for i, section in enumerate(sections):
            # No explicit color is passed; use_color_cycling picks one per section
            section.highlight(label=f"Section {i+1}", use_color_cycling=True)

            # Section info
            content = section.extract_text()
            content_preview = content[:30] + "..." if len(content) > 30 else content
            print(f" Section {i+1}: {content_preview}")

        # Save the visualization
        page.to_image(path=str(output_dir / f"sections_{inclusion}.png"), show_labels=True)
        page.clear_highlights()
141
+
142
+
143
def example_start_end_sections(pdf_path):
    """
    Example showing how to extract sections between start and end elements.

    Text with size >= 14 opens a section and lines with width >= 2 close it.
    An overview image and one image per section are written to a
    ``start_end_output`` directory next to this script.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    print("\n=== SECTIONS BETWEEN START AND END ELEMENTS ===")
    first_page = PDF(pdf_path).pages[0]

    # Make sure the target directory for the images exists
    target_dir = Path(__file__).parent / "start_end_output"
    target_dir.mkdir(exist_ok=True)

    # Collect the two kinds of boundary elements
    headings = first_page.find_all('text[size>=14]')
    lines = first_page.find_all('line[width>=2]')

    print(f"Found {len(headings)} headings and {len(lines)} lines")

    # Overview image showing both kinds of boundaries
    first_page.clear_highlights()
    headings.highlight(label="Headings", color=(255, 100, 0, 100))
    lines.highlight(label="Lines", color=(0, 0, 255, 100))
    first_page.to_image(path=str(target_dir / "elements.png"), show_labels=True)

    # Each section runs from a heading (included) to the next line (excluded)
    sections = first_page.get_sections(
        start_elements=headings,
        end_elements=lines,
        boundary_inclusion='start',
    )

    print(f"Found {len(sections)} sections from headings to lines")

    # Report and render the sections one by one
    for number, section in enumerate(sections, start=1):
        opener = getattr(section, 'start_element', None)
        closer = getattr(section, 'end_element', None)

        # Heading text, when the section has a start element
        if opener:
            heading_text = opener.extract_text()
        else:
            heading_text = "No heading"

        # Section content, shortened to 50 characters for display
        content = section.extract_text()
        if len(content) > 50:
            content_preview = content[:50] + "..."
        else:
            content_preview = content

        print(f"Section {number}: '{heading_text}'")
        print(f" Content: {content_preview}")

        # Image of this section together with its boundary elements
        first_page.clear_highlights()
        section.highlight(label=f"Section {number}")
        if opener:
            opener.highlight(label="Heading", color=(255, 0, 0, 100))
        if closer:
            closer.highlight(label="Line", color=(0, 0, 255, 100))

        first_page.to_image(path=str(target_dir / f"section_{number}.png"), show_labels=True)
200
+
201
+
202
def main():
    """Run every section example on a PDF.

    The PDF is the first command-line argument when one is given.  Otherwise
    the first ``*.pdf`` found next to this script is used, falling back to
    the sibling ``pdfs`` directory; the process exits with status 1 when no
    PDF can be found.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # Search beside the script first, then in the sibling "pdfs" directory
        here = Path(__file__).parent
        candidates = list(here.glob("*.pdf"))
        fallback_dir = here.parent / "pdfs"
        if not candidates and fallback_dir.exists():
            candidates = list(fallback_dir.glob("*.pdf"))

        if not candidates:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(candidates[0])

    print(f"Using PDF: {pdf_path}")

    # Run the examples in their original order
    for run_example in (
        example_sections_between_headings,
        example_sections_with_separators,
        example_start_end_sections,
    ):
        run_example(pdf_path)


if __name__ == "__main__":
    main()
@@ -0,0 +1,97 @@
1
+ """
2
+ Simple demonstration of document QA functionality in Natural PDF.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import argparse
8
+
9
+ # Add parent directory to path to run without installing
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+
12
+ from natural_pdf import PDF, configure_logging
13
+ import logging
14
+
15
def main():
    """Ask one question of a PDF at document, page and region level.

    The PDF path and the question are read from the command line (both have
    defaults).  Answers are printed, and two images are written to the
    ``output`` directory: the highlighted page answer (when the page result
    lists source elements) and the detected layout regions.
    """
    # Set up logging
    configure_logging(level=logging.INFO)

    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Simple Document QA Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    parser.add_argument("question", nargs="?", default="How many votes for Harris and Walz?",
                        help="Question to ask about the document")
    parser.add_argument("--debug", action="store_true",
                        help="Save debug information for troubleshooting")
    args = parser.parse_args()

    # Both images saved below go here, so create the directory up front;
    # the layout image is written even when no page answer was found.
    os.makedirs("output", exist_ok=True)

    # Open the PDF
    pdf = PDF(args.pdf_path)
    print(f"Loaded PDF: {args.pdf_path} ({len(pdf.pages)} pages)")

    # Get the first page
    page = pdf.pages[0]

    # Ask a question to the document
    print(f"\nAsking document: {args.question}")
    doc_result = pdf.ask(args.question, debug=args.debug)

    if doc_result.get("found", False):
        print(f"Document answer: {doc_result['answer']}")
        print(f"Confidence: {doc_result['confidence']:.2f}")
        print(f"Page: {doc_result.get('page_num', 0)}")
    else:
        print(f"No answer found in document: {doc_result.get('message', '')}")

    # Ask the same question to the page
    print(f"\nAsking page 0: {args.question}")
    page_result = page.ask(args.question, debug=args.debug)

    if page_result.get("found", False):
        print(f"Page answer: {page_result['answer']}")
        print(f"Confidence: {page_result['confidence']:.2f}")

        # Highlight the answer elements if available
        if page_result.get("source_elements"):
            for element in page_result["source_elements"]:
                element.highlight(color=(1, 0.5, 0, 0.5))  # Orange highlight

            # Save the highlighted image
            page.save_image("output/simple_qa_answer.png")
            print("Saved highlighted answer to output/simple_qa_answer.png")
    else:
        print(f"No answer found on page: {page_result.get('message', '')}")

    # Optional: Analyze layout and ask questions to specific regions
    print("\nDetecting document layout...")
    page.analyze_layout(confidence=0.3)
    regions = page.find_all('region[type=title], region[type=plain-text], region[type=table]')
    print(f"Found {len(regions)} relevant regions")

    # Save layout visualization
    page.highlight_layout()
    page.save_image("output/simple_qa_regions.png")
    print("Saved layout visualization to output/simple_qa_regions.png")

    # Ask questions to each region, keeping the most confident answer
    best_region_result = None
    best_confidence = 0

    for region in regions:
        region_result = region.ask(args.question, debug=args.debug)

        if region_result.get("found", False) and region_result.get("confidence", 0) > best_confidence:
            best_region_result = region_result
            best_confidence = region_result["confidence"]

    if best_region_result:
        region_type = best_region_result["region"].region_type
        print(f"\nBest region answer ({region_type}): {best_region_result['answer']}")
        print(f"Confidence: {best_region_result['confidence']:.2f}")
    else:
        print("\nNo answer found in any region")


if __name__ == "__main__":
    main()
@@ -0,0 +1,108 @@
"""
Spatial navigation example.

This example demonstrates how to navigate between elements using
spatial navigation methods: next(), prev(), and nearest().
"""
import os
import sys

# Add the parent directory to the Python path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from natural_pdf import PDF

# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
# Output directory
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Get PDF path from command line or use default
pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
print(f"Using PDF: {pdf_path}")

# Load the PDF
pdf = PDF(pdf_path)
page = pdf.pages[0]

print("\n=== Spatial Navigation Examples ===")

# First, find a heading or title to start with
title = page.find('text[size>=12]')
if title:
    print(f"\nStarting with: '{title.text}'")

    # 1. Find the next element in reading order
    print("\n--- Next Element ---")
    next_element = title.next()
    if next_element:
        print(f"Next element: '{next_element.text if hasattr(next_element, 'text') else next_element.type}'")

    # 2. Find the next element matching a selector
    print("\n--- Next Matching Element ---")
    next_bold = title.next('text:bold', limit=20)
    if next_bold:
        print(f"Next bold text: '{next_bold.text}'")

    # 3. Find the previous element in reading order
    print("\n--- Previous Element ---")
    prev_element = title.prev()
    if prev_element:
        print(f"Previous element: '{prev_element.text if hasattr(prev_element, 'text') else prev_element.type}'")

    # 4. Find the previous element matching a selector
    print("\n--- Previous Matching Element ---")
    # Find an element further down first; query once and avoid indexing an
    # empty result
    all_text = page.find_all('text')
    middle_element = all_text[len(all_text) // 2] if len(all_text) > 0 else None
    if middle_element:
        print(f"Middle element: '{middle_element.text}'")
        prev_large = middle_element.prev('text[size>=12]')
        if prev_large:
            print(f"Previous large text: '{prev_large.text}'")

    # 5. Find the nearest element matching a selector
    print("\n--- Nearest Element ---")
    nearest_rect = title.nearest('rect')
    if nearest_rect:
        print(f"Nearest rectangle: {nearest_rect.bbox}")

    # 6. Find the nearest element with max distance
    print("\n--- Nearest Element with Max Distance ---")
    nearest_small = title.nearest('text[size<10]', max_distance=100)
    if nearest_small:
        print(f"Nearest small text within 100 points: '{nearest_small.text}'")
    else:
        print("No small text within 100 points")

    # Visualize the navigation
    print("\n--- Visualizing Navigation ---")
    page.clear_highlights()

    # Highlight the starting element
    title.highlight(label="Starting Element")

    # Find and highlight the next few elements
    current = title
    for i in range(5):
        next_elem = current.next()
        if next_elem:
            next_elem.highlight(label=f"Next {i+1}")
            current = next_elem
        else:
            break

    # Find and highlight the nearest elements; a page may have no rectangle
    # or no line at all, so only highlight what was found
    if nearest_rect:
        nearest_rect.highlight(label="Nearest Rectangle")
    nearest_line = title.nearest('line')
    if nearest_line:
        nearest_line.highlight(label="Nearest Line")

    # Save the visualization
    output_path = os.path.join(output_dir, "spatial_navigation.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Saved visualization to {output_path}")

else:
    print("Could not find a title to start with.")
@@ -0,0 +1,135 @@
"""
Table extraction example using both TATR and pdfplumber methods.

This example demonstrates how to extract tables from PDF documents
using both the Table Transformer (TATR) structure detection and
pdfplumber's table extraction methods.

Note: This example requires additional dependencies:
- torch
- torchvision
- transformers

These will be automatically installed when you install natural-pdf.
"""
import os
import pprint

from natural_pdf import PDF
from natural_pdf.elements.region import Region


def table_shape(table_data):
    """Return (number of rows, number of columns in the first row).

    Returns (0, 0) for an empty or missing table instead of raising.
    """
    if not table_data:
        return 0, 0
    return len(table_data), len(table_data[0])


def cell_region(page, row, col):
    """Return the region where *row* and *col* intersect on *page*."""
    return Region(page, (col.x0, row.top, col.x1, row.bottom))


# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Setup paths
pdf_path = os.path.join(root_dir, "pdfs", "01-practice.pdf")
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print(f"Extracting tables from: {pdf_path}")

# Load the PDF
pdf = PDF(pdf_path)
page = pdf.pages[0]  # This PDF has a single page with a table

# First, let's try the traditional pdfplumber method
print("\n== TRADITIONAL TABLE EXTRACTION ==")
table_plumber = page.extract_table()  # Uses pdfplumber's table extraction
print("PDFPlumber extracted table:")
pprint.pprint(table_plumber)

# Now, let's detect and extract using TATR
print("\n== TABLE TRANSFORMER (TATR) EXTRACTION ==")

# Run table structure detection; the detected regions are looked up with
# selectors below, so the return value is not needed
print("Running Table Transformer detection...")
page.analyze_layout(
    model="tatr",
    confidence=0.4  # Table detection confidence threshold
)

# Find the detected table
tables = page.find_all('region[type=table][model=tatr]')

if tables:
    print(f"Found {len(tables)} tables")

    # Get the first table
    table = tables[0]

    # Now extract the table using TATR structure (auto-detected)
    tatr_table_data = table.extract_table()  # Automatically uses TATR because it's a TATR region
    print("\nExtracted table data (TATR auto-detection):")
    pprint.pprint(tatr_table_data)

    # You can also explicitly specify which method to use
    plumber_table_data = table.extract_table(method='plumber')
    print("\nExtracted table data (explicit pdfplumber method):")
    pprint.pprint(plumber_table_data)

    # Compare the results
    print("\n== EXTRACTION METHOD COMPARISON ==")
    tatr_row_count, tatr_col_count = table_shape(tatr_table_data)
    plumber_row_count, plumber_col_count = table_shape(plumber_table_data)
    print(f"TATR rows: {tatr_row_count}, cols in first row: {tatr_col_count}")
    print(f"Plumber rows: {plumber_row_count}, cols in first row: {plumber_col_count}")

    # Visualize the table structure
    page.clear_highlights()

    # First highlight the table
    table.highlight(label="Table", color=(1, 0, 0, 0.3))

    # Then highlight the structure elements
    rows = page.find_all('region[type=table-row][model=tatr]')
    columns = page.find_all('region[type=table-column][model=tatr]')
    headers = page.find_all('region[type=table-column-header][model=tatr]')

    for row in rows:
        row.highlight(label="Row", color=(0, 1, 0, 0.3))
    for column in columns:
        column.highlight(label="Column", color=(0, 0, 1, 0.3))
    for header in headers:
        header.highlight(label="Header", color=(0, 1, 1, 0.3))

    # Save the highlighted table structure
    output_path = os.path.join(output_dir, "table_extraction.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"\nSaved table structure visualization to {output_path}")

    # Demonstrate working with individual cells
    if rows and columns:
        print("\n== EXTRACTING INDIVIDUAL CELLS ==")
        # Create a cell at the intersection of first row and first column
        cell = cell_region(page, rows[0], columns[0])

        cell_text = cell.extract_text().strip()
        print(f"Text in first cell: '{cell_text}'")

    # When working with tables with headers, you might want to create a dictionary
    if headers and rows and columns:
        print("\n== CREATING A DICTIONARY FROM TABLE ==")
        header_texts = [header.extract_text().strip() for header in headers]

        table_dict = []
        for row in rows:
            # zip stops at the shorter sequence, so columns without a header
            # text are left out
            row_dict = {
                header_text: cell_region(page, row, col).extract_text().strip()
                for header_text, col in zip(header_texts, columns)
            }

            if row_dict:
                table_dict.append(row_dict)

        print("Table as dictionary:")
        pprint.pprint(table_dict)
else:
    print("No tables detected with TATR.")