natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109):
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,190 @@
1
+ """
2
+ Example demonstrating the optimized exclusion handling for various region types.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+ import time
9
+
10
+ # Add parent directory to path for imports
11
+ sys.path.insert(0, str(Path(__file__).parent.parent))
12
+
13
+ from natural_pdf import PDF
14
+
15
+
16
def measure_time(func):
    """Decorator that prints how long each call to *func* takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        *func*, prints ``Time taken: <seconds> seconds`` (four decimals) and
        returns *func*'s result unchanged. Nothing is printed if *func*
        raises, because the exception propagates before the print.
    """
    # Imported locally so the file's top-level import block stays untouched.
    import functools

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed (or go negative) when the system clock is adjusted.
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(f"Time taken: {elapsed:.4f} seconds")
        return result

    return wrapper
25
+
26
+
27
def _compare_extraction(region):
    """Extract *region*'s text with and without exclusions, timing both calls.

    Args:
        region: Object exposing ``extract_text(apply_exclusions=...)``.

    Returns:
        Tuple ``(with_exclusions, without_exclusions)`` holding the two
        extracted texts.
    """
    print("Extracting text with apply_exclusions=True...")
    with_exclusions = measure_time(region.extract_text)(apply_exclusions=True)

    print("Extracting text with apply_exclusions=False (for comparison)...")
    without_exclusions = measure_time(region.extract_text)(apply_exclusions=False)

    print(
        f"Text length comparison: with exclusions={len(with_exclusions)}, "
        f"without={len(without_exclusions)}"
    )
    return with_exclusions, without_exclusions


def optimized_exclusion_example(pdf_path):
    """
    Demonstrates the optimized exclusion handling for different region types.

    Opens the first page of the PDF, registers three exclusion zones (header,
    footer and left side panel), builds three test regions that overlap those
    zones in different ways, saves a labelled image of the page to
    ``../output/exclusion_optimization_regions.png`` (relative to this file)
    and prints timing and length comparisons of ``extract_text`` with
    ``apply_exclusions`` switched on and off.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PDF(...)``.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]
        print(f"Using PDF: {pdf_path}")

        # Create an output directory next to the examples folder
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # Step 1: Set up exclusion zones
        # NOTE(review): create_region arguments look like (x0, top, x1, bottom) - confirm.
        print("\n=== Setting Up Exclusion Zones ===")
        exclusion_specs = [
            # Top 10% of page as header
            ("Header Exclusion", (0, 0, page.width, page.height * 0.1), (1, 0, 0, 0.3)),
            # Bottom 10% of page as footer
            ("Footer Exclusion", (0, page.height * 0.9, page.width, page.height), (0, 0, 1, 0.3)),
            # Left 20% as a side panel (complex exclusion)
            ("Side Panel Exclusion", (0, 0, page.width * 0.2, page.height), (0, 1, 0, 0.3)),
        ]
        for label, bounds, color in exclusion_specs:
            zone = page.create_region(*bounds)
            zone.highlight(label=label, color=color)
            page.add_exclusion(zone)

        print("Added 3 exclusion zones: header, footer, and side panel")

        # Step 2: Create test regions of different types
        print("\n=== Creating Test Regions ===")
        # Non-intersecting region (in center, away from all exclusions)
        non_intersecting = page.create_region(
            page.width * 0.3,
            page.height * 0.3,
            page.width * 0.8,
            page.height * 0.7,
        )
        non_intersecting.highlight(label="Non-Intersecting Region", color=(1, 1, 0, 0.3))

        # Full-page region: it covers the header and footer zones and, being
        # full width, the side panel as well.
        header_footer_region = page.create_region(0, 0, page.width, page.height)
        header_footer_region.highlight(label="Full Page Region", color=(1, 0, 1, 0.2))

        # Complex region (starts at the left edge, so it intersects the side panel)
        complex_region = page.create_region(
            0,
            page.height * 0.2,
            page.width * 0.5,
            page.height * 0.8,
        )
        complex_region.highlight(label="Complex Region", color=(0, 1, 1, 0.3))

        print("Created 3 test regions with different exclusion intersection patterns")

        # Save the visualization
        output_file = os.path.join(output_dir, "exclusion_optimization_regions.png")
        page.save_image(output_file, labels=True)
        print(f"Saved visualization to: {output_file}")

        # Step 3: Test extraction with and without exclusions applied
        print("\n=== Testing Text Extraction with Exclusions ===")

        # Test non-intersecting region
        print("\nNon-Intersecting Region:")
        print("This region should use the fast path (no exclusion checking)")
        excluded_text, full_text = _compare_extraction(non_intersecting)
        print(f"Identical results: {excluded_text == full_text}")

        # Test header/footer region
        print("\nFull Page Region (intersecting header/footer):")
        print("This region should use cropping optimization for header/footer exclusions")
        excluded_text, full_text = _compare_extraction(header_footer_region)
        print(f"Header/footer content excluded: {len(full_text) > len(excluded_text)}")

        # Test complex region
        print("\nComplex Region (intersecting side panel):")
        print("This region should use filtering with warning")
        _compare_extraction(complex_region)

        # Step 4: Summarize findings
        print("\n=== Summary ===")
        print("1. Non-intersecting region: Optimization should skip exclusion checks entirely")
        print("2. Header/footer region: Optimization should use direct cropping")
        print("3. Complex region: Falls back to filtering with warning")
        print("\nCheck the produced warning messages to confirm the behavior.")
168
+
169
+
170
def main():
    """Main entry point.

    Uses the first command-line argument as the PDF path. Without an
    argument it falls back to the alphabetically first ``*.pdf`` in the
    ``pdfs`` directory that sits beside this file's parent directory, and
    exits with status 1 when that directory holds no PDF.
    """
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Look for any PDF in the pdfs directory
        pdfs_dir = Path(__file__).parent.parent / "pdfs"
        # glob() yields entries in filesystem order; sorting makes the
        # default choice reproducible across machines and runs.
        pdf_files = sorted(pdfs_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(pdf_files[0])

    optimized_exclusion_example(pdf_path)


if __name__ == "__main__":
    main()
@@ -0,0 +1,128 @@
1
+ """
2
+ Test script for the new extract_text implementation that uses pdfplumber's native functionality.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from io import StringIO
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
9
+
10
+ from natural_pdf import PDF
11
+ import time
12
+
13
def _print_extraction(text, elapsed):
    """Print the length of *text*, the elapsed seconds and a preview of at most 200 characters."""
    print(f"Length: {len(text)} characters, Time: {elapsed:.4f} seconds")
    print(text[:200] + "..." if len(text) > 200 else text)


def main():
    """Exercise ``extract_text`` on several regions of a PDF's first page.

    The PDF path is the first command-line argument, defaulting to
    ``pdfs/01-practice.pdf``. The script builds full/top/bottom/triangle
    regions, registers the bottom region as an exclusion, and prints the
    length, timing and a preview for each extraction variant.
    """
    # Local imports keep the file's top-level import block untouched.
    from contextlib import redirect_stderr

    from natural_pdf.elements.region import Region

    # Use a sample PDF
    pdf_path = "pdfs/01-practice.pdf"
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]

    # Load the PDF
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    print(f"Loaded {pdf_path}, processing first page...")

    # Create different regions
    full_region = page.create_region(0, 0, page.width, page.height)
    top_region = page.create_region(0, 0, page.width, page.height / 3)
    bottom_region = page.create_region(0, page.height * 2/3, page.width, page.height)

    # Create a non-rectangular region (a triangle):
    # first create the region with a bbox, then set the polygon directly.
    triangle_region = page.create_region(0, 0, page.width, page.height/2)
    triangle_region._polygon = [(0, 0), (page.width, 0), (page.width/2, page.height/2)]

    # Add an exclusion region
    page.add_exclusion(bottom_region)

    # 1. Standard rectangular region without exclusions
    print("\nExtracting text from top region:")
    # First try with just crop to debug - use bbox directly.
    # NOTE(review): page._page is presumably the underlying pdfplumber page - confirm.
    crop_bbox = top_region.bbox
    print(f"Using bbox: {crop_bbox}")

    cropped = page._page.crop(crop_bbox)
    direct_text = cropped.extract_text(keep_blank_chars=True)
    print(f"Direct crop text length: {len(direct_text)}, Text: {direct_text[:100]}")

    # Check if there's a bug when passing the instance directly to extract_text
    print("Converting region to a dictionary and creating a new Region")
    region_dict = {
        'x0': top_region.x0,
        'top': top_region.top,
        'x1': top_region.x1,
        'bottom': top_region.bottom,
    }
    bbox = (region_dict['x0'], region_dict['top'], region_dict['x1'], region_dict['bottom'])

    test_region = Region(page, bbox)
    print(f"New region bbox: {test_region.bbox}")

    # Create a simple direct call to pdfplumber's crop
    print("Testing direct pdfplumber crop and extract:")
    crop_bbox = test_region.bbox
    cropped_page = page._page.crop(crop_bbox)
    print(f"Cropped page dimensions: {cropped_page.width} × {cropped_page.height}")
    print(f"Cropped page characters: {len(cropped_page.chars)}")
    if cropped_page.chars:
        print(f"First few chars: {cropped_page.chars[:3]}")
    direct_crop_text = cropped_page.extract_text(keep_blank_chars=True)
    print(f"Direct pdfplumber extraction: {len(direct_crop_text)} chars")
    print(direct_crop_text[:100])

    # Capture anything extract_text writes to stderr. The context manager
    # restores sys.stderr even if extract_text raises. The timer wraps only
    # this call so the reported time is the extraction's own, not the debug
    # crops above.
    captured_stderr = StringIO()
    start = time.perf_counter()
    with redirect_stderr(captured_stderr):
        text = test_region.extract_text(keep_blank_chars=True)
    elapsed = time.perf_counter() - start
    stderr_output = captured_stderr.getvalue()

    print(f"Stderr output from extract_text call:\n{stderr_output}")
    _print_extraction(text, elapsed)

    # 2. Full page with exclusions
    print("\nExtracting text from full page with exclusions:")
    start = time.perf_counter()
    text = full_region.extract_text(apply_exclusions=True)
    _print_extraction(text, time.perf_counter() - start)

    # 3. Polygon region (triangle)
    print("\nExtracting text from triangle region:")
    start = time.perf_counter()
    text = triangle_region.extract_text()
    _print_extraction(text, time.perf_counter() - start)

    # 4. With OCR option (to test that pathway)
    print("\nExtracting text with OCR option:")
    start = time.perf_counter()
    text = top_region.extract_text(ocr={"enabled": True})
    _print_extraction(text, time.perf_counter() - start)

    # For comparison, test the regular page.extract_text method
    print("\nExtraction with page.extract_text for comparison:")
    start = time.perf_counter()
    text = page.extract_text(preserve_whitespace=True, apply_exclusions=True)
    _print_extraction(text, time.perf_counter() - start)


if __name__ == "__main__":
    main()
@@ -0,0 +1,101 @@
1
+ """
2
+ Example demonstrating font-aware text extraction in Natural PDF.
3
+
4
+ This example shows how to use the font_attrs parameter to group text by font properties,
5
+ which helps preserve the formatting and style of text during extraction.
6
+ """
7
+ import os
8
+ import sys
9
+
10
+ # Add the parent directory to the path so we can import natural_pdf module
11
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ from natural_pdf import PDF
14
+
15
def main():
    """Compare text grouping under different ``font_attrs`` settings.

    The PDF path is the first command-line argument; without one the script
    tries ``examples/sample.pdf`` and returns early if that file is missing.
    The same PDF is opened three times (default grouping, ``font_attrs=[]``
    and fontname/size/colour grouping) and sample elements, word counts and
    text previews are printed for each.
    """
    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Use a default PDF path - you'll need to replace this with an actual PDF path
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")

    # Example 1: Default behavior - group by fontname and size
    print("\n1. Default behavior (group by fontname and size):")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Find some text element to inspect
    text_element = page.find("text")
    if text_element:
        print(f"Example text element: {text_element}")
        print(f"Font info: {text_element.font_info()}")

    # Example 2: Disable font-aware grouping
    print("\n2. Disable font-aware grouping (spatial only):")
    pdf_no_font = PDF(pdf_path, font_attrs=[])
    page_no_font = pdf_no_font.pages[0]

    # Find the same text with different grouping
    text_element = page_no_font.find("text")
    if text_element:
        print(f"Example text element: {text_element}")

    # Example 3: Group by additional attributes
    print("\n3. Group by font and color:")
    pdf_with_color = PDF(pdf_path, font_attrs=['fontname', 'size', 'non_stroking_color'])
    page_with_color = pdf_with_color.pages[0]

    # Find the same text with color grouping
    text_element = page_with_color.find("text")
    if text_element:
        print(f"Example text element: {text_element}")

    # Example 4: Compare text extraction results
    print("\n4. Text extraction comparison:")

    # Queried once and reused below for the word listing and its total.
    text_elements = page.find_all("text")
    if text_elements:
        # Extract from a full-page region under each grouping setting
        default_text = page.create_region(0, 0, page.width, page.height).extract_text()
        spatial_text = page_no_font.create_region(
            0, 0, page_no_font.width, page_no_font.height
        ).extract_text()
        color_text = page_with_color.create_region(
            0, 0, page_with_color.width, page_with_color.height
        ).extract_text()

        # Show word counts as a simple comparison
        print(f"Default grouping word count: {len(default_text.split())}")
        print(f"Spatial-only grouping word count: {len(spatial_text.split())}")
        print(f"Font+color grouping word count: {len(color_text.split())}")

        # Show sample of text differences
        print("\nText samples (first 200 chars):")
        print(f"Default: {default_text[:200]}...")
        print(f"Spatial: {spatial_text[:200]}...")
        print(f"Color-aware: {color_text[:200]}...")

    # Example 5: Detailed character-level analysis
    print("\n5. Character-level analysis:")

    # Get raw character data (query once; the slice and the total share it)
    all_chars = page.find_all('char')
    chars = all_chars[:5]  # First 5 characters
    print(f"Raw character elements ({len(chars)} of {len(all_chars)} total):")
    for char in chars:
        print(f" - {char}")

    # Show word elements too
    words = text_elements[:3]  # First 3 words
    print(f"\nWord elements ({len(words)} of {len(text_elements)} total):")
    for word in words:
        print(f" - {word}")
        print(f" Font info: {word.font_info()}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,124 @@
1
+ """
2
+ Example demonstrating font variant detection in Natural PDF.
3
+
4
+ This example shows how to identify and filter text elements by font variant
5
+ (the prefix in embedded font names, such as 'AAAAAB+FontName').
6
+ """
7
+ import os
8
+ import sys
9
+
10
+ # Add the parent directory to the path so we can import natural_pdf module
11
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ from natural_pdf import PDF
14
+
15
def main():
    """List the font variants on a PDF's first page and compare two of them.

    The PDF path is the first command-line argument; without one the script
    tries ``examples/sample.pdf`` and returns early if that file is missing.
    It counts text elements per ``font_variant``, filters by the first
    variant found and, when at least two variants exist, highlights both and
    saves the page to ``font_variants_highlight.png``.
    """
    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Use a default PDF path - you'll need to replace this with an actual PDF path
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Example 1: Identify different font variants on the page
    print("\n1. Identifying font variants")

    # Collect unique font variants, remembering the first element seen for each
    variants = {}
    for element in page.find_all('text'):
        variant = element.font_variant
        if not variant:
            continue
        if variant not in variants:
            variants[variant] = {
                'count': 0,
                'example': element.text,
                'fontname': element.fontname,
            }
        variants[variant]['count'] += 1

    # Display the variants found
    print(f"Found {len(variants)} font variants on the page:")
    for variant, info in variants.items():
        print(f" Variant: '{variant}'")
        print(f" Full fontname: {info['fontname']}")
        print(f" Count: {info['count']} elements")
        print(f" Example text: '{info['example']}'")

    # Example 2: Filter elements by font variant
    print("\n2. Filtering by font variant")

    if not variants:
        print("No font variants found to filter by")
        return

    # Select a variant to filter by (use the first one found)
    target_variant = next(iter(variants))
    print(f"Filtering for variant: '{target_variant}'")

    # Filter elements with this variant
    variant_elements = page.find_all(f'text[font-variant="{target_variant}"]')
    print(f"Found {len(variant_elements)} elements with this variant")

    # Display some examples; the slice already caps the listing at five
    for position, element in enumerate(variant_elements[:5], start=1):
        print(f" Element {position}: '{element.text}'")

    # Example 3: Compare visually similar texts with different variants
    print("\n3. Visual comparison of variants")

    # If we have multiple variants, compare the first two
    variant_list = list(variants)
    if len(variant_list) < 2:
        return
    variant_1, variant_2 = variant_list[:2]

    print(f"Comparing variant '{variant_1}' with '{variant_2}':")

    # Get elements from each variant
    elements_1 = page.find_all(f'text[font-variant="{variant_1}"]')
    elements_2 = page.find_all(f'text[font-variant="{variant_2}"]')

    # Highlight elements with different colors
    if elements_1:
        elements_1.highlight(color=(1, 0, 0), label=f"Variant {variant_1}")
    if elements_2:
        elements_2.highlight(color=(0, 1, 0), label=f"Variant {variant_2}")

    # Save the highlighted page
    # NOTE(review): sibling examples call page.save_image(...); confirm page.save exists.
    highlight_path = "font_variants_highlight.png"
    page.save(highlight_path, labels=True)
    print(f"Highlighted comparison saved to {highlight_path}")

    # Compare properties of the first element from each variant
    if elements_1 and elements_2:
        print("\nDetailed comparison of first elements from each variant:")

        # Print font info for each
        for variant, element in ((variant_1, elements_1[0]), (variant_2, elements_2[0])):
            print(f"\nVariant '{variant}' font info:")
            for key, value in element.font_info().items():
                print(f" {key}: {value}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,124 @@
1
+ """
2
+ Test for handling regions that overlap with a footer exclusion zone.
3
+ This is a focused test for the specific issue where regions that overlap with a footer
4
+ weren't returning any text.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ # Configure logging
13
+ import logging
14
+ logging.basicConfig(level=logging.INFO)
15
+
16
+ from natural_pdf import PDF
17
+
18
def main():
    """Main entry point.

    Checks three ways of extracting text from a region that overlaps a
    footer exclusion on the first page of a PDF: the default
    ``extract_text(apply_exclusions=True)``, a manually cropped region, and
    filtering page elements by their centre point. The PDF path is the first
    command-line argument, otherwise the alphabetically first ``*.pdf`` in
    the ``pdfs`` directory beside this file's parent directory; the script
    exits with status 1 when none is found. A labelled image is saved to
    ``output/footer_overlap_test.png`` relative to the working directory.
    """
    # Get the PDF path from command line or use a default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Look for any PDF in the pdfs directory
        pdfs_dir = Path(__file__).parent.parent / "pdfs"
        # glob() order is filesystem-dependent; sort for a reproducible default
        pdf_files = sorted(pdfs_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(pdf_files[0])

    print(f"\nTesting with PDF: {pdf_path}")

    # Create a PDF object
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Create ONLY a footer exclusion zone
    footer_height = page.height * 0.1  # Bottom 10% of the page
    footer_top = page.height - footer_height
    footer = page.create_region(0, footer_top, page.width, page.height)
    footer.highlight(label="Footer Exclusion", color=(1, 0, 0, 0.3))
    page.add_exclusion(footer)
    print(f"Added footer exclusion: {footer.bbox}")

    # Create a region that extends from middle of page to past the footer
    middle_to_footer = page.create_region(
        page.width * 0.25,  # 25% from left
        page.height * 0.5,  # 50% from top (middle of page)
        page.width * 0.75,  # 75% from left
        page.height,        # All the way to bottom (overlaps footer)
    )
    middle_to_footer.highlight(label="Middle to Footer", color=(0, 1, 0, 0.3))
    print(f"Created test region: {middle_to_footer.bbox}")

    # 1. Extract with exclusions using the default approach
    print("\n=== 1. Using Default Extraction ===")
    text = middle_to_footer.extract_text(apply_exclusions=True, debug=True)
    print(f"Text length: {len(text)}")
    print(f"First 100 chars: {text[:100] if text else 'No text!'}")

    # 2. Try direct cropping approach: same region, but stopping at the top
    # of the footer so no exclusion handling is needed.
    print("\n=== 2. Using Direct Crop Approach ===")
    cropped_region = page.create_region(
        middle_to_footer.x0,
        middle_to_footer.top,
        middle_to_footer.x1,
        footer_top,
    )
    cropped_region.highlight(label="Cropped Region", color=(0, 0, 1, 0.3))

    # Extract without applying exclusions (since we manually cropped)
    cropped_text = cropped_region.extract_text(apply_exclusions=False)
    print(f"Text length: {len(cropped_text)}")
    print(f"First 100 chars: {cropped_text[:100] if cropped_text else 'No text!'}")

    # 3. Get individual elements and extract text from them
    print("\n=== 3. Using Element Filtering Approach ===")

    # Keep elements whose centre lies in our region but NOT in the footer
    filtered_elements = []
    for element in page.get_elements():
        center_x = (element.x0 + element.x1) / 2
        center_y = (element.top + element.bottom) / 2
        in_region = (
            middle_to_footer.x0 <= center_x <= middle_to_footer.x1
            and middle_to_footer.top <= center_y <= middle_to_footer.bottom
        )
        in_footer = footer.top <= center_y <= footer.bottom
        if in_region and not in_footer:
            filtered_elements.append(element)

    # Extract text from the filtered elements
    filtered_text = " ".join(e.text for e in filtered_elements if hasattr(e, 'text'))
    print(f"Text length: {len(filtered_text)}")
    print(f"First 100 chars: {filtered_text[:100] if filtered_text else 'No text!'}")

    # Save the visualization; create the target directory first so the save
    # does not depend on a pre-existing "output" folder in the working directory.
    os.makedirs("output", exist_ok=True)
    page.save_image("output/footer_overlap_test.png", labels=True)
    print("\nTest visualization saved to output/footer_overlap_test.png")

    # Provide a summary: an approach "works" when it returned any text
    print("\nTEST SUMMARY:")
    outcomes = [
        (
            text,
            "✅ Default extraction works now with overlapping exclusions!",
            "❌ Default extraction still fails with overlapping exclusions!",
        ),
        (
            cropped_text,
            "✅ Manual cropping approach works!",
            "❌ Manual cropping approach fails!",
        ),
        (
            filtered_text,
            "✅ Element filtering approach works!",
            "❌ Element filtering approach fails!",
        ),
    ]
    for extracted, success_message, failure_message in outcomes:
        print(success_message if len(extracted) > 0 else failure_message)


if __name__ == "__main__":
    main()