natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,149 @@
1
+ """
2
+ Test the improved exclusion handling in Region.extract_text() method.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ # Add parent directory to path for imports
11
+ sys.path.insert(0, str(Path(__file__).parent.parent))
12
+
13
+ from natural_pdf import PDF, configure_logging
14
+
15
+ # Configure logging
16
+ configure_logging(level=logging.DEBUG)
17
+
18
+
19
def test_region_with_exclusions(pdf_path):
    """
    Exercise Region.extract_text() against page-level exclusion zones.

    Opens ``pdf_path``, registers a header, a footer and a partial-width side
    exclusion on the first page, then extracts text from several regions both
    with and without ``apply_exclusions`` and prints the resulting lengths.
    A labelled visualization of every highlighted region is finally saved as
    ``region_exclusion_test.png`` in the ``output`` directory next to this
    script's parent folder.
    """

    def extract_both(region):
        # Exclusion-aware extraction (with debug output) always runs first.
        kept = region.extract_text(apply_exclusions=True, debug=True)
        everything = region.extract_text(apply_exclusions=False)
        return kept, everything

    def show_lengths(heading, kept, everything):
        print(heading)
        print(f" - With exclusions: {len(kept)} chars")
        print(f" - Without exclusions: {len(everything)} chars")

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]
        print(f"\nTesting with PDF: {pdf_path} (page {page.number})")

        print("\n=== 1. Creating Test Exclusion Zones ===")
        # (kind, bbox, highlight label, highlight colour)
        exclusion_specs = [
            # Top 15% of the page
            ("header", (0, 0, page.width, page.height * 0.15),
             "Header Exclusion", (1, 0, 0, 0.3)),
            # Bottom 10% of the page
            ("footer", (0, page.height * 0.9, page.width, page.height),
             "Footer Exclusion", (0, 0, 1, 0.3)),
            # Middle partial-width strip on the left side
            ("side", (0, page.height * 0.4, page.width * 0.3, page.height * 0.6),
             "Side Exclusion", (0, 1, 0, 0.3)),
        ]
        for kind, bbox, label, color in exclusion_specs:
            zone = page.create_region(*bbox)
            zone.highlight(label=label, color=color)
            page.add_exclusion(zone)
            print(f"Added {kind} exclusion: {zone.bbox}")

        print("\n=== 2. Testing Region That Doesn't Intersect Exclusions ===")
        # This region is placed clear of all three exclusion zones, so both
        # extractions are expected to agree.
        clear_region = page.create_region(
            page.width * 0.4,
            page.height * 0.5,
            page.width * 0.9,
            page.height * 0.7,
        )
        clear_region.highlight(label="Non-Intersecting", color=(1, 1, 0, 0.3))
        kept, everything = extract_both(clear_region)
        show_lengths("Non-intersecting region text length:", kept, everything)
        print(f" - Same result: {kept == everything}")

        print("\n=== 3. Testing Region With Header/Footer Intersection ===")
        # A column spanning the full page height crosses header and footer.
        tall_region = page.create_region(
            page.width * 0.3,
            0,
            page.width * 0.8,
            page.height,
        )
        tall_region.highlight(label="Full Height Region", color=(1, 0, 1, 0.3))
        kept, everything = extract_both(tall_region)
        show_lengths("Full height region text length:", kept, everything)
        print(f" - Exclusions removed {len(everything) - len(kept)} chars")

        # The specific case that was causing issues: starts mid-page and
        # runs all the way to the bottom, overlapping the footer.
        lower_region = page.create_region(
            page.width * 0.3,
            page.height * 0.4,
            page.width * 0.8,
            page.height,
        )
        lower_region.highlight(label="Middle to Footer", color=(0.5, 0.5, 0, 0.3))
        kept, everything = extract_both(lower_region)
        show_lengths("\nMiddle-to-footer region text length:", kept, everything)
        if len(kept) == 0:
            print(" - Still failing! No content found with exclusions applied")
        else:
            print(" - Working correctly! Content found with exclusions applied")

        print("\n=== 4. Testing Region With Complex Exclusion Intersection ===")
        # Overlaps the partial-width side exclusion.
        overlapping_region = page.create_region(
            page.width * 0.1,
            page.height * 0.3,
            page.width * 0.5,
            page.height * 0.7,
        )
        overlapping_region.highlight(label="Complex Region", color=(0, 1, 1, 0.3))
        kept, everything = extract_both(overlapping_region)
        show_lengths("Complex region text length:", kept, everything)
        print(f" - Exclusions removed {len(everything) - len(kept)} chars")

        # Save one image showing every highlighted region and exclusion.
        print("\n=== 5. Saving Visual Test Image ===")
        script_dir = os.path.dirname(__file__)
        output_dir = os.path.abspath(os.path.join(script_dir, '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, "region_exclusion_test.png")
        page.save_image(output_file, labels=True)
        print(f"Saved test visualization to: {output_file}")
127
+
128
+
129
def main():
    """
    Entry point: pick a PDF and run the exclusion test on it.

    Uses the first command-line argument when one is given; otherwise takes
    the first ``*.pdf`` found in the ``pdfs`` directory beside this script's
    parent folder. Exits with status 1 if no PDF can be located.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # Fall back to whichever PDF the pdfs directory yields first.
        candidates = (Path(__file__).parent.parent / "pdfs").glob("*.pdf")
        first_pdf = next(candidates, None)
        if first_pdf is None:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(first_pdf)

    test_region_with_exclusions(pdf_path)


if __name__ == "__main__":
    main()
@@ -0,0 +1,109 @@
1
+ """
2
+ Example demonstrating the region.expand() method in Natural PDF.
3
+
4
+ This example shows how to expand or shrink regions in various ways.
5
+ """
6
+ import os
7
+ import sys
8
+
9
+ # Add the parent directory to the path so we can import natural_pdf module
10
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11
+
12
+ from natural_pdf import PDF
13
+
14
def main():
    """
    Demonstrate ``region.expand()`` on the first page of a PDF.

    The PDF path comes from the first command-line argument. Without one, a
    sample path relative to the current working directory is tried, and the
    function prints a hint and returns if that file does not exist.

    Four scenarios are shown: per-side expansion and shrinking, width/height
    factors, expanding a region obtained through spatial navigation, and a
    highlighted visualization written to ``region_expand_highlight.png``.
    """
    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = "pdfs/Atlanta_Public_Schools_GA_sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    # Open through the context manager so the PDF is released on every exit
    # path, including the early return below when the page has no text.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Example 1: Basic expansion in different directions
        print("\n1. Basic region expansion")

        # Find a text element to start with
        first_text = page.find('text')
        if not first_text:
            print("No text found on page")
            return

        # Create a region from the text element (its bounding box)
        region = page.create_region(
            first_text.x0, first_text.top, first_text.x1, first_text.bottom
        )
        print(f"Original region: {region.bbox}")

        # Expand the region in different directions
        expanded_right = region.expand(right=50)
        print(f"Expanded right by 50: {expanded_right.bbox}")

        expanded_all = region.expand(left=10, right=20, top_expand=15, bottom_expand=25)
        print(f"Expanded in all directions: {expanded_all.bbox}")

        # Shrink the region with negative values
        shrunk = region.expand(left=-5, right=-5, top_expand=-2, bottom_expand=-2)
        print(f"Shrunk with negative values: {shrunk.bbox}")

        # Example 2: Using expansion factors
        print("\n2. Expansion with factors")

        # Double the width
        double_width = region.expand(width_factor=2.0)
        print(f"Double width (width_factor=2.0): {double_width.bbox}")

        # Increase height by 50%
        taller = region.expand(height_factor=1.5)
        print(f"50% taller (height_factor=1.5): {taller.bbox}")

        # Both width and height factors
        bigger = region.expand(width_factor=1.5, height_factor=1.25)
        print(f"Wider and taller: {bigger.bbox}")

        # Example 3: Combining with spatial navigation
        print("\n3. Combining with spatial navigation")

        # Use text of size >= 12 as a stand-in for a heading
        heading = page.find('text[size>=12]')
        if heading:
            print(f"Found heading: '{heading.text}'")

            # Create a region below the heading and expand it
            content_region = heading.below(height=100, full_width=False)
            print(f"Region below heading: {content_region.bbox}")

            # Expand the region to include more content
            expanded_region = content_region.expand(right=100, bottom_expand=50)
            print(f"Expanded region: {expanded_region.bbox}")

            # Extract text from the expanded region (kept in its own name so
            # it no longer overwrites the element found in Example 1)
            extracted = expanded_region.extract_text()
            print(f"Text in expanded region: {extracted[:100]}...")

        # Example 4: Visual demonstration with highlighting
        print("\n4. Visual demonstration with highlighting")

        # Choose a region to work with
        demo_region = page.create_region(100, 100, 300, 200)

        # Highlight the original region
        demo_region.highlight(color=(1, 0, 0), label="Original")

        # Highlight expanded versions with different colors
        demo_region.expand(left=20, right=20).highlight(color=(0, 1, 0), label="Wider")
        demo_region.expand(top_expand=20, bottom_expand=20).highlight(
            color=(0, 0, 1), label="Taller"
        )
        demo_region.expand(width_factor=1.5, height_factor=1.5).highlight(
            color=(1, 0.5, 0), label="1.5x Larger"
        )

        # Save the highlighted page
        highlight_path = "region_expand_highlight.png"
        page.to_image(path=highlight_path, show_labels=True)
        print(f"Highlighted regions saved to {highlight_path}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,116 @@
1
+ """
2
+ Example demonstrating the new region.to_image() and region.save_image() functionality.
3
+
4
+ This example shows how to:
5
+ 1. Create regions in various ways
6
+ 2. Generate images of just the region
7
+ 3. Save region images to files
8
+ 4. Compare different rendering options
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ import argparse
14
+
15
+ # Add parent directory to path to run without installing
16
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17
+
18
+ from natural_pdf import PDF
19
+
20
def main():
    """
    Demonstrate ``region.save_image()`` with several rendering options.

    Parses an optional positional ``pdf_path`` argument (defaulting to
    ``../pdfs/0500000US42001.pdf``), builds regions from the first page in
    three ways (below a text element, from explicit coordinates, and from
    layout analysis) and writes PNG files into an ``output`` directory under
    the current working directory.

    Returns early with a message if the page contains no text element to
    anchor the first region on.
    """
    parser = argparse.ArgumentParser(description="Region Image Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    args = parser.parse_args()

    print(f"Opening PDF: {args.pdf_path}")

    # Open through the context manager so the PDF is released on every exit
    # path, including the early return when no text is found.
    with PDF(args.pdf_path) as pdf:
        page = pdf.pages[0]

        # Create output directory
        os.makedirs("output", exist_ok=True)

        # Method 1: Find a text element and create a region below it
        print("Creating regions...")
        title = page.find('text:bold')
        if not title:
            title = page.find('text')
        if not title:
            # Neither selector matched; title.below() would fail on a
            # missing element, so stop here instead.
            print("No text found on page")
            return

        region_below = title.below(height=100, width="element")

        # Method 2: Create a region from a specific part of the page
        page_width, page_height = page.width, page.height
        center_region = page.create_region(
            page_width / 4,       # Left edge at one quarter of the width
            page_height / 4,      # Top edge at one quarter of the height
            page_width * 3 / 4,   # Right edge at three quarters of the width
            page_height * 3 / 4,  # Bottom edge at three quarters of the height
        )

        # Method 3: Use layout detection to find regions
        page.analyze_layout(confidence=0.3)
        layout_regions = page.find_all('region')

        # Generate and save images for each region
        print("Generating region images...")

        # Example 1: Basic region image with default settings
        region_below.save_image("output/region_below.png")
        print("Saved basic region image to output/region_below.png")

        # Example 2: Region image with highlighted content
        # First highlight some elements in the region
        elements = region_below.find_all('text')
        if elements:
            elements[0].highlight(color=(1, 0, 0, 0.3), label="First Element")

        # Save with highlights included
        region_below.save_image(
            "output/region_with_highlights.png",
            include_highlights=True,
        )
        print("Saved region with highlights to output/region_with_highlights.png")

        # Save without highlights
        region_below.save_image(
            "output/region_without_highlights.png",
            include_highlights=False,
        )
        print("Saved region without highlights to output/region_without_highlights.png")

        # Example 3: Region image with and without border
        center_region.save_image(
            "output/center_region_with_border.png"
        )
        print("Saved center region with border to output/center_region_with_border.png")

        center_region.save_image(
            "output/center_region_without_border.png",
            crop_only=True,
        )
        print("Saved center region without border to output/center_region_without_border.png")

        # Example 4: Layout region rendered at two resolutions
        if layout_regions:
            first_layout = layout_regions[0]
            first_layout.highlight(label=f"Region Type: {first_layout.region_type}")

            # Save at different resolutions
            first_layout.save_image(
                "output/layout_region_low_res.png",
                resolution=72,
            )
            print("Saved layout region at 72 DPI to output/layout_region_low_res.png")

            first_layout.save_image(
                "output/layout_region_high_res.png",
                resolution=300,
            )
            print("Saved layout region at 300 DPI to output/layout_region_high_res.png")

    print("\nDone! Check the output directory for the generated images.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,119 @@
1
+ """
2
+ Test to identify and fix issues with region-specific OCR.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+ from PIL import Image, ImageDraw
12
+
13
def test_region_ocr():
    """
    Run OCR on a centred region of a sample PDF and write debug images.

    Looks for one of two known sample PDFs in the ``pdfs`` directory beside
    this script's parent folder and returns with a message if neither exists.
    For the first page it saves the full page, the page with the test region
    highlighted, the cropped region, and a copy annotated with the bounding
    boxes and leading text of every OCR element, all into ``output``.
    """
    base_dir = os.path.dirname(__file__)

    # Preferred sample first, fallback second.
    sample_names = ('Nigeria 2021_MICS_SFR_English.pdf', '0500000US42001.pdf')
    for sample_name in sample_names:
        pdf_path = os.path.abspath(os.path.join(base_dir, '..', 'pdfs', sample_name))
        if os.path.exists(pdf_path):
            break
    else:
        print("No suitable PDF file found for region OCR testing.")
        return

    print(f"Testing with PDF: {pdf_path}")

    # Output directory
    output_dir = os.path.abspath(os.path.join(base_dir, '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    def out_path(filename):
        return os.path.join(output_dir, filename)

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Save the entire page image for reference (return value not needed).
        page.to_image(path=out_path("region_ocr_full_page.png"))

        # Region centred on the page, one third of the page in each dimension.
        center_x = page.width / 2
        center_y = page.height / 2
        box_width = page.width / 3
        box_height = page.height / 3
        region = page.create_region(
            center_x - box_width/2,
            center_y - box_height/2,
            center_x + box_width/2,
            center_y + box_height/2,
        )

        # Highlight the region and save the labelled page.
        region.highlight(label="OCR Test Region")
        page.to_image(path=out_path("region_ocr_highlighted.png"), show_labels=True)

        # Extract text from the region without and then with OCR.
        plain_text = region.extract_text()
        ocr_text = region.extract_text(ocr=True)

        for heading, body in (
            ("\nRegion Text WITHOUT OCR:", plain_text),
            ("\nRegion Text WITH OCR:", ocr_text),
        ):
            print(heading)
            print("-" * 40)
            print(body)

        # Apply OCR to the region and visualize the results.
        ocr_elements = region.apply_ocr(enabled=True)
        print(f"\nFound {len(ocr_elements)} OCR elements in the region")

        # NOTE(review): the crop box and the rectangles drawn below use the
        # region's/elements' page coordinates directly as pixel coordinates;
        # this assumes page.to_image() renders at one pixel per page unit -
        # TODO confirm.
        region_box = (region.x0, region.top, region.x1, region.bottom)

        # Cropped region image for reference.
        page.to_image().crop(region_box).save(out_path("region_ocr_cropped.png"))

        # Debug image showing OCR bounding boxes.
        debug_img = page.to_image()
        canvas = ImageDraw.Draw(debug_img)

        # Region outline in red.
        canvas.rectangle(region_box, outline=(255, 0, 0), width=3)

        for element in ocr_elements:
            # Element outline in green.
            canvas.rectangle(
                (element.x0, element.top, element.x1, element.bottom),
                outline=(0, 255, 0),
                width=2,
            )
            # First ten characters of the recognized text, just above the box.
            canvas.text(
                (element.x0, element.top - 10),
                element.text[:10],
                fill=(0, 0, 255),
            )

        debug_img.save(out_path("region_ocr_debug.png"))

        print(f"\nCreated debug images in: {output_dir}")
        print("- region_ocr_full_page.png: Original page")
        print("- region_ocr_highlighted.png: Page with region highlighted")
        print("- region_ocr_cropped.png: Cropped region image")
        print("- region_ocr_debug.png: Page with OCR text bounding boxes")


if __name__ == "__main__":
    test_region_ocr()
@@ -0,0 +1,115 @@
1
+ """
2
+ Example demonstrating the get_sections() method on regions in Natural PDF.
3
+
4
+ This example shows how to extract logical sections from regions
5
+ using various types of boundary elements.
6
+ """
7
+ import os
8
+ import sys
9
+
10
+ # Add the parent directory to the path so we can import natural_pdf module
11
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12
+
13
+ from natural_pdf import PDF
14
+
15
def main():
    """
    Demonstrate ``get_sections()`` on regions of the first page of a PDF.

    The PDF path comes from the first command-line argument. Without one,
    ``examples/sample.pdf`` (relative to the current working directory) is
    tried, and the function prints a hint and returns if it does not exist.

    Three scenarios are shown: sections in the top half delimited by line
    elements, sections in the bottom half started by text of size >= 12, and
    sections in an inset region using start selectors alone versus start and
    end selectors together.
    """
    # If a PDF path is provided, use it; otherwise use the default example
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Default PDF path - replace with an actual PDF path if needed
        pdf_path = "examples/sample.pdf"
        if not os.path.exists(pdf_path):
            print(f"Default PDF not found at {pdf_path}")
            print("Please provide a PDF path as an argument")
            return

    print(f"Processing PDF: {pdf_path}")
    # Open through the context manager so the PDF is released when the
    # demonstration finishes or fails part-way.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Example 1: Get sections within a region using separators
        print("\n1. Get sections within a region using separators")

        # First, create a region from the top half of the page
        top_half = page.create_region(0, 0, page.width, page.height / 2)
        print(f"Created region: {top_half.bbox}")

        # Method 1: Find elements first, then pass them to get_sections
        lines = top_half.find_all('line')
        print(f"Found {len(lines)} line elements in the region")

        # Extract sections using lines as start elements
        sections1 = top_half.get_sections(start_elements=lines)
        print(f"Found {len(sections1)} sections using explicit elements")

        # Method 2: Pass selector directly to start_elements
        sections2 = top_half.get_sections(start_elements='line')
        print(f"Found {len(sections2)} sections using selector string")

        # Display section details
        for i, section in enumerate(sections2):
            text = section.extract_text()
            text_snippet = text[:50] + "..." if len(text) > 50 else text
            print(f" Section {i+1}: {section.bbox}, Text: {text_snippet}")

        # Example 2: Get sections within a region using start/end elements
        print("\n2. Get sections within a region using start/end elements")

        # Create a region from the bottom half of the page
        bottom_half = page.create_region(0, page.height / 2, page.width, page.height)
        print(f"Created region: {bottom_half.bbox}")

        # Method 1: Find heading elements first, then pass them to get_sections (old way)
        headings = bottom_half.find_all('text[size>=12]')
        print(f"Found {len(headings)} potential headings in the region")

        # Use headings as start elements and extract sections (old way)
        sections1 = bottom_half.get_sections(start_elements=headings)
        print(f"Found {len(sections1)} sections using explicit elements")

        # Method 2: Pass selector directly to start_elements (new way)
        sections2 = bottom_half.get_sections(start_elements='text[size>=12]')
        print(f"Found {len(sections2)} sections using selector string")

        # Display section details
        for i, section in enumerate(sections2):
            start_element = section.start_element
            start_text = start_element.text if start_element else "None"

            text = section.extract_text()
            text_snippet = text[:50] + "..." if len(text) > 50 else text

            print(f" Section {i+1} (starts with '{start_text}'): {text_snippet}")

        # Example 3: Use selectors within a region
        print("\n3. Get sections using selectors within a region")

        # Create a region inset 50 units from every page edge
        center = page.create_region(50, 50, page.width - 50, page.height - 50)

        # Get sections with start elements
        sections1 = center.get_sections(
            start_elements='text[size>=12]'  # Large text as section starts
        )

        # Get sections with both start and end elements
        sections2 = center.get_sections(
            start_elements='text[size>=12]',  # Large text as section starts
            end_elements='line[width>=1]'     # Thick lines as section ends
        )

        # NOTE(review): sections1 omits end_elements while sections2 supplies
        # them, so the three lines below compare two different queries rather
        # than two spellings of the same one - confirm the intended wording.
        print(f"Found {len(sections1)} sections using traditional selectors")
        print(f"Found {len(sections2)} sections using direct selector strings")
        print(f"Both approaches match: {len(sections1) == len(sections2)}")

        # Display section details for the start-and-end query
        for i, section in enumerate(sections2):
            text = section.extract_text()
            text_snippet = text[:50] + "..." if len(text) > 50 else text
            print(f" Section {i+1}: {text_snippet}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,49 @@
1
+ """
2
+ Example demonstrating section extraction with the get_sections method.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+ pdf = PDF("./pdfs/Atlanta_Public_Schools_GA_sample.pdf")
15
+
16
+ page = pdf.pages[0]
17
+ day_sections = page.get_sections(start_elements='line[width>=2]')
18
+
19
+ for day in day_sections:
20
+ date = day.find('text').text
21
+ book_sections = day.get_sections(start_elements='text:contains("(Removed:")')
22
+ for j, book in enumerate(book_sections):
23
+ print("-----")
24
+ if book.height < 30:
25
+ print("Not a book, skipping")
26
+ continue
27
+ book.highlight(label=f"Day {date} section {j}")
28
+
29
+ title = book.find_all('text[font_variant="AAAAAB"][size>=10]')
30
+ title.highlight(label='Title')
31
+
32
+ price = book.find('text:contains("Price")').below(height=15, width="element").expand(right=30)
33
+ price.highlight(label='Price')
34
+
35
+ acquired = book.find('text:contains("Acquired")').below(height=15, width="element").expand(right=30)
36
+ acquired.highlight(label='Acquired')
37
+
38
+ removed_by = book.find('text[size<10]:contains("Removed")').below(height=17, width="element").expand(right=60)
39
+ removed_by.highlight(label='Removed By')
40
+
41
+ data = {
42
+ 'Title': title.extract_text(),
43
+ 'Price': price.extract_text(),
44
+ 'Acquired': acquired.extract_text(),
45
+ 'Removed By': removed_by.extract_text()
46
+ }
47
+ print(data)
48
+
49
+ page.save("highlight.png", show_labels=True)