natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
examples/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """
2
+ Examples for natural-pdf.
3
+ """
examples/another_exclusion_example.py ADDED
@@ -0,0 +1,20 @@
1
+ """
2
+ Example demonstrating how to use exclusion zones in Natural PDF.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
def _region_above_first_line(page):
    """Return the region above whatever ``page.find('line')`` yields."""
    return page.find('line').above()


def _region_below_last_line(page):
    """Return the region below the last element of ``page.find_all('line')``."""
    return page.find_all('line')[-1].below()


# Register both exclusion callbacks on the document, then highlight the
# text elements of the third page and save the result.
pdf = PDF('pdfs/Atlanta_Public_Schools_GA_sample.pdf')
pdf.add_exclusion(_region_above_first_line)
pdf.add_exclusion(_region_below_last_line)

page = pdf.pages[2]
text_elements = page.find_all('text')
text_elements.highlight()
page.save('test.png', labels=True)
20
+
examples/basic_usage.py ADDED
@@ -0,0 +1,190 @@
1
+ """
2
+ Basic usage examples for natural-pdf.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def basic_example(pdf_path):
    """Walk through the main natural-pdf features on the first page of a PDF.

    Opens ``pdf_path`` with ``reading_order=True`` and prints, in order:
    plain and layout text extraction, element counts, selector lookups,
    a few sample words, dotted numeric "statute" codes, the text found
    below the first thick line, words containing "Critical" together with
    the leftmost word on their line, and some illustrative API snippets.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    import re  # function-local, as in the original example

    # Dotted numeric codes (digits.digits.digits); compiled once and reused.
    statute_pattern = re.compile(r'\d+\.\d+\.\d+')
    # Two words count as "on the same line" when their tops differ by < 5.
    same_line_tolerance = 5

    def words_left_of(anchor, words):
        """Return words on the same line as ``anchor`` that start left of it."""
        return [
            w for w in words
            if abs(w.top - anchor.top) < same_line_tolerance and w.x0 < anchor.x0
        ]

    # Open the PDF
    with PDF(pdf_path, reading_order=True) as pdf:
        # Get basic information
        print(f"PDF has {len(pdf)} pages")

        page = pdf.pages[0]

        # First, display the PDF structure with simple text extraction
        print("\nBASIC TEXT EXTRACTION:")
        page_text = page.extract_text()
        print(page_text[:500] + "...")

        print("\nWITH LAYOUT: ")
        page_text = page.extract_text(layout=True)
        print(page_text[:2000] + "...")

        # Direct demonstration of PDF features
        print("\nDEMONSTRATING NATURAL PDF FEATURES:")

        # 1. Display document structure
        print("\n1. DOCUMENT STRUCTURE:")

        # Count different types of elements
        words = page.words
        print(f" - {len(words)} words")
        print(f" - {len(page.lines)} lines")
        print(f" - {len(page.rects)} rectangles")

        # 2. Extract specific text using extract_text
        print("\n2. EXTRACT TEXT FROM DOCUMENT:")
        print(f" Full document: {len(pdf.extract_text())} characters")
        print(f" First page: {len(page.extract_text())} characters")

        # 3. Find elements with specific properties
        print("\n3. FIND ELEMENTS WITH SPECIFIC PROPERTIES:")

        # Find the thick horizontal line
        thick_lines = pdf.find_all('line[width>=2]')
        if thick_lines:
            print(f" Found thick line: {thick_lines[0].bbox}")

        # Find text with a specific pattern
        site_text = [w for w in words if w.text.startswith("Site:")]
        if site_text:
            print(f" Site info: {site_text[0].text}")

        # Display some example words
        print("\n4. SAMPLE WORDS:")
        for i, word in enumerate(words[:5]):
            print(f" - Word {i}: '{word.text}'")

        # Find all statute codes using regex pattern matching
        print("\n5. FIND STATUTE CODES:")
        statute_codes = [w.text for w in words if statute_pattern.match(w.text)]

        print(f" Found {len(statute_codes)} statute codes:")
        for code in statute_codes[:3]:
            print(f" - {code}")

        # Demonstrate spatial relationships with fluent API
        print("\n6. SPATIAL RELATIONSHIPS WITH FLUENT API:")

        # Find the line with width >= 2
        thick_line = pdf.find('line[width>=2]')
        if thick_line:
            print(f" Found thick line at y={thick_line.top}")

            # Use the below() method to create a region below the line
            # Specify width="full" for full page width
            below_region = thick_line.below(height=50, width="full")

            # Extract text from this region
            region_text = below_region.extract_text(preserve_whitespace=True)

            # Print the first part of the text
            print(f" Text from region below line: {region_text[:30]}...")

            # We can also use find_all on the region to get elements in that region
            words_below = below_region.find_all('text')
            if words_below:
                print(f" Found {len(words_below)} text elements below the line")
                # Show the first few words
                first_few = [w.text for w in words_below[:3]]
                print(f" First few words: {' '.join(first_few)}")

        # Find critical violations
        print("\n7. FIND CRITICAL VIOLATIONS:")

        # Use simple word search with filtering
        critical_words = [w for w in words if "Critical" in w.text]

        if critical_words:
            print(f" Found {len(critical_words)} critical items")

            # For each critical item, report the leftmost word (smallest x0)
            # among the words on the same line that start to its left.
            for critical in critical_words:
                candidates = words_left_of(critical, words)
                if candidates:
                    leftmost = min(candidates, key=lambda w: w.x0)
                    print(f" - {leftmost.text}")

            # Get statutes with critical violations: for each critical item take
            # the first same-line word to its left that looks like a statute code.
            critical_statutes = []
            for critical in critical_words:
                for candidate in words_left_of(critical, words):
                    if statute_pattern.match(candidate.text):
                        critical_statutes.append(candidate.text)
                        break

            if critical_statutes:
                print(f" Critical violations for statutes: {', '.join(critical_statutes)}")

        # Example of the intended fluent API (even if not all parts work yet)
        print("\n8. FLUENT API EXAMPLES (HOW THE LIBRARY IS INTENDED TO BE USED):")

        print(" Example 1: Find thick lines and extract text below them")
        print(" ```python")
        print(" thick_line = pdf.find('line[width>=2]')")
        print(" text_below = thick_line.below(height=50, width='full').find_all('text')")
        print(" for text in text_below[:3]:")
        print(" print(text.text)")
        print(" ```")

        print("\n Example 2: Find critical violations and their codes")
        print(" ```python")
        print(" critical_items = pdf.find_all('text:contains(\"Critical\")')")
        print(" for item in critical_items:")
        print(" # Find codes on the same line")
        print(" codes = pdf.find_all(f'text:matches(\"\\d+\\.\\d+\\.\\d+\")[top~={item.top}]')")
        print(" if codes:")
        print(" print(f\"Critical violation: {codes[0].text}\")")
        print(" ```")

        print("\n Example 3: Extract a table")
        print(" ```python")
        print(" # Find the table header")
        print(" header = pdf.find('text:contains(\"Statute\")')")
        print(" # Select the entire table region")
        print(" table_region = header.until('text:contains(\"Jungle Health\")')")
        print(" # Extract the table as data")
        print(" table_data = table_region.extract_tables()[0]")
        print(" ```")
173
+
174
if __name__ == "__main__":
    # Pick the PDF to process: the first CLI argument when given, otherwise
    # the bundled practice PDF that lives in ../pdfs next to this script.
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
        not_found_messages = [f"File not found: {pdf_path}"]
    else:
        examples_dir = os.path.dirname(__file__)
        pdf_path = os.path.abspath(
            os.path.join(examples_dir, '..', 'pdfs', '01-practice.pdf')
        )
        not_found_messages = [
            "Example PDF not found. Please provide a path to a PDF file.",
            "Usage: python basic_usage.py [path/to/file.pdf]",
        ]

    # Bail out with the matching message(s) when the chosen file is missing.
    if not os.path.exists(pdf_path):
        for message in not_found_messages:
            print(message)
        sys.exit(1)

    basic_example(pdf_path)
examples/boundary_exclusion_test.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ Test for boundary element exclusion with real PDFs.
3
+ This test focuses on the boundary_inclusion parameter of get_sections.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from natural_pdf import PDF
9
+
10
def main():
    """Check how ``boundary_inclusion`` affects sections built from headings.

    Uses the PDF given as the first command-line argument, or the
    alphabetically first ``*.pdf`` in the project's ``pdfs`` directory.
    Picks heading-like elements on the first page, builds sections with
    ``boundary_inclusion`` set to 'none', 'start' and 'both', and prints
    whether each of the first three sections contains its boundary element,
    followed by element counts and a PASS/FAIL summary.

    Exits with status 1 when no usable PDF can be found.
    """
    # Get path to PDF file, use default if not provided
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"Error: File {pdf_path} not found")
            sys.exit(1)
    else:
        # Just use a default PDF from the pdfs directory
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        pdfs_dir = os.path.join(project_root, "pdfs")
        try:
            # os.listdir returns names in arbitrary order; sort so the same
            # default PDF is chosen on every run.
            pdf_files = sorted(f for f in os.listdir(pdfs_dir) if f.endswith('.pdf'))
        except (FileNotFoundError, NotADirectoryError):
            # A missing pdfs directory is reported like an empty one.
            pdf_files = []
        if not pdf_files:
            print("No PDF files found in the pdfs directory")
            sys.exit(1)

        pdf_path = os.path.join(pdfs_dir, pdf_files[0])

    print(f"Using PDF: {pdf_path}")

    # Open the PDF
    pdf = PDF(pdf_path)

    # Use the first page for testing
    page = pdf.pages[0]

    # Find elements to use as section boundaries: start with large text and
    # fall back to smaller text, then bold text, while fewer than 3 are found.
    headings = page.find_all('text[size>=14]')
    for fallback_selector in ('text[size>=12]', 'text:bold'):
        if len(headings) >= 3:
            break
        headings = page.find_all(fallback_selector)

    # If still not enough, use the first 5 text elements
    if len(headings) < 3:
        headings = page.find_all('text')[:5]

    print(f"Found {len(headings)} potential section boundaries")
    for i, h in enumerate(headings[:5]):
        print(f"Boundary {i+1}: {h.text}")

    # Create different sections with different boundary_inclusion settings
    none_sections = page.get_sections(start_elements=headings, boundary_inclusion='none')
    start_sections = page.get_sections(start_elements=headings, boundary_inclusion='start')
    both_sections = page.get_sections(start_elements=headings, boundary_inclusion='both')
    labelled_sections = (
        ("None", none_sections),
        ("Start", start_sections),
        ("Both", both_sections),
    )

    print("\nTesting boundary element inclusion/exclusion:")

    # Check if the boundary elements are included correctly
    for label, sections in labelled_sections:
        for i, section in enumerate(sections[:3]):
            if i >= len(headings):
                break
            found = section._is_element_in_region(headings[i])
            print(f"{label} Section {i+1}: Contains boundary element: {found}")

    # Check section element counts
    print("\nElement counts in sections:")
    for label, sections in labelled_sections:
        for i, section in enumerate(sections[:3]):
            elements = section.get_elements()
            print(f"{label} Section {i+1}: {len(elements)} elements")

    # Summarize test results:
    # 1. 'none' sections exclude their boundary elements
    # 2. 'start' sections include their boundary elements
    # 3. 'both' sections include their boundary elements
    none_correct = all(
        not section._is_element_in_region(headings[i])
        for i, section in enumerate(none_sections[:3])
        if i < len(headings)
    )

    def start_elements_included(sections):
        """True when every non-empty section contains its own start_element."""
        return all(
            (section.start_element is None)
            or section._is_element_in_region(section.start_element)
            for section in sections[:3]
            if section.get_elements()  # Skip empty sections
        )

    start_correct = start_elements_included(start_sections)
    both_correct = start_elements_included(both_sections)

    print("\nTest Results Summary:")
    print(f"- 'none' excludes boundary elements: {'PASS' if none_correct else 'FAIL'}")
    print(f"- 'start' includes boundary elements: {'PASS' if start_correct else 'FAIL'}")
    print(f"- 'both' includes boundary elements: {'PASS' if both_correct else 'FAIL'}")

    if none_correct and start_correct and both_correct:
        print("\n✅ All tests PASSED!")
    else:
        print("\n❌ Some tests FAILED!")
135
+
136
+ if __name__ == "__main__":
137
+ main()
examples/boundary_inclusion_fix_test.py ADDED
@@ -0,0 +1,157 @@
1
+ """
2
+ Example demonstrating the fixed boundary inclusion behavior in the get_sections method.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from natural_pdf import PDF
8
+
9
def main():
    """Demonstrate ``boundary_inclusion`` handling in ``page.get_sections``.

    Uses the PDF given as the first command-line argument, or
    ``pdfs/2019 Statistics.pdf`` under the project root. Requires at least
    three ``text[size>=12]`` elements on the first page, builds sections
    with ``boundary_inclusion`` 'none', 'start' and 'both', prints a report
    for the first three sections of each, saves a highlighted page image to
    ``output/all_elements.png`` and prints a Success/Failure summary.

    Exits with status 1 when the PDF is missing or has too few headings.
    """
    # Get path to PDF file, use default if not provided
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"Error: File {pdf_path} not found")
            sys.exit(1)
    else:
        # Use a default PDF from the pdfs directory
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        pdf_path = os.path.join(parent_dir, "pdfs", "2019 Statistics.pdf")
        if not os.path.exists(pdf_path):
            print(f"Error: Default file {pdf_path} not found")
            sys.exit(1)

    # Open the PDF
    pdf = PDF(pdf_path)
    page = pdf.pages[0]  # Use the first page

    # Find some elements to use as section boundaries
    headings = page.find_all('text[size>=12]')

    if len(headings) < 3:
        print(f"Not enough headings found on the first page. Found: {len(headings)}")
        sys.exit(1)

    print(f"Found {len(headings)} headings")
    for i, heading in enumerate(headings[:5]):  # Show first 5 headings
        print(f"Heading {i+1}: {heading.text}")

    # Create sections with different boundary inclusion settings
    sections_none = page.get_sections(
        start_elements=headings,
        boundary_inclusion='none'
    )
    sections_start = page.get_sections(
        start_elements=headings,
        boundary_inclusion='start'
    )
    sections_both = page.get_sections(
        start_elements=headings,
        boundary_inclusion='both'
    )

    # Display the results
    print("\nTesting if headings are correctly included/excluded:")

    # Check the sections with 'none' inclusion
    print("\n=== Sections with boundary_inclusion='none' ===")
    for i, section in enumerate(sections_none[:3]):  # Check first 3 sections
        elements = section.get_elements()
        if not elements:
            print(f"Section {i+1} is empty (has no elements)")
            continue

        # Look for any heading inside this section
        heading_found = any(section._is_element_in_region(h) for h in headings)

        print(f"Section {i+1} contains heading: {heading_found}")
        print(f" First element: {_first_element_text(elements)}")
        print(f" Element count: {len(elements)}")

    # Check the sections with 'start' and 'both' inclusion
    for mode, sections in (("start", sections_start), ("both", sections_both)):
        print(f"\n=== Sections with boundary_inclusion='{mode}' ===")
        _report_start_headings(sections, headings)

    # Save output image for visual verification; create the relative
    # "output" directory first so saving does not depend on it pre-existing.
    page.highlight_all()
    os.makedirs("output", exist_ok=True)
    page.save_image("output/all_elements.png")

    print("\nResults of the test:")
    for mode, sections in (
        ("none", sections_none),
        ("start", sections_start),
        ("both", sections_both),
    ):
        non_empty = len([s for s in sections if s.get_elements()])
        print(f"- '{mode}' inclusion: Sections have {non_empty} non-empty out of {len(sections)} total")

    # Test successful if:
    # 1. 'none' has no headings in its non-empty sections
    # 2. 'start' sections contain their own start heading
    # 3. 'both' sections contain their own start heading
    none_success = all(
        len(s.get_elements()) == 0
        or not any(s._is_element_in_region(h) for h in headings)
        for s in sections_none[:3]
    )
    start_success = all(
        s.start_element in headings and s._is_element_in_region(s.start_element)
        for s in sections_start[:3]
        if s.start_element
    )
    both_success = all(
        (s.start_element in headings and s._is_element_in_region(s.start_element))
        for s in sections_both[:3]
        if s.start_element
    )

    print("\nTest Results:")
    print(f"- 'none' excludes headings: {'Success' if none_success else 'Failure'}")
    print(f"- 'start' includes start headings: {'Success' if start_success else 'Failure'}")
    print(f"- 'both' includes start headings: {'Success' if both_success else 'Failure'}")

    if none_success and start_success and both_success:
        print("\n✅ Fix was successful!")
    else:
        print("\n❌ Fix needs more work.")


def _first_element_text(elements):
    """Return the first element's ``text``, or its ``str()`` when it has none."""
    first = elements[0]
    return first.text if hasattr(first, 'text') else str(first)


def _report_start_headings(sections, headings):
    """Print, for the first three sections, whether each contains headings[i]."""
    for i, section in enumerate(sections[:3]):
        elements = section.get_elements()
        if not elements:
            print(f"Section {i+1} is empty (has no elements)")
            continue

        # Check if the heading at the same index lies in this section
        original_heading = headings[i] if i < len(headings) else None
        heading_found = False
        if original_heading:
            heading_found = section._is_element_in_region(original_heading)

        print(f"Section {i+1} contains start heading: {heading_found}")
        print(f" Start element: {section.start_element.text if section.start_element else 'None'}")
        print(f" Element count: {len(elements)}")
        print(f" First element: {_first_element_text(elements)}")
155
+
156
+ if __name__ == "__main__":
157
+ main()
examples/chainable_layout_example.py ADDED
@@ -0,0 +1,70 @@
1
+ """
2
+ Example demonstrating the chainable analyze_layout method.
3
+
4
+ This example shows how to use the chainable analyze_layout method
5
+ to create more concise code by chaining method calls together.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+
11
+ # Add the parent directory to the Python path
12
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from natural_pdf import PDF
14
+
15
# Resolve the project root from this script's location so the default PDF
# and output paths do not depend on the current working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Set up argument parser
parser = argparse.ArgumentParser(description="Chainable layout analysis example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
args = parser.parse_args()

print(f"Analyzing PDF: {args.pdf_path}")
print(f"Page: {args.page}")
print(f"Confidence threshold: {args.conf}")

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

print("Running document layout analysis with method chaining...")

# Example 1: Chain analyze_layout with highlight_all
(
    page.analyze_layout(confidence=args.conf)
    .highlight_all(include_layout_regions=True)
)

print(f"Found {len(page.detected_layout_regions)} regions with confidence >= {args.conf}")

# Example 2: Save a highlighted image with labels
output_path = args.output or os.path.join(root_dir, "output", "chainable_layout.png")
output_dir = os.path.dirname(output_path)
# A bare filename such as "--output out.png" has an empty dirname, and
# os.makedirs("") raises; only create the directory when there is one.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Chain the whole sequence: clear highlights, analyze layout, highlight all, save image
(
    page.clear_highlights()
    .analyze_layout(model="yolo", confidence=args.conf)
    .highlight_all(include_layout_regions=True)
    .to_image(path=output_path, show_labels=True)
)

print(f"Saved highlighted image to {output_path}")

# Example 3: Chain with specialized highlighting
if page.find_all('region[type=title]'):
    result_path = os.path.join(output_dir, "titles_only.png")

    (
        page.clear_highlights()
        .analyze_layout(confidence=args.conf)
        .find_all('region[type=title]')
        .highlight(label="Document Titles", color=(1, 0, 0, 0.4))
    )

    page.to_image(path=result_path, show_labels=True)
    print(f"Saved titles-only highlighted image to {result_path}")

print("Done!")
examples/color_basic_test.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Simple test for color conversion.
3
+ """
4
+
5
+ # Test color conversion
6
def normalize_color(color):
    """Normalize a color tuple to RGB(A) components, converting floats to ints.

    Each float component is converted to an int: values ``<= 1.0`` are
    treated as fractions and scaled by 255 (truncating), larger floats are
    truncated as-is. Non-float components are passed through unchanged.
    A three-component result gets a default alpha of 100 appended.

    Args:
        color: A tuple of color components. Any non-tuple value is treated
            as invalid.

    Returns:
        A tuple of the converted components, or ``(255, 255, 0, 100)``
        (semi-transparent yellow) when ``color`` is not a tuple.
    """
    if not isinstance(color, tuple):
        # Default if invalid color is provided: yellow with semi-transparency
        return (255, 255, 0, 100)

    def to_channel(component):
        """Convert one component; only floats are rescaled or truncated."""
        if not isinstance(component, float):
            return component
        # 0.0-1.0 float format is scaled; larger floats are already 0-255.
        return int(component * 255) if component <= 1.0 else int(component)

    channels = [to_channel(component) for component in color]

    # Default alpha value if needed
    if len(channels) == 3:
        channels.append(100)

    return tuple(channels)
30
+
31
+ # Test cases
32
+ print("Testing color conversion:")
33
+ print("-----------------------")
34
+
35
+ test_cases = [
36
+ ((255, 0, 0, 128), "Integer RGB with alpha"),
37
+ ((255, 0, 0), "Integer RGB without alpha"),
38
+ ((0.0, 1.0, 0.0, 0.5), "Float RGB with alpha (0-1)"),
39
+ ((0.0, 1.0, 0.0), "Float RGB without alpha (0-1)"),
40
+ ((0.5, 0.5, 255, 0.7), "Mixed float and integer"),
41
+ ((0.5, 0.5, 255), "Mixed without alpha"),
42
+ ((128.5, 64.3, 200.7, 50.9), "Float values > 1"),
43
+ ]
44
+
45
+ for color, desc in test_cases:
46
+ result = normalize_color(color)
47
+ print(f"{desc}: {color} -> {result}")
48
+
49
+ print("\nAll tests completed!")