natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,256 @@
1
+ # Section Extraction
2
+
3
+ Documents are often organized into logical sections like chapters, articles, or content blocks. This tutorial shows how to extract these sections with natural-pdf, using a library weeding log as an example.
4
+
5
+ ```python
6
+ #%pip install "natural-pdf[all]"
7
+ ```
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ # Load the PDF from a remote URL
13
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
14
+ page = pdf.pages[0]
15
+
16
+ # Find horizontal lines that separate book entries
17
+ horizontal_lines = page.find_all('line:horizontal')
18
+
19
+ # Visualize the potential section boundaries
20
+ horizontal_lines.highlight(color="red", label="Section Boundaries")
21
+ page.to_image()
22
+ ```
23
+
24
+ ```python
25
+ # Count what we found
26
+ len(horizontal_lines)
27
+ ```
28
+
29
+ ## Basic Section Extraction
30
+
31
+ ```python
32
+ # Extract sections based on horizontal lines
33
+ # Each section starts at a horizontal line and ends at the next one
34
+ book_sections = page.get_sections(
35
+ start_elements=horizontal_lines,
36
+ boundary_inclusion='start' # Include the boundary in the section
37
+ )
38
+
39
+ # Visualize each section
40
+ page.clear_highlights()
41
+ for section in book_sections:
42
+ section.highlight()
43
+ page.to_image()
44
+ ```
45
+
46
+ ```python
47
+ # Display section count and preview the first section
48
+ {
49
+ "total_sections": len(book_sections),
50
+ "first_section_text": book_sections[0].extract_text()[:100] + "..." if book_sections else "No sections found"
51
+ }
52
+ ```
53
+
54
+ ## Working with Section Content
55
+
56
+ ```python
57
+ # Extract and display content from the first few book entries
58
+ book_entries = []
59
+
60
+ for i, section in enumerate(book_sections[:5]):
61
+ # Extract the section text
62
+ text = section.extract_text().strip()
63
+
64
+ # Try to parse book information
65
+ title = ""
66
+ author = ""
67
+ isbn = ""
68
+
69
+ # Extract title (typically the first line)
70
+ title_match = section.find('text:contains("Title:")')
71
+ if title_match:
72
+ title_value = title_match.right(width=400).extract_text()
73
+ title = title_value.strip()
74
+
75
+ # Extract author
76
+ author_match = section.find('text:contains("Author:")')
77
+ if author_match:
78
+ author_value = author_match.right(width=400).extract_text()
79
+ author = author_value.strip()
80
+
81
+ # Extract ISBN
82
+ isbn_match = section.find('text:contains("ISBN:")')
83
+ if isbn_match:
84
+ isbn_value = isbn_match.right(width=400).extract_text()
85
+ isbn = isbn_value.strip()
86
+
87
+ # Add to our collection
88
+ book_entries.append({
89
+ "number": i + 1,
90
+ "title": title,
91
+ "author": author,
92
+ "isbn": isbn,
93
+ "preview": text[:50] + "..." if len(text) > 50 else text
94
+ })
95
+
96
+ # Display the structured book entries
97
+ import pandas as pd
98
+ pd.DataFrame(book_entries)
99
+ ```
100
+
101
+ ## Using Different Section Boundaries
102
+
103
+ ```python
104
+ page.viewer()
105
+ ```
106
+
107
+ ```python
108
+ # Find title elements with specific selectors
109
+ title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
110
+ title_elements.show()
111
+ ```
112
+
113
+ ```python
114
+ # Extract sections starting from each title
115
+ # This now directly returns an ElementCollection
116
+ title_sections = page.get_sections(
117
+ start_elements=title_elements,
118
+ boundary_inclusion='start'
119
+ )
120
+
121
+ # Show the title-based sections
122
+ page.clear_highlights()
123
+ title_sections.highlight()
124
+ page.to_image()
125
+ ```
126
+
127
+ ```python
128
+ # Count the sections found
129
+ len(title_sections)
130
+ ```
131
+
132
+ ## Section Boundary Inclusion Options
133
+
134
+ ```python
135
+ # Use horizontal line elements as section dividers
136
+ dividers = page.find_all('line[horizontal]')
137
+
138
+ # Compare the different boundary inclusion options
139
+ inclusion_options = {
140
+ 'none': page.get_sections(start_elements=dividers, boundary_inclusion='none'),
141
+ 'start': page.get_sections(start_elements=dividers, boundary_inclusion='start'),
142
+ 'end': page.get_sections(start_elements=dividers, boundary_inclusion='end'),
143
+ 'both': page.get_sections(start_elements=dividers, boundary_inclusion='both')
144
+ }
145
+
146
+ # Count sections with each option
147
+ section_counts = {option: len(sections) for option, sections in inclusion_options.items()}
148
+ section_counts
149
+ ```
150
+
151
+ ## Custom Section Boundaries
152
+
153
+ ```python
154
+ # Define specific start and end points - let's extract just one book entry
155
+ # We'll use the first and second title elements as the start and end points
156
+ page.clear_highlights()
157
+
158
+ start_point = title_elements[0]
159
+ end_point = title_elements[1]
160
+
161
+ # Extract the section between these points
162
+ single_book_entry = page.get_sections(
163
+ start_elements=[start_point],
164
+ end_elements=[end_point],
165
+ boundary_inclusion='start' # Include the start but not the end
166
+ )
167
+
168
+ # Visualize the custom section
169
+ single_book_entry.highlight(color="green", label="Single Book Entry")
170
+
171
+ print(single_book_entry[0].extract_text())
172
+
173
+ page.to_image()
174
+ ```
175
+
176
+ ## Multi-page Sections
177
+
178
+ ```python
179
+ # Get sections across the first two pages
180
+ multi_page_sections = [] # Initialize as a list
181
+
182
+ for page_num in range(min(2, len(pdf.pages))):
183
+ page = pdf.pages[page_num]
184
+
185
+ # Find the title elements on this page
186
+ title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
187
+
188
+ # Get sections for this page (returns ElementCollection)
189
+ page_sections = page.get_sections(
190
+ start_elements=title_elements,
191
+ boundary_inclusion='start'
192
+ )
193
+
194
+ # Add elements from the collection to our list
195
+ multi_page_sections.extend(page_sections) # list.extend works with iterables like ElementCollection
196
+
197
+ # Display info about each section
198
+ [{
199
+ "page": section.page.number + 1, # 1-indexed page number for display
200
+ "text": section.extract_text()[:50] + "..." if len(section.extract_text()) > 50 else section.extract_text()
201
+ } for section in multi_page_sections]
202
+ ```
203
+
204
+ ## Building a Book Database
205
+
206
+ ```python
207
+ # Extract all book entries across multiple pages
208
+ book_database = []
209
+
210
+ # Process first 3 pages (or fewer if the document is shorter)
211
+ for page_num in range(min(3, len(pdf.pages))):
212
+ page = pdf.pages[page_num]
213
+
214
+ # Find the title elements on this page
215
+ title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
216
+
217
+ # Get sections for this page
218
+ book_sections = page.get_sections(
219
+ start_elements=title_elements,
220
+ boundary_inclusion='start'
221
+ )
222
+
223
+ # Process each book section
224
+ for section in book_sections:
225
+ # Skip sections that are too short (might be headers/footers)
226
+ if len(section.extract_text()) < 50:
227
+ continue
228
+
229
+ # Extract book information
230
+ book_info = {"page": page_num + 1}
231
+
232
+ for field in ["Title:", "Author:", "ISBN:", "Publisher:", "Copyright:"]:
233
+ field_element = section.find(f'text:contains("{field}")')
234
+ if field_element:
235
+ field_name = field.strip(':').lower()
236
+ field_value = field_element.extract_text().replace(field, '').strip()
237
+ book_info[field_name] = field_value
238
+
239
+ # For these fields, read the value from the area below the field label
240
+ for field in ["Price", "Acquired", "Barcode", "Removed By"]:
241
+ field_element = section.find(f'text:contains("{field}")')
242
+ if field_element:
243
+ field_name = field.lower()
244
+ field_value = field_element.below(height=10, width='element').expand(right=50).extract_text().strip()
245
+ book_info[field_name] = field_value
246
+
247
+ book_database.append(book_info)
248
+
249
+ # Display sample entries (df.head() shows the first 5 rows)
250
+ import pandas as pd
251
+
252
+ df = pd.json_normalize(book_database)
253
+ df.head()
254
+ ```
255
+
256
+ Section extraction lets you break down documents into logical parts, making it easier to generate summaries, extract specific content, and create structured data from semi-structured documents. In this example, we've shown how to convert a PDF library catalog into a structured book database.