natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,256 +0,0 @@
|
|
1
|
-
# Section Extraction
|
2
|
-
|
3
|
-
Documents are often organized into logical sections like chapters, articles, or content blocks. This tutorial shows how to extract these sections using natural-pdf, using a library weeding log as an example.
|
4
|
-
|
5
|
-
```python
|
6
|
-
#%pip install "natural-pdf[all]"
|
7
|
-
```
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load the PDF directly from its URL
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/Atlanta_Public_Schools_GA_sample.pdf")
|
14
|
-
page = pdf.pages[0]
|
15
|
-
|
16
|
-
# Find horizontal lines that separate book entries
|
17
|
-
horizontal_lines = page.find_all('line:horizontal')
|
18
|
-
|
19
|
-
# Visualize the potential section boundaries
|
20
|
-
horizontal_lines.highlight(color="red", label="Section Boundaries")
|
21
|
-
page.to_image()
|
22
|
-
```
|
23
|
-
|
24
|
-
```python
|
25
|
-
# Count what we found
|
26
|
-
len(horizontal_lines)
|
27
|
-
```
|
28
|
-
|
29
|
-
## Basic Section Extraction
|
30
|
-
|
31
|
-
```python
|
32
|
-
# Extract sections based on horizontal lines
|
33
|
-
# Each section starts at a horizontal line and ends at the next one
|
34
|
-
book_sections = page.get_sections(
|
35
|
-
start_elements=horizontal_lines,
|
36
|
-
boundary_inclusion='start' # Include the boundary in the section
|
37
|
-
)
|
38
|
-
|
39
|
-
# Visualize each section
|
40
|
-
page.clear_highlights()
|
41
|
-
for section in book_sections:
|
42
|
-
section.highlight()
|
43
|
-
page.to_image()
|
44
|
-
```
|
45
|
-
|
46
|
-
```python
|
47
|
-
# Display section count and preview the first section
|
48
|
-
{
|
49
|
-
"total_sections": len(book_sections),
|
50
|
-
"first_section_text": book_sections[0].extract_text()[:100] + "..." if book_sections else "No sections found"
|
51
|
-
}
|
52
|
-
```
|
53
|
-
|
54
|
-
## Working with Section Content
|
55
|
-
|
56
|
-
```python
|
57
|
-
# Extract and display content from the first few book entries
|
58
|
-
book_entries = []
|
59
|
-
|
60
|
-
for i, section in enumerate(book_sections[:5]):
|
61
|
-
# Extract the section text
|
62
|
-
text = section.extract_text().strip()
|
63
|
-
|
64
|
-
# Try to parse book information
|
65
|
-
title = ""
|
66
|
-
author = ""
|
67
|
-
isbn = ""
|
68
|
-
|
69
|
-
# Extract title (typically the first line)
|
70
|
-
title_match = section.find('text:contains("Title:")')
|
71
|
-
if title_match:
|
72
|
-
title_value = title_match.right(width=400).extract_text()
|
73
|
-
title = title_value.strip()
|
74
|
-
|
75
|
-
# Extract author
|
76
|
-
author_match = section.find('text:contains("Author:")')
|
77
|
-
if author_match:
|
78
|
-
author_value = author_match.right(width=400).extract_text()
|
79
|
-
author = author_value.strip()
|
80
|
-
|
81
|
-
# Extract ISBN
|
82
|
-
isbn_match = section.find('text:contains("ISBN:")')
|
83
|
-
if isbn_match:
|
84
|
-
isbn_value = isbn_match.right(width=400).extract_text()
|
85
|
-
isbn = isbn_value.strip()
|
86
|
-
|
87
|
-
# Add to our collection
|
88
|
-
book_entries.append({
|
89
|
-
"number": i + 1,
|
90
|
-
"title": title,
|
91
|
-
"author": author,
|
92
|
-
"isbn": isbn,
|
93
|
-
"preview": text[:50] + "..." if len(text) > 50 else text
|
94
|
-
})
|
95
|
-
|
96
|
-
# Display the structured book entries
|
97
|
-
import pandas as pd
|
98
|
-
pd.DataFrame(book_entries)
|
99
|
-
```
|
100
|
-
|
101
|
-
## Using Different Section Boundaries
|
102
|
-
|
103
|
-
```python
|
104
|
-
page.viewer()
|
105
|
-
```
|
106
|
-
|
107
|
-
```python
|
108
|
-
# Find title elements with specific selectors
|
109
|
-
title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
|
110
|
-
title_elements.show()
|
111
|
-
```
|
112
|
-
|
113
|
-
```python
|
114
|
-
# Extract sections starting from each title
|
115
|
-
# This now directly returns an ElementCollection
|
116
|
-
title_sections = page.get_sections(
|
117
|
-
start_elements=title_elements,
|
118
|
-
boundary_inclusion='start'
|
119
|
-
)
|
120
|
-
|
121
|
-
# Show the title-based sections
|
122
|
-
page.clear_highlights()
|
123
|
-
title_sections.highlight()
|
124
|
-
page.to_image()
|
125
|
-
```
|
126
|
-
|
127
|
-
```python
|
128
|
-
# Count the sections found
|
129
|
-
len(title_sections)
|
130
|
-
```
|
131
|
-
|
132
|
-
## Section Boundary Inclusion Options
|
133
|
-
|
134
|
-
```python
|
135
|
-
# Use horizontal line elements as section dividers
|
136
|
-
dividers = page.find_all('line[horizontal]')
|
137
|
-
|
138
|
-
# Compare the different boundary inclusion options
|
139
|
-
inclusion_options = {
|
140
|
-
'none': page.get_sections(start_elements=dividers, boundary_inclusion='none'),
|
141
|
-
'start': page.get_sections(start_elements=dividers, boundary_inclusion='start'),
|
142
|
-
'end': page.get_sections(start_elements=dividers, boundary_inclusion='end'),
|
143
|
-
'both': page.get_sections(start_elements=dividers, boundary_inclusion='both')
|
144
|
-
}
|
145
|
-
|
146
|
-
# Count sections with each option
|
147
|
-
section_counts = {option: len(sections) for option, sections in inclusion_options.items()}
|
148
|
-
section_counts
|
149
|
-
```
|
150
|
-
|
151
|
-
## Custom Section Boundaries
|
152
|
-
|
153
|
-
```python
|
154
|
-
# Define specific start and end points - let's extract just one book entry
|
155
|
-
# We'll use the first and second title elements as the start and end boundaries
|
156
|
-
page.clear_highlights()
|
157
|
-
|
158
|
-
start_point = title_elements[0]
|
159
|
-
end_point = title_elements[1]
|
160
|
-
|
161
|
-
# Extract the section between these points
|
162
|
-
single_book_entry = page.get_sections(
|
163
|
-
start_elements=[start_point],
|
164
|
-
end_elements=[end_point],
|
165
|
-
boundary_inclusion='start' # Include the start but not the end
|
166
|
-
)
|
167
|
-
|
168
|
-
# Visualize the custom section
|
169
|
-
single_book_entry.highlight(color="green", label="Single Book Entry")
|
170
|
-
|
171
|
-
print(single_book_entry[0].extract_text())
|
172
|
-
|
173
|
-
page.to_image()
|
174
|
-
```
|
175
|
-
|
176
|
-
## Multi-page Sections
|
177
|
-
|
178
|
-
```python
|
179
|
-
# Get sections across the first two pages
|
180
|
-
multi_page_sections = [] # Initialize as a list
|
181
|
-
|
182
|
-
for page_num in range(min(2, len(pdf.pages))):
|
183
|
-
page = pdf.pages[page_num]
|
184
|
-
|
185
|
-
# Find the title elements below the thick rule on this page
|
186
|
-
title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
|
187
|
-
|
188
|
-
# Get sections for this page (returns ElementCollection)
|
189
|
-
page_sections = page.get_sections(
|
190
|
-
start_elements=title_elements,
|
191
|
-
boundary_inclusion='start'
|
192
|
-
)
|
193
|
-
|
194
|
-
# Add elements from the collection to our list
|
195
|
-
multi_page_sections.extend(page_sections) # list.extend works with iterables like ElementCollection
|
196
|
-
|
197
|
-
# Display info about each section (page number and a 50-character text preview)
|
198
|
-
[{
|
199
|
-
"page": section.page.number + 1, # 1-indexed page number for display
|
200
|
-
"text": section.extract_text()[:50] + "..." if len(section.extract_text()) > 50 else section.extract_text()
|
201
|
-
} for section in multi_page_sections]
|
202
|
-
```
|
203
|
-
|
204
|
-
## Building a Book Database
|
205
|
-
|
206
|
-
```python
|
207
|
-
# Extract all book entries across multiple pages
|
208
|
-
book_database = []
|
209
|
-
|
210
|
-
# Process first 3 pages (or fewer if the document is shorter)
|
211
|
-
for page_num in range(min(3, len(pdf.pages))):
|
212
|
-
page = pdf.pages[page_num]
|
213
|
-
|
214
|
-
# Find the title elements below the thick rule on this page
|
215
|
-
title_elements = page.find('line[width=2]').below().find_all('text[fontname="AAAAAB+font000000002a8d158a"][size=10]')
|
216
|
-
|
217
|
-
# Get sections for this page
|
218
|
-
book_sections = page.get_sections(
|
219
|
-
start_elements=title_elements,
|
220
|
-
boundary_inclusion='start'
|
221
|
-
)
|
222
|
-
|
223
|
-
# Process each book section
|
224
|
-
for section in book_sections:
|
225
|
-
# Skip sections that are too short (might be headers/footers)
|
226
|
-
if len(section.extract_text()) < 50:
|
227
|
-
continue
|
228
|
-
|
229
|
-
# Extract book information
|
230
|
-
book_info = {"page": page_num + 1}
|
231
|
-
|
232
|
-
for field in ["Title:", "Author:", "ISBN:", "Publisher:", "Copyright:"]:
|
233
|
-
field_element = section.find(f'text:contains("{field}")')
|
234
|
-
if field_element:
|
235
|
-
field_name = field.strip(':').lower()
|
236
|
-
field_value = field_element.extract_text().replace(field, '').strip()
|
237
|
-
book_info[field_name] = field_value
|
238
|
-
|
239
|
-
# For these fields, the value is printed below the field name rather than beside it
|
240
|
-
for field in ["Price", "Acquired", "Barcode", "Removed By"]:
|
241
|
-
field_element = section.find(f'text:contains("{field}")')
|
242
|
-
if field_element:
|
243
|
-
field_name = field.lower()
|
244
|
-
field_value = field_element.below(height=10, width='element').expand(right=50).extract_text().strip()
|
245
|
-
book_info[field_name] = field_value
|
246
|
-
|
247
|
-
book_database.append(book_info)
|
248
|
-
|
249
|
-
# Display sample entries (df.head() shows the first 5 rows)
|
250
|
-
import pandas as pd
|
251
|
-
|
252
|
-
df = pd.json_normalize(book_database)
|
253
|
-
df.head()
|
254
|
-
```
|
255
|
-
|
256
|
-
Section extraction lets you break down documents into logical parts, making it easier to generate summaries, extract specific content, and create structured data from semi-structured documents. In this example, we've shown how to convert a PDF library catalog into a structured book database.
|