natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,190 +0,0 @@
|
|
1
|
-
# Spatial Navigation
|
2
|
-
|
3
|
-
Spatial navigation lets you work with PDF content based on the physical layout of elements on the page. It's perfect for finding elements relative to each other and extracting information in context.
|
4
|
-
|
5
|
-
```python
|
6
|
-
#%pip install "natural-pdf[all]"
|
7
|
-
```
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load a PDF
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
14
|
-
page = pdf.pages[0]
|
15
|
-
|
16
|
-
# Find the title of the document
|
17
|
-
title = page.find('text:contains("Jungle Health")')
|
18
|
-
|
19
|
-
# Visualize our starting point
|
20
|
-
title.show(color="red", label="Document Title")
|
21
|
-
|
22
|
-
# Display the title text
|
23
|
-
title.text
|
24
|
-
```
|
25
|
-
|
26
|
-
## Finding Elements Above and Below
|
27
|
-
|
28
|
-
```python
|
29
|
-
# Create a region below the title
|
30
|
-
region_below = title.below(height=100)
|
31
|
-
|
32
|
-
# Visualize the region
|
33
|
-
region_below.show(color="blue", label="Below Title")
|
34
|
-
|
35
|
-
# Extract the text from this region
|
36
|
-
text_below = region_below.extract_text()
|
37
|
-
text_below
|
38
|
-
```
|
39
|
-
|
40
|
-
## Finding Content Between Elements
|
41
|
-
|
42
|
-
```python
|
43
|
-
# Find two labels to serve as boundaries
|
44
|
-
site_label = page.find('text:contains("Site:")')
|
45
|
-
date_label = page.find('text:contains("Date:")')
|
46
|
-
|
47
|
-
# Get the region between these labels
|
48
|
-
between_region = site_label.below(
|
49
|
-
include_element=True, # Include starting element
|
50
|
-
until='text:contains("Date:")', # Stop at this element
|
51
|
-
include_endpoint=False # Don't include ending element
|
52
|
-
)
|
53
|
-
|
54
|
-
# Visualize the region between labels
|
55
|
-
between_region.show(color="green", label="Between")
|
56
|
-
|
57
|
-
# Extract text from this bounded area
|
58
|
-
between_region.extract_text()
|
59
|
-
```
|
60
|
-
|
61
|
-
## Navigating Left and Right
|
62
|
-
|
63
|
-
```python
|
64
|
-
# Find a field label
|
65
|
-
site_label = page.find('text:contains("Site:")')
|
66
|
-
|
67
|
-
# Get the content to the right (the field value)
|
68
|
-
value_region = site_label.right(width=200)
|
69
|
-
|
70
|
-
# Visualize the label and value regions
|
71
|
-
site_label.show(color="red", label="Label")
|
72
|
-
value_region.show(color="blue", label="Value")
|
73
|
-
|
74
|
-
# Extract just the value text
|
75
|
-
value_region.extract_text()
|
76
|
-
```
|
77
|
-
|
78
|
-
## Finding Adjacent Elements
|
79
|
-
|
80
|
-
```python
|
81
|
-
# Start with a label element
|
82
|
-
label = page.find('text:contains("Site:")')
|
83
|
-
|
84
|
-
# Find the next and previous elements in reading order
|
85
|
-
next_elem = label.next()
|
86
|
-
prev_elem = label.prev()
|
87
|
-
|
88
|
-
# Visualize all three elements
|
89
|
-
label.show(color="red", label="Current")
|
90
|
-
next_elem.show(color="green", label="Next") if next_elem else None
|
91
|
-
prev_elem.show(color="blue", label="Previous") if prev_elem else None
|
92
|
-
|
93
|
-
# Show the text of adjacent elements
|
94
|
-
{
|
95
|
-
"current": label.text,
|
96
|
-
"next": next_elem.text if next_elem else "None",
|
97
|
-
"previous": prev_elem.text if prev_elem else "None"
|
98
|
-
}
|
99
|
-
```
|
100
|
-
|
101
|
-
## Combining with Element Selectors
|
102
|
-
|
103
|
-
```python
|
104
|
-
# Find a section label
|
105
|
-
summary = page.find('text:contains("Summary:")')
|
106
|
-
|
107
|
-
# Find the next bold text element
|
108
|
-
next_bold = summary.next('text:bold', limit=20)
|
109
|
-
|
110
|
-
# Find the nearest line element
|
111
|
-
nearest_line = summary.nearest('line')
|
112
|
-
|
113
|
-
# Visualize what we found
|
114
|
-
summary.show(color="red", label="Summary")
|
115
|
-
next_bold.show(color="blue", label="Next Bold") if next_bold else None
|
116
|
-
nearest_line.show(color="green", label="Nearest Line") if nearest_line else None
|
117
|
-
|
118
|
-
# Show the content we found
|
119
|
-
{
|
120
|
-
"summary": summary.text,
|
121
|
-
"next_bold": next_bold.text if next_bold else "None found",
|
122
|
-
"nearest_line": nearest_line if nearest_line else "None found"
|
123
|
-
}
|
124
|
-
```
|
125
|
-
|
126
|
-
## Extracting Table Rows with Spatial Navigation
|
127
|
-
|
128
|
-
```python
|
129
|
-
# Find a table heading
|
130
|
-
table_heading = page.find('text:contains("Statute")')
|
131
|
-
table_heading.show(color="purple", label="Table Header")
|
132
|
-
|
133
|
-
# Extract table rows using spatial navigation
|
134
|
-
rows = []
|
135
|
-
current = table_heading
|
136
|
-
|
137
|
-
# Get up to the next 4 rows (stop early if no row is found)
|
138
|
-
for i in range(4):
|
139
|
-
# Find the next row below the current one
|
140
|
-
next_row = current.below(height=15)
|
141
|
-
|
142
|
-
if next_row:
|
143
|
-
rows.append(next_row)
|
144
|
-
current = next_row # Move to the next row
|
145
|
-
else:
|
146
|
-
break
|
147
|
-
|
148
|
-
# Visualize all found rows
|
149
|
-
page.clear_highlights()
|
150
|
-
for i, row in enumerate(rows):
|
151
|
-
row.highlight(label=f"Row {i+1}")
|
152
|
-
page.to_image(width=700)
|
153
|
-
```
|
154
|
-
|
155
|
-
```python
|
156
|
-
# Extract text from each row
|
157
|
-
[row.extract_text() for row in rows]
|
158
|
-
```
|
159
|
-
|
160
|
-
## Extracting Key-Value Pairs
|
161
|
-
|
162
|
-
```python
|
163
|
-
# Find all potential field labels (text with a colon)
|
164
|
-
labels = page.find_all('text:contains(":")')
|
165
|
-
|
166
|
-
# Visualize the labels
|
167
|
-
labels.show(color="blue", label="Labels")
|
168
|
-
|
169
|
-
# Extract key-value pairs
|
170
|
-
field_data = {}
|
171
|
-
|
172
|
-
for label in labels:
|
173
|
-
# Clean up the label text
|
174
|
-
key = label.text.strip().rstrip(':')
|
175
|
-
|
176
|
-
# Skip labels that are empty once whitespace and trailing colons are stripped
|
177
|
-
if not key:
|
178
|
-
continue
|
179
|
-
|
180
|
-
# Get the value to the right
|
181
|
-
value = label.right(width=200).extract_text().strip()
|
182
|
-
|
183
|
-
# Add to our collection
|
184
|
-
field_data[key] = value
|
185
|
-
|
186
|
-
# Show the extracted data
|
187
|
-
field_data
|
188
|
-
```
|
189
|
-
|
190
|
-
Spatial navigation mimics how humans read documents, letting you navigate content based on physical relationships between elements. It's especially useful for extracting structured data from forms, tables, and formatted documents.
|