natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,201 +0,0 @@
|
|
1
|
-
# Form Field Extraction
|
2
|
-
|
3
|
-
Business documents like invoices, forms, and applications contain field-value pairs that need to be extracted. This tutorial shows how to identify and extract these form fields.
|
4
|
-
|
5
|
-
```python
|
6
|
-
#%pip install "natural-pdf[all]"
|
7
|
-
```
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load a PDF
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
14
|
-
page = pdf.pages[0]
|
15
|
-
|
16
|
-
# Find candidate field labels: text elements that contain a colon
|
17
|
-
labels = page.find_all('text:contains(":")')
|
18
|
-
|
19
|
-
# Visualize the found labels
|
20
|
-
labels.show(color="blue", label="Field Labels")
|
21
|
-
|
22
|
-
# Count how many potential fields we found
|
23
|
-
len(labels)
|
24
|
-
```
|
25
|
-
|
26
|
-
## Extracting Field Values
|
27
|
-
|
28
|
-
```python
|
29
|
-
# Extract the value for each field label
|
30
|
-
form_data = {}
|
31
|
-
|
32
|
-
for label in labels:
|
33
|
-
# Clean up the label text
|
34
|
-
field_name = label.text.strip().rstrip(':')
|
35
|
-
|
36
|
-
# Find the value to the right of the label
|
37
|
-
value_region = label.right(width=200)
|
38
|
-
value = value_region.extract_text().strip()
|
39
|
-
|
40
|
-
# Store in our dictionary
|
41
|
-
form_data[field_name] = value
|
42
|
-
|
43
|
-
# Display the extracted data
|
44
|
-
form_data
|
45
|
-
```
|
46
|
-
|
47
|
-
## Visualizing Labels and Values
|
48
|
-
|
49
|
-
```python
|
50
|
-
# Clear previous highlights
|
51
|
-
page.clear_highlights()
|
52
|
-
|
53
|
-
# Highlight both labels and their values
|
54
|
-
for label in labels:
|
55
|
-
# Highlight the label in red
|
56
|
-
label.show(color="red", label="Label")
|
57
|
-
|
58
|
-
# Highlight the value area in blue
|
59
|
-
label.right(width=200).show(color="blue", label="Value")
|
60
|
-
|
61
|
-
# Show the page image with highlighted elements
|
62
|
-
page.to_image()
|
63
|
-
```
|
64
|
-
|
65
|
-
## Handling Multi-line Values
|
66
|
-
|
67
|
-
```python
|
68
|
-
# Extract values that might span multiple lines
|
69
|
-
multi_line_data = {}
|
70
|
-
|
71
|
-
for label in labels:
|
72
|
-
# Get the field name
|
73
|
-
field_name = label.text.strip().rstrip(':')
|
74
|
-
|
75
|
-
# Look both to the right and below
|
76
|
-
right_value = label.right(width=200).extract_text().strip()
|
77
|
-
below_value = label.below(height=50).extract_text().strip()
|
78
|
-
|
79
|
-
# If the text below already includes the right-hand text, use it alone; otherwise join the two
|
80
|
-
if right_value in below_value:
|
81
|
-
value = below_value
|
82
|
-
else:
|
83
|
-
value = f"{right_value} {below_value}".strip()
|
84
|
-
|
85
|
-
# Add to results
|
86
|
-
multi_line_data[field_name] = value
|
87
|
-
|
88
|
-
# Show fields with potential multi-line values
|
89
|
-
multi_line_data
|
90
|
-
```
|
91
|
-
|
92
|
-
## Finding Pattern-Based Fields
|
93
|
-
|
94
|
-
```python
|
95
|
-
import re
|
96
|
-
|
97
|
-
# Find dates in the format "July 31, YYYY"
|
98
|
-
date_pattern = r'\b\w+ \d+, \d\d\d\d\b'
|
99
|
-
|
100
|
-
# Search all text elements for dates
|
101
|
-
text_elements = page.find_all('text')
|
102
|
-
print([elem.text for elem in text_elements])
|
103
|
-
dates = text_elements.filter(lambda elem: re.search(date_pattern, elem.text))
|
104
|
-
|
105
|
-
# Visualize the date fields
|
106
|
-
dates.show(color="green", label="Date")
|
107
|
-
|
108
|
-
# Extract just the date values
|
109
|
-
date_texts = [re.search(date_pattern, elem.text).group(0) for elem in dates]
|
110
|
-
date_texts
|
111
|
-
```
|
112
|
-
|
113
|
-
## Working with Form Tables
|
114
|
-
|
115
|
-
```python
|
116
|
-
# Run layout analysis to find table structures
|
117
|
-
page.analyze_layout()
|
118
|
-
|
119
|
-
# Find possible form tables
|
120
|
-
tables = page.find_all('region[type=table]')
|
121
|
-
|
122
|
-
if tables:
|
123
|
-
# Visualize the tables
|
124
|
-
tables.show(color="purple", label="Form Table")
|
125
|
-
|
126
|
-
# Extract data from the first table
|
127
|
-
first_table = tables[0]
|
128
|
-
table_data = first_table.extract_table()
|
129
|
-
table_data
|
130
|
-
else:
|
131
|
-
# Try to find form-like structure using text alignment
|
132
|
-
# Create a region where a form might be
|
133
|
-
form_region = page.create_region(50, 200, page.width - 50, 500)
|
134
|
-
|
135
|
-
# Group text by vertical position
|
136
|
-
rows = {}
|
137
|
-
text_elements = form_region.find_all('text')
|
138
|
-
|
139
|
-
for elem in text_elements:
|
140
|
-
# Round y-position to group elements in the same row
|
141
|
-
row_pos = round(elem.top / 5) * 5
|
142
|
-
if row_pos not in rows:
|
143
|
-
rows[row_pos] = []
|
144
|
-
rows[row_pos].append(elem)
|
145
|
-
|
146
|
-
# Extract data from rows (first 5 rows)
|
147
|
-
row_data = []
|
148
|
-
for y in sorted(rows.keys())[:5]:
|
149
|
-
# Sort elements by x-position (left to right)
|
150
|
-
elements = sorted(rows[y], key=lambda e: e.x0)
|
151
|
-
|
152
|
-
# Show the row
|
153
|
-
row_box = form_region.create_region(
|
154
|
-
min(e.x0 for e in elements),
|
155
|
-
min(e.top for e in elements),
|
156
|
-
max(e.x1 for e in elements),
|
157
|
-
max(e.bottom for e in elements)
|
158
|
-
)
|
159
|
-
row_box.show(color=None, use_color_cycling=True)
|
160
|
-
|
161
|
-
# Extract text from row
|
162
|
-
row_text = [e.text for e in elements]
|
163
|
-
row_data.append(row_text)
|
164
|
-
|
165
|
-
# Show the extracted rows
|
166
|
-
row_data
|
167
|
-
```
|
168
|
-
|
169
|
-
## Combining Different Extraction Techniques
|
170
|
-
|
171
|
-
```python
|
172
|
-
# Combine label-based and pattern-based extraction
|
173
|
-
all_fields = {}
|
174
|
-
|
175
|
-
# 1. First get fields with explicit labels
|
176
|
-
for label in labels:
|
177
|
-
field_name = label.text.strip().rstrip(':')
|
178
|
-
value = label.right(width=200).extract_text().strip()
|
179
|
-
all_fields[field_name] = value
|
180
|
-
|
181
|
-
# 2. Add date fields that we found with pattern matching
|
182
|
-
for date_elem in dates:
|
183
|
-
# Find the nearest label
|
184
|
-
nearby_label = date_elem.nearest('text:contains(":")')
|
185
|
-
|
186
|
-
if nearby_label:
|
187
|
-
# Extract the label text
|
188
|
-
label_text = nearby_label.text.strip().rstrip(':')
|
189
|
-
|
190
|
-
# Get the date value
|
191
|
-
date_value = re.search(date_pattern, date_elem.text).group(0)
|
192
|
-
|
193
|
-
# Add to our results if not already present
|
194
|
-
if label_text not in all_fields:
|
195
|
-
all_fields[label_text] = date_value
|
196
|
-
|
197
|
-
# Show all extracted fields
|
198
|
-
all_fields
|
199
|
-
```
|
200
|
-
|
201
|
-
Form field extraction enables you to automate data entry and document processing. By combining different techniques like label detection, spatial navigation, and pattern matching, you can handle a wide variety of form layouts.
|
@@ -1,54 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "markdown",
|
5
|
-
"id": "7674e123",
|
6
|
-
"metadata": {},
|
7
|
-
"source": [
|
8
|
-
"# Enhanced Table Processing\n",
|
9
|
-
"\n",
|
10
|
-
"Tables are a common way to present structured data in documents, but they can be challenging to extract correctly. This tutorial demonstrates advanced techniques for working with tables in natural-pdf.\n",
|
11
|
-
"\n",
|
12
|
-
"TK"
|
13
|
-
]
|
14
|
-
},
|
15
|
-
{
|
16
|
-
"cell_type": "code",
|
17
|
-
"execution_count": 1,
|
18
|
-
"id": "08c7c5f0",
|
19
|
-
"metadata": {
|
20
|
-
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-21T21:25:37.324499Z",
|
22
|
-
"iopub.status.busy": "2025-04-21T21:25:37.324337Z",
|
23
|
-
"iopub.status.idle": "2025-04-21T21:25:37.328739Z",
|
24
|
-
"shell.execute_reply": "2025-04-21T21:25:37.328344Z"
|
25
|
-
}
|
26
|
-
},
|
27
|
-
"outputs": [],
|
28
|
-
"source": [
|
29
|
-
"#%pip install \"natural-pdf[all]\""
|
30
|
-
]
|
31
|
-
}
|
32
|
-
],
|
33
|
-
"metadata": {
|
34
|
-
"jupytext": {
|
35
|
-
"cell_metadata_filter": "-all",
|
36
|
-
"main_language": "python",
|
37
|
-
"notebook_metadata_filter": "-all"
|
38
|
-
},
|
39
|
-
"language_info": {
|
40
|
-
"codemirror_mode": {
|
41
|
-
"name": "ipython",
|
42
|
-
"version": 3
|
43
|
-
},
|
44
|
-
"file_extension": ".py",
|
45
|
-
"mimetype": "text/x-python",
|
46
|
-
"name": "python",
|
47
|
-
"nbconvert_exporter": "python",
|
48
|
-
"pygments_lexer": "ipython3",
|
49
|
-
"version": "3.10.13"
|
50
|
-
}
|
51
|
-
},
|
52
|
-
"nbformat": 4,
|
53
|
-
"nbformat_minor": 5
|
54
|
-
}
|
@@ -1,9 +0,0 @@
|
|
1
|
-
# Enhanced Table Processing
|
2
|
-
|
3
|
-
Tables are a common way to present structured data in documents, but they can be challenging to extract correctly. This tutorial demonstrates advanced techniques for working with tables in natural-pdf.
|
4
|
-
|
5
|
-
TK
|
6
|
-
|
7
|
-
```python
|
8
|
-
#%pip install "natural-pdf[all]"
|
9
|
-
```
|