natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,109 +0,0 @@
|
|
1
|
-
# Excluding Content (Headers/Footers)
|
2
|
-
|
3
|
-
Often, PDFs have repeating headers or footers on every page that you want to ignore when extracting the main content. `natural-pdf` allows you to define exclusion regions.
|
4
|
-
|
5
|
-
We'll use a different PDF for this example, which has a distinct header and footer section: `0500000US42007.pdf`.
|
6
|
-
|
7
|
-
```python
|
8
|
-
#%pip install "natural-pdf[all]"
|
9
|
-
```
|
10
|
-
|
11
|
-
|
12
|
-
```python
|
13
|
-
from natural_pdf import PDF
|
14
|
-
|
15
|
-
pdf_url = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf"
|
16
|
-
|
17
|
-
# Load the PDF
|
18
|
-
pdf = PDF(pdf_url)
|
19
|
-
page = pdf.pages[0]
|
20
|
-
|
21
|
-
# Let's see the bottom part of the text WITHOUT exclusions
|
22
|
-
# It likely contains page numbers or other footer info.
|
23
|
-
full_text_unfiltered = page.extract_text()
|
24
|
-
|
25
|
-
# Show the last 200 characters (likely containing footer text)
|
26
|
-
full_text_unfiltered[-200:]
|
27
|
-
```
|
28
|
-
|
29
|
-
## Approach 1: Excluding a Fixed Area
|
30
|
-
|
31
|
-
A simple way to exclude headers or footers is to define a fixed region based on page coordinates. Let's exclude the bottom 200 points of the page (PDF coordinates are measured in points, not pixels).
|
32
|
-
|
33
|
-
```python
|
34
|
-
from natural_pdf import PDF
|
35
|
-
|
36
|
-
pdf_url = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf"
|
37
|
-
pdf = PDF(pdf_url)
|
38
|
-
|
39
|
-
# Define the exclusion region on every page using a lambda function
|
40
|
-
footer_height = 200
|
41
|
-
pdf.add_exclusion(
|
42
|
-
lambda page: page.region(top=page.height - footer_height),
|
43
|
-
label="Bottom 200pt Footer"
|
44
|
-
)
|
45
|
-
|
46
|
-
# Now extract text from the first page again, exclusions are active by default
|
47
|
-
page = pdf.pages[0]
|
48
|
-
|
49
|
-
# Visualize the excluded area
|
50
|
-
footer_region_viz = page.region(top=page.height - footer_height)
|
51
|
-
footer_region_viz.highlight(label="Excluded Footer Area")
|
52
|
-
page.to_image()
|
53
|
-
```
|
54
|
-
|
55
|
-
```python
|
56
|
-
filtered_text = page.extract_text() # use_exclusions=True is default
|
57
|
-
|
58
|
-
# Show the last 200 chars with footer area excluded
|
59
|
-
filtered_text[-200:]
|
60
|
-
```
|
61
|
-
|
62
|
-
This method is simple but might cut off content if the footer height varies or content extends lower on some pages.
|
63
|
-
|
64
|
-
## Approach 2: Excluding Based on Elements
|
65
|
-
|
66
|
-
A more robust way is to find specific elements that reliably mark the start of the footer (or end of the header) and exclude everything below (or above) them. Here, as in `Examples.md`, we define the footer as everything below the last line element on the page.
|
67
|
-
|
68
|
-
```python
|
69
|
-
from natural_pdf import PDF
|
70
|
-
|
71
|
-
pdf_url = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42007.pdf"
|
72
|
-
pdf = PDF(pdf_url)
|
73
|
-
page = pdf.pages[0] # Get page for finding elements
|
74
|
-
|
75
|
-
# Find the last horizontal line on the first page
|
76
|
-
# We'll use this logic to define our exclusion for all pages
|
77
|
-
last_line = page.find_all('line')[-1]
|
78
|
-
|
79
|
-
# Define the exclusion function using a lambda
|
80
|
-
# This finds the last line on *each* page and excludes below it
|
81
|
-
pdf.add_exclusion(
|
82
|
-
lambda p: p.find_all('line')[-1].below(),
|
83
|
-
label="Element-Based Footer"
|
84
|
-
)
|
85
|
-
|
86
|
-
# Extract text again, with the element-based exclusion active
|
87
|
-
filtered_text_element = page.extract_text()
|
88
|
-
|
89
|
-
# Show the last 200 chars with element-based footer exclusion
|
90
|
-
"Element-Based Excluded (last 200 chars): " + filtered_text_element[-200:]
|
91
|
-
|
92
|
-
# Visualize the element-based exclusion area
|
93
|
-
page.clear_highlights()
|
94
|
-
# Need to find the region again for visualization
|
95
|
-
footer_boundary = page.find_all('line')[-1]
|
96
|
-
footer_region_element = footer_boundary.below()
|
97
|
-
footer_region_element.show(label="Excluded Footer Area (Element)")
|
98
|
-
page.to_image()
|
99
|
-
```
|
100
|
-
|
101
|
-
This element-based approach is usually more reliable as it adapts to the content's position, but it depends on finding consistent boundary elements (like lines or specific text markers).
|
102
|
-
|
103
|
-
<div class="admonition note">
|
104
|
-
<p class="admonition-title">Applying Exclusions</p>
|
105
|
-
|
106
|
-
* `pdf.add_exclusion(func)` applies the exclusion function (which takes a page and returns a region) to *all* pages in the PDF.
|
107
|
-
* `page.add_exclusion(region)` adds an exclusion region only to that specific page.
|
108
|
-
* `extract_text(use_exclusions=False)` can be used to temporarily disable exclusions.
|
109
|
-
</div>
|
@@ -1,332 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "markdown",
|
5
|
-
"id": "a0347143",
|
6
|
-
"metadata": {},
|
7
|
-
"source": [
|
8
|
-
"# Document Question Answering (QA)\n",
|
9
|
-
"\n",
|
10
|
-
"Sometimes, instead of searching for specific text patterns, you just want to ask the document a question directly. `natural-pdf` includes an extractive Question Answering feature.\n",
|
11
|
-
"\n",
|
12
|
-
"\"Extractive\" means it finds the literal answer text within the document, rather than generating a new answer or summarizing.\n",
|
13
|
-
"\n",
|
14
|
-
"Let's ask our `01-practice.pdf` a few questions."
|
15
|
-
]
|
16
|
-
},
|
17
|
-
{
|
18
|
-
"cell_type": "code",
|
19
|
-
"execution_count": 1,
|
20
|
-
"id": "883ee2f6",
|
21
|
-
"metadata": {
|
22
|
-
"execution": {
|
23
|
-
"iopub.execute_input": "2025-04-21T21:24:21.452596Z",
|
24
|
-
"iopub.status.busy": "2025-04-21T21:24:21.452452Z",
|
25
|
-
"iopub.status.idle": "2025-04-21T21:24:21.457320Z",
|
26
|
-
"shell.execute_reply": "2025-04-21T21:24:21.456901Z"
|
27
|
-
}
|
28
|
-
},
|
29
|
-
"outputs": [],
|
30
|
-
"source": [
|
31
|
-
"#%pip install \"natural-pdf[all]\""
|
32
|
-
]
|
33
|
-
},
|
34
|
-
{
|
35
|
-
"cell_type": "code",
|
36
|
-
"execution_count": 2,
|
37
|
-
"id": "abecda7a",
|
38
|
-
"metadata": {
|
39
|
-
"execution": {
|
40
|
-
"iopub.execute_input": "2025-04-21T21:24:21.459009Z",
|
41
|
-
"iopub.status.busy": "2025-04-21T21:24:21.458879Z",
|
42
|
-
"iopub.status.idle": "2025-04-21T21:24:30.401038Z",
|
43
|
-
"shell.execute_reply": "2025-04-21T21:24:30.400660Z"
|
44
|
-
}
|
45
|
-
},
|
46
|
-
"outputs": [
|
47
|
-
{
|
48
|
-
"name": "stderr",
|
49
|
-
"output_type": "stream",
|
50
|
-
"text": [
|
51
|
-
"Device set to use cpu\n"
|
52
|
-
]
|
53
|
-
},
|
54
|
-
{
|
55
|
-
"data": {
|
56
|
-
"text/plain": [
|
57
|
-
"{'answer': 'February 3, 1905',\n",
|
58
|
-
" 'confidence': 0.9979940056800842,\n",
|
59
|
-
" 'start': 6,\n",
|
60
|
-
" 'end': 6,\n",
|
61
|
-
" 'found': True,\n",
|
62
|
-
" 'page_num': 0,\n",
|
63
|
-
" 'source_elements': <ElementCollection[TextElement](count=1)>}"
|
64
|
-
]
|
65
|
-
},
|
66
|
-
"execution_count": 2,
|
67
|
-
"metadata": {},
|
68
|
-
"output_type": "execute_result"
|
69
|
-
}
|
70
|
-
],
|
71
|
-
"source": [
|
72
|
-
"from natural_pdf import PDF\n",
|
73
|
-
"\n",
|
74
|
-
"# Load the PDF and get the page\n",
|
75
|
-
"pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\")\n",
|
76
|
-
"page = pdf.pages[0]\n",
|
77
|
-
"\n",
|
78
|
-
"# Ask about the date\n",
|
79
|
-
"question_1 = \"What is the inspection date?\"\n",
|
80
|
-
"answer_1 = page.ask(question_1)\n",
|
81
|
-
"\n",
|
82
|
-
"# The result is a dictionary with the answer, confidence, etc.\n",
|
83
|
-
"answer_1"
|
84
|
-
]
|
85
|
-
},
|
86
|
-
{
|
87
|
-
"cell_type": "code",
|
88
|
-
"execution_count": 3,
|
89
|
-
"id": "d6b5a66f",
|
90
|
-
"metadata": {
|
91
|
-
"execution": {
|
92
|
-
"iopub.execute_input": "2025-04-21T21:24:30.403287Z",
|
93
|
-
"iopub.status.busy": "2025-04-21T21:24:30.402917Z",
|
94
|
-
"iopub.status.idle": "2025-04-21T21:24:31.285240Z",
|
95
|
-
"shell.execute_reply": "2025-04-21T21:24:31.284848Z"
|
96
|
-
}
|
97
|
-
},
|
98
|
-
"outputs": [
|
99
|
-
{
|
100
|
-
"data": {
|
101
|
-
"text/plain": [
|
102
|
-
"{'answer': 'Jungle Health and Safety Inspection Service',\n",
|
103
|
-
" 'confidence': 0.9988948106765747,\n",
|
104
|
-
" 'start': 0,\n",
|
105
|
-
" 'end': 0,\n",
|
106
|
-
" 'found': True,\n",
|
107
|
-
" 'page_num': 0,\n",
|
108
|
-
" 'source_elements': <ElementCollection[TextElement](count=1)>}"
|
109
|
-
]
|
110
|
-
},
|
111
|
-
"execution_count": 3,
|
112
|
-
"metadata": {},
|
113
|
-
"output_type": "execute_result"
|
114
|
-
}
|
115
|
-
],
|
116
|
-
"source": [
|
117
|
-
"# Ask about the company name\n",
|
118
|
-
"question_2 = \"What company was inspected?\"\n",
|
119
|
-
"answer_2 = page.ask(question_2)\n",
|
120
|
-
"\n",
|
121
|
-
"# Display the answer dictionary\n",
|
122
|
-
"answer_2"
|
123
|
-
]
|
124
|
-
},
|
125
|
-
{
|
126
|
-
"cell_type": "code",
|
127
|
-
"execution_count": 4,
|
128
|
-
"id": "babaee28",
|
129
|
-
"metadata": {
|
130
|
-
"execution": {
|
131
|
-
"iopub.execute_input": "2025-04-21T21:24:31.286992Z",
|
132
|
-
"iopub.status.busy": "2025-04-21T21:24:31.286826Z",
|
133
|
-
"iopub.status.idle": "2025-04-21T21:24:32.069026Z",
|
134
|
-
"shell.execute_reply": "2025-04-21T21:24:32.068668Z"
|
135
|
-
}
|
136
|
-
},
|
137
|
-
"outputs": [
|
138
|
-
{
|
139
|
-
"data": {
|
140
|
-
"text/plain": [
|
141
|
-
"{'answer': 'Inadequate Protective Equipment.',\n",
|
142
|
-
" 'confidence': 0.9997999668121338,\n",
|
143
|
-
" 'start': 26,\n",
|
144
|
-
" 'end': 26,\n",
|
145
|
-
" 'found': True,\n",
|
146
|
-
" 'page_num': 0,\n",
|
147
|
-
" 'source_elements': <ElementCollection[TextElement](count=1)>}"
|
148
|
-
]
|
149
|
-
},
|
150
|
-
"execution_count": 4,
|
151
|
-
"metadata": {},
|
152
|
-
"output_type": "execute_result"
|
153
|
-
}
|
154
|
-
],
|
155
|
-
"source": [
|
156
|
-
"# Ask about specific content from the table\n",
|
157
|
-
"question_3 = \"What is statute 5.8.3 about?\"\n",
|
158
|
-
"answer_3 = page.ask(question_3)\n",
|
159
|
-
"\n",
|
160
|
-
"# Display the answer\n",
|
161
|
-
"answer_3"
|
162
|
-
]
|
163
|
-
},
|
164
|
-
{
|
165
|
-
"cell_type": "markdown",
|
166
|
-
"id": "cf24e07d",
|
167
|
-
"metadata": {},
|
168
|
-
"source": [
|
169
|
-
"The results include the extracted `answer`, a `confidence` score (useful for filtering uncertain answers), the `page_num`, and the `source_elements`.\n",
|
170
|
-
"\n",
|
171
|
-
"## Collecting Results into a DataFrame\n",
|
172
|
-
"\n",
|
173
|
-
"If you're asking multiple questions, it's often useful to collect the results into a pandas DataFrame for easier analysis."
|
174
|
-
]
|
175
|
-
},
|
176
|
-
{
|
177
|
-
"cell_type": "code",
|
178
|
-
"execution_count": 5,
|
179
|
-
"id": "00b777b5",
|
180
|
-
"metadata": {
|
181
|
-
"execution": {
|
182
|
-
"iopub.execute_input": "2025-04-21T21:24:32.070771Z",
|
183
|
-
"iopub.status.busy": "2025-04-21T21:24:32.070607Z",
|
184
|
-
"iopub.status.idle": "2025-04-21T21:24:35.309130Z",
|
185
|
-
"shell.execute_reply": "2025-04-21T21:24:35.308744Z"
|
186
|
-
}
|
187
|
-
},
|
188
|
-
"outputs": [
|
189
|
-
{
|
190
|
-
"data": {
|
191
|
-
"text/html": [
|
192
|
-
"<div>\n",
|
193
|
-
"<style scoped>\n",
|
194
|
-
" .dataframe tbody tr th:only-of-type {\n",
|
195
|
-
" vertical-align: middle;\n",
|
196
|
-
" }\n",
|
197
|
-
"\n",
|
198
|
-
" .dataframe tbody tr th {\n",
|
199
|
-
" vertical-align: top;\n",
|
200
|
-
" }\n",
|
201
|
-
"\n",
|
202
|
-
" .dataframe thead th {\n",
|
203
|
-
" text-align: right;\n",
|
204
|
-
" }\n",
|
205
|
-
"</style>\n",
|
206
|
-
"<table border=\"1\" class=\"dataframe\">\n",
|
207
|
-
" <thead>\n",
|
208
|
-
" <tr style=\"text-align: right;\">\n",
|
209
|
-
" <th></th>\n",
|
210
|
-
" <th>question</th>\n",
|
211
|
-
" <th>answer</th>\n",
|
212
|
-
" <th>confidence</th>\n",
|
213
|
-
" </tr>\n",
|
214
|
-
" </thead>\n",
|
215
|
-
" <tbody>\n",
|
216
|
-
" <tr>\n",
|
217
|
-
" <th>0</th>\n",
|
218
|
-
" <td>What is the inspection date?</td>\n",
|
219
|
-
" <td>February 3, 1905</td>\n",
|
220
|
-
" <td>0.997994</td>\n",
|
221
|
-
" </tr>\n",
|
222
|
-
" <tr>\n",
|
223
|
-
" <th>1</th>\n",
|
224
|
-
" <td>What company was inspected?</td>\n",
|
225
|
-
" <td>Jungle Health and Safety Inspection Service</td>\n",
|
226
|
-
" <td>0.998895</td>\n",
|
227
|
-
" </tr>\n",
|
228
|
-
" <tr>\n",
|
229
|
-
" <th>2</th>\n",
|
230
|
-
" <td>What is statute 5.8.3 about?</td>\n",
|
231
|
-
" <td>Inadequate Protective Equipment.</td>\n",
|
232
|
-
" <td>0.999800</td>\n",
|
233
|
-
" </tr>\n",
|
234
|
-
" <tr>\n",
|
235
|
-
" <th>3</th>\n",
|
236
|
-
" <td>How many violations were there in total?</td>\n",
|
237
|
-
" <td>4.12.7</td>\n",
|
238
|
-
" <td>0.662557</td>\n",
|
239
|
-
" </tr>\n",
|
240
|
-
" </tbody>\n",
|
241
|
-
"</table>\n",
|
242
|
-
"</div>"
|
243
|
-
],
|
244
|
-
"text/plain": [
|
245
|
-
" question \\\n",
|
246
|
-
"0 What is the inspection date? \n",
|
247
|
-
"1 What company was inspected? \n",
|
248
|
-
"2 What is statute 5.8.3 about? \n",
|
249
|
-
"3 How many violations were there in total? \n",
|
250
|
-
"\n",
|
251
|
-
" answer confidence \n",
|
252
|
-
"0 February 3, 1905 0.997994 \n",
|
253
|
-
"1 Jungle Health and Safety Inspection Service 0.998895 \n",
|
254
|
-
"2 Inadequate Protective Equipment. 0.999800 \n",
|
255
|
-
"3 4.12.7 0.662557 "
|
256
|
-
]
|
257
|
-
},
|
258
|
-
"execution_count": 5,
|
259
|
-
"metadata": {},
|
260
|
-
"output_type": "execute_result"
|
261
|
-
}
|
262
|
-
],
|
263
|
-
"source": [
|
264
|
-
"from natural_pdf import PDF\n",
|
265
|
-
"import pandas as pd\n",
|
266
|
-
"\n",
|
267
|
-
"pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\")\n",
|
268
|
-
"page = pdf.pages[0]\n",
|
269
|
-
"\n",
|
270
|
-
"# List of questions to ask\n",
|
271
|
-
"questions = [\n",
|
272
|
-
" \"What is the inspection date?\",\n",
|
273
|
-
" \"What company was inspected?\",\n",
|
274
|
-
" \"What is statute 5.8.3 about?\",\n",
|
275
|
-
" \"How many violations were there in total?\" # This might be less reliable\n",
|
276
|
-
"]\n",
|
277
|
-
"\n",
|
278
|
-
"# Collect answers for each question\n",
|
279
|
-
"results = []\n",
|
280
|
-
"for q in questions:\n",
|
281
|
-
" answer_dict = page.ask(q)\n",
|
282
|
-
" # Add the original question to the dictionary\n",
|
283
|
-
" answer_dict['question'] = q\n",
|
284
|
-
" results.append(answer_dict)\n",
|
285
|
-
"\n",
|
286
|
-
"# Convert the list of dictionaries to a DataFrame\n",
|
287
|
-
"# We select only the most relevant columns here\n",
|
288
|
-
"df_results = pd.DataFrame(results)[['question', 'answer', 'confidence']]\n",
|
289
|
-
"\n",
|
290
|
-
"# Display the DataFrame\n",
|
291
|
-
"df_results"
|
292
|
-
]
|
293
|
-
},
|
294
|
-
{
|
295
|
-
"cell_type": "markdown",
|
296
|
-
"id": "381130c6",
|
297
|
-
"metadata": {},
|
298
|
-
"source": [
|
299
|
-
"This shows how you can iterate through questions, collect the answer dictionaries, and then create a structured DataFrame, making it easy to review questions, answers, and their confidence levels together.\n",
|
300
|
-
"\n",
|
301
|
-
"<div class=\"admonition note\">\n",
|
302
|
-
"<p class=\"admonition-title\">QA Model and Limitations</p>\n",
|
303
|
-
"\n",
|
304
|
-
" * The QA system relies on underlying transformer models. Performance and confidence scores vary.\n",
|
305
|
-
" * It works best for questions where the answer is explicitly stated. It cannot synthesize information or perform calculations (e.g., counting items might fail or return text containing a number rather than the count itself).\n",
|
306
|
-
" * You can potentially specify different QA models via the `model=` argument in `page.ask()` if others are configured.\n",
|
307
|
-
"</div> "
|
308
|
-
]
|
309
|
-
}
|
310
|
-
],
|
311
|
-
"metadata": {
|
312
|
-
"jupytext": {
|
313
|
-
"cell_metadata_filter": "-all",
|
314
|
-
"main_language": "python",
|
315
|
-
"notebook_metadata_filter": "-all"
|
316
|
-
},
|
317
|
-
"language_info": {
|
318
|
-
"codemirror_mode": {
|
319
|
-
"name": "ipython",
|
320
|
-
"version": 3
|
321
|
-
},
|
322
|
-
"file_extension": ".py",
|
323
|
-
"mimetype": "text/x-python",
|
324
|
-
"name": "python",
|
325
|
-
"nbconvert_exporter": "python",
|
326
|
-
"pygments_lexer": "ipython3",
|
327
|
-
"version": "3.10.13"
|
328
|
-
}
|
329
|
-
},
|
330
|
-
"nbformat": 4,
|
331
|
-
"nbformat_minor": 5
|
332
|
-
}
|
docs/tutorials/06-document-qa.md
DELETED
@@ -1,91 +0,0 @@
|
|
1
|
-
# Document Question Answering (QA)
|
2
|
-
|
3
|
-
Sometimes, instead of searching for specific text patterns, you just want to ask the document a question directly. `natural-pdf` includes an extractive Question Answering feature.
|
4
|
-
|
5
|
-
"Extractive" means it finds the literal answer text within the document, rather than generating a new answer or summarizing.
|
6
|
-
|
7
|
-
Let's ask our `01-practice.pdf` a few questions.
|
8
|
-
|
9
|
-
```python
|
10
|
-
#%pip install "natural-pdf[all]"
|
11
|
-
```
|
12
|
-
|
13
|
-
```python
|
14
|
-
from natural_pdf import PDF
|
15
|
-
|
16
|
-
# Load the PDF and get the page
|
17
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
18
|
-
page = pdf.pages[0]
|
19
|
-
|
20
|
-
# Ask about the date
|
21
|
-
question_1 = "What is the inspection date?"
|
22
|
-
answer_1 = page.ask(question_1)
|
23
|
-
|
24
|
-
# The result is a dictionary with the answer, confidence, etc.
|
25
|
-
answer_1
|
26
|
-
```
|
27
|
-
|
28
|
-
```python
|
29
|
-
# Ask about the company name
|
30
|
-
question_2 = "What company was inspected?"
|
31
|
-
answer_2 = page.ask(question_2)
|
32
|
-
|
33
|
-
# Display the answer dictionary
|
34
|
-
answer_2
|
35
|
-
```
|
36
|
-
|
37
|
-
```python
|
38
|
-
# Ask about specific content from the table
|
39
|
-
question_3 = "What is statute 5.8.3 about?"
|
40
|
-
answer_3 = page.ask(question_3)
|
41
|
-
|
42
|
-
# Display the answer
|
43
|
-
answer_3
|
44
|
-
```
|
45
|
-
|
46
|
-
The results include the extracted `answer`, a `confidence` score (useful for filtering uncertain answers), the `page_num`, and the `source_elements`.
|
47
|
-
|
48
|
-
## Collecting Results into a DataFrame
|
49
|
-
|
50
|
-
If you're asking multiple questions, it's often useful to collect the results into a pandas DataFrame for easier analysis.
|
51
|
-
|
52
|
-
```python
|
53
|
-
from natural_pdf import PDF
|
54
|
-
import pandas as pd
|
55
|
-
|
56
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
57
|
-
page = pdf.pages[0]
|
58
|
-
|
59
|
-
# List of questions to ask
|
60
|
-
questions = [
|
61
|
-
"What is the inspection date?",
|
62
|
-
"What company was inspected?",
|
63
|
-
"What is statute 5.8.3 about?",
|
64
|
-
"How many violations were there in total?" # This might be less reliable
|
65
|
-
]
|
66
|
-
|
67
|
-
# Collect answers for each question
|
68
|
-
results = []
|
69
|
-
for q in questions:
|
70
|
-
answer_dict = page.ask(q)
|
71
|
-
# Add the original question to the dictionary
|
72
|
-
answer_dict['question'] = q
|
73
|
-
results.append(answer_dict)
|
74
|
-
|
75
|
-
# Convert the list of dictionaries to a DataFrame
|
76
|
-
# We select only the most relevant columns here
|
77
|
-
df_results = pd.DataFrame(results)[['question', 'answer', 'confidence']]
|
78
|
-
|
79
|
-
# Display the DataFrame
|
80
|
-
df_results
|
81
|
-
```
|
82
|
-
|
83
|
-
This shows how you can iterate through questions, collect the answer dictionaries, and then create a structured DataFrame, making it easy to review questions, answers, and their confidence levels together.
|
84
|
-
|
85
|
-
<div class="admonition note">
|
86
|
-
<p class="admonition-title">QA Model and Limitations</p>
|
87
|
-
|
88
|
-
* The QA system relies on underlying transformer models. Performance and confidence scores vary.
|
89
|
-
* It works best for questions where the answer is explicitly stated. It cannot synthesize information or perform calculations (e.g., counting items might fail or return text containing a number rather than the count itself).
|
90
|
-
* You can potentially specify different QA models via the `model=` argument in `page.ask()` if others are configured.
|
91
|
-
</div>
|