natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/text-extraction/index.md
DELETED
@@ -1,292 +0,0 @@
|
|
1
|
-
# Text Extraction Guide
|
2
|
-
|
3
|
-
This guide demonstrates various ways to extract text from PDFs using Natural PDF, from simple page dumps to targeted extraction based on elements, regions, and styles.
|
4
|
-
|
5
|
-
## Setup
|
6
|
-
|
7
|
-
First, let's import necessary libraries and load a sample PDF. We'll use `example.pdf` from the tutorials' `pdfs` directory. *Adjust the path if your setup differs.*
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load the PDF
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
14
|
-
|
15
|
-
# Select the first page for initial examples
|
16
|
-
page = pdf.pages[0]
|
17
|
-
|
18
|
-
# Display the first page
|
19
|
-
page.show(width=700)
|
20
|
-
```
|
21
|
-
|
22
|
-
## Basic Text Extraction
|
23
|
-
|
24
|
-
Get all text from a page or the entire document.
|
25
|
-
|
26
|
-
```python
|
27
|
-
# Extract all text from the first page
|
28
|
-
# Displaying first 500 characters
|
29
|
-
print(page.extract_text()[:500])
|
30
|
-
```
|
31
|
-
|
32
|
-
You can also preserve layout with `layout=True`.
|
33
|
-
|
34
|
-
```python
|
35
|
-
# Extract text from the entire document (may take time)
|
36
|
-
# Uncomment to run:
|
37
|
-
print(page.extract_text(layout=True)[:2000])
|
38
|
-
```
|
39
|
-
|
40
|
-
## Extracting Text from Specific Elements
|
41
|
-
|
42
|
-
Use selectors with `find()` or `find_all()` to target specific elements. *Selectors like `:contains("Summary")` are examples; adapt them to your PDF.*
|
43
|
-
|
44
|
-
```python
|
45
|
-
# Find a single element, e.g., a title containing "Summary"
|
46
|
-
# Adjust selector as needed
|
47
|
-
date_element = page.find('text:contains("Site")')
|
48
|
-
date_element # Display the found element object
|
49
|
-
```
|
50
|
-
|
51
|
-
```python
|
52
|
-
date_element.show()
|
53
|
-
```
|
54
|
-
|
55
|
-
```python
|
56
|
-
date_element.text
|
57
|
-
```
|
58
|
-
|
59
|
-
```python
|
60
|
-
# Find multiple elements, e.g., bold headings (size >= 8)
|
61
|
-
heading_elements = page.find_all('text[size>=8]:bold')
|
62
|
-
heading_elements
|
63
|
-
```
|
64
|
-
|
65
|
-
```python
|
66
|
-
page.find_all('text[size>=8]:bold').show()
|
67
|
-
```
|
68
|
-
|
69
|
-
```python
|
70
|
-
# Pull out all of their text (why? I don't know!)
|
71
|
-
print(heading_elements.extract_text())
|
72
|
-
```
|
73
|
-
|
74
|
-
## Advanced text searches
|
75
|
-
|
76
|
-
```python
|
77
|
-
# Exact phrase (case-sensitive)
|
78
|
-
page.find('text:contains("Hazardous Materials")').text
|
79
|
-
```
|
80
|
-
|
81
|
-
```python
|
82
|
-
# Case-insensitive phrase search (case=False)
|
83
|
-
page.find('text:contains("HAZARDOUS MATERIALS")', case=False).text
|
84
|
-
```
|
85
|
-
|
86
|
-
```python
|
87
|
-
# Regular expression (e.g., "YYYY Report")
|
88
|
-
regex = r"\d+, \d{4}"
|
89
|
-
page.find(f'text:contains("{regex}")', regex=True)
|
90
|
-
```
|
91
|
-
|
92
|
-
```python
|
93
|
-
# Attribute selectors: exact font name and size
|
94
|
-
page.find_all('text[fontname="Helvetica"][size=10]')
|
95
|
-
```
|
96
|
-
|
97
|
-
## Regions
|
98
|
-
|
99
|
-
```python
|
100
|
-
# Region below an element (e.g., below "Introduction")
|
101
|
-
# Adjust selector as needed
|
102
|
-
page.find('text:contains("Summary")').below(include_element=True).show()
|
103
|
-
```
|
104
|
-
|
105
|
-
```python
|
106
|
-
(
|
107
|
-
page
|
108
|
-
.find('text:contains("Summary")')
|
109
|
-
.below(include_element=True)
|
110
|
-
.extract_text()
|
111
|
-
[:500]
|
112
|
-
)
|
113
|
-
```
|
114
|
-
|
115
|
-
```python
|
116
|
-
(
|
117
|
-
page
|
118
|
-
.find('text:contains("Summary")')
|
119
|
-
.below(include_element=True, until='line:horizontal')
|
120
|
-
.show()
|
121
|
-
)
|
122
|
-
```
|
123
|
-
|
124
|
-
```python
|
125
|
-
# Manually defined region via coordinates (x0, top, x1, bottom)
|
126
|
-
manual_region = page.create_region(30, 60, 600, 300)
|
127
|
-
manual_region.show()
|
128
|
-
```
|
129
|
-
|
130
|
-
```python
|
131
|
-
# Extract text from the manual region
|
132
|
-
manual_region.extract_text()[:500]
|
133
|
-
```
|
134
|
-
|
135
|
-
## Filtering Out Headers and Footers
|
136
|
-
|
137
|
-
Use Exclusion Zones to remove unwanted content before extraction. *Adjust selectors for typical header/footer content.*
|
138
|
-
|
139
|
-
```python
|
140
|
-
header_content = page.find('rect')
|
141
|
-
footer_content = page.find_all('line')[-1].below()
|
142
|
-
|
143
|
-
header_content.highlight()
|
144
|
-
footer_content.highlight()
|
145
|
-
page.to_image()
|
146
|
-
```
|
147
|
-
|
148
|
-
```python
|
149
|
-
page.extract_text()[:500]
|
150
|
-
```
|
151
|
-
|
152
|
-
```python
|
153
|
-
page.add_exclusion(header_content)
|
154
|
-
page.add_exclusion(footer_content)
|
155
|
-
```
|
156
|
-
|
157
|
-
```python
|
158
|
-
page.extract_text()[:500]
|
159
|
-
```
|
160
|
-
|
161
|
-
```python
|
162
|
-
full_text_no_exclusions = page.extract_text(use_exclusions=False)
|
163
|
-
clean_text = page.extract_text()
|
164
|
-
f"Original length: {len(full_text_no_exclusions)}, Excluded length: {len(clean_text)}"
|
165
|
-
```
|
166
|
-
|
167
|
-
```python
|
168
|
-
page.clear_exclusions()
|
169
|
-
```
|
170
|
-
|
171
|
-
*Exclusions can also be defined globally at the PDF level using `pdf.add_exclusion()` with a function.*
|
172
|
-
|
173
|
-
## Controlling Whitespace
|
174
|
-
|
175
|
-
Manage how spaces and blank lines are handled during extraction using `layout`.
|
176
|
-
|
177
|
-
```python
|
178
|
-
print(page.extract_text())
|
179
|
-
```
|
180
|
-
|
181
|
-
```python
|
182
|
-
print(page.extract_text(use_exclusions=False, layout=True))
|
183
|
-
```
|
184
|
-
|
185
|
-
### Font Information Access
|
186
|
-
|
187
|
-
Inspect font details of text elements.
|
188
|
-
|
189
|
-
```python
|
190
|
-
# Find the first text element on the page
|
191
|
-
first_text = page.find_all('text')[1]
|
192
|
-
first_text # Display basic info
|
193
|
-
```
|
194
|
-
|
195
|
-
```python
|
196
|
-
# Highlight the first text element
|
197
|
-
first_text.show()
|
198
|
-
```
|
199
|
-
|
200
|
-
```python
|
201
|
-
# Get detailed font properties dictionary
|
202
|
-
first_text.font_info()
|
203
|
-
```
|
204
|
-
|
205
|
-
```python
|
206
|
-
# Check specific style properties directly
|
207
|
-
f"Is Bold: {first_text.bold}, Is Italic: {first_text.italic}, Font: {first_text.fontname}, Size: {first_text.size}"
|
208
|
-
```
|
209
|
-
|
210
|
-
```python
|
211
|
-
# Find elements by font attributes (adjust selectors)
|
212
|
-
# Example: Find Helvetica fonts
|
213
|
-
arial_text = page.find_all('text[fontname*=Helvetica]')
|
214
|
-
arial_text # Display list of found elements
|
215
|
-
```
|
216
|
-
|
217
|
-
```python
|
218
|
-
# Example: Find large text (e.g., size >= 12)
|
219
|
-
large_text = page.find_all('text[size>=12]')
|
220
|
-
large_text
|
221
|
-
```
|
222
|
-
|
223
|
-
```python
|
224
|
-
# Example: Find bold text
|
225
|
-
bold_text = page.find_all('text:bold')
|
226
|
-
bold_text
|
227
|
-
```
|
228
|
-
|
229
|
-
## Working with Font Styles
|
230
|
-
|
231
|
-
Analyze and group text elements by their computed font *style*, which combines attributes like font name, size, boldness, etc., into logical groups.
|
232
|
-
|
233
|
-
```python
|
234
|
-
# Analyze styles on the page
|
235
|
-
# This returns a dictionary mapping style names to ElementList objects
|
236
|
-
page.analyze_text_styles()
|
237
|
-
page.text_style_labels
|
238
|
-
```
|
239
|
-
|
240
|
-
```python
|
241
|
-
page.find_all('text').highlight(group_by='style_label').to_image()
|
242
|
-
```
|
243
|
-
|
244
|
-
```python
|
245
|
-
page.find_all('text[style_label="8.0pt Helvetica"]')
|
246
|
-
```
|
247
|
-
|
248
|
-
```python
|
249
|
-
page.find_all('text[fontname="Helvetica"][size=8]')
|
250
|
-
```
|
251
|
-
|
252
|
-
*Font variants (e.g., `AAAAAB+FontName`) are also accessible via the `font-variant` attribute selector: `page.find_all('text[font-variant="AAAAAB"]')`.*
|
253
|
-
|
254
|
-
## Reading Order
|
255
|
-
|
256
|
-
Text extraction respects a pathetic attempt at natural reading order (top-to-bottom, left-to-right by default). `page.find_all('text')` returns elements already sorted this way.
|
257
|
-
|
258
|
-
```python
|
259
|
-
# Get first 5 text elements in reading order
|
260
|
-
elements_in_order = page.find_all('text')
|
261
|
-
elements_in_order[:5]
|
262
|
-
```
|
263
|
-
|
264
|
-
```python
|
265
|
-
# Text extracted via page.extract_text() respects this order automatically
|
266
|
-
# (Result already shown in Basic Text Extraction section)
|
267
|
-
page.extract_text()[:100]
|
268
|
-
```
|
269
|
-
|
270
|
-
## Element Navigation
|
271
|
-
|
272
|
-
Move between elements sequentially based on reading order using `.next()` and `.previous()`.
|
273
|
-
|
274
|
-
```python
|
275
|
-
page.clear_highlights()
|
276
|
-
|
277
|
-
start = page.find('text:contains("Date")')
|
278
|
-
start.highlight(label='Date label')
|
279
|
-
start.next().highlight(label='Maybe the date', color='green')
|
280
|
-
start.next(r'text:contains("\d")', regex=True).highlight(label='Probably the date')
|
281
|
-
|
282
|
-
page.to_image()
|
283
|
-
```
|
284
|
-
|
285
|
-
## Next Steps
|
286
|
-
|
287
|
-
Now that you know how to extract text, you might want to explore:
|
288
|
-
|
289
|
-
- [Working with regions](../regions/index.ipynb) for more precise extraction
|
290
|
-
- [OCR capabilities](../ocr/index.md) for scanned documents
|
291
|
-
- [Document layout analysis](../layout-analysis/index.ipynb) for automatic structure detection
|
292
|
-
- [Document QA](../document-qa/index.ipynb) for asking questions directly to your documents
|
@@ -1,194 +0,0 @@
|
|
1
|
-
{
|
2
|
-
"cells": [
|
3
|
-
{
|
4
|
-
"cell_type": "markdown",
|
5
|
-
"id": "2e20408a",
|
6
|
-
"metadata": {},
|
7
|
-
"source": [
|
8
|
-
"# Loading and Basic Text Extraction"
|
9
|
-
]
|
10
|
-
},
|
11
|
-
{
|
12
|
-
"cell_type": "code",
|
13
|
-
"execution_count": null,
|
14
|
-
"id": "0fee7ac5",
|
15
|
-
"metadata": {},
|
16
|
-
"outputs": [],
|
17
|
-
"source": [
|
18
|
-
"#%pip install \"natural-pdf[all]\""
|
19
|
-
]
|
20
|
-
},
|
21
|
-
{
|
22
|
-
"cell_type": "markdown",
|
23
|
-
"id": "765def8e",
|
24
|
-
"metadata": {},
|
25
|
-
"source": [
|
26
|
-
"In this tutorial, we'll learn how to:\n",
|
27
|
-
"\n",
|
28
|
-
"1. Load a PDF document\n",
|
29
|
-
"2. Extract text from pages\n",
|
30
|
-
"3. Extract specific elements \n",
|
31
|
-
"\n",
|
32
|
-
"## Loading a PDF\n",
|
33
|
-
"\n",
|
34
|
-
"Let's start by loading a PDF file:"
|
35
|
-
]
|
36
|
-
},
|
37
|
-
{
|
38
|
-
"cell_type": "code",
|
39
|
-
"execution_count": null,
|
40
|
-
"id": "2aca41e4",
|
41
|
-
"metadata": {},
|
42
|
-
"outputs": [],
|
43
|
-
"source": [
|
44
|
-
"from natural_pdf import PDF\n",
|
45
|
-
"import os\n",
|
46
|
-
"\n",
|
47
|
-
"# Load a PDF file\n",
|
48
|
-
"pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf\")\n",
|
49
|
-
"\n",
|
50
|
-
"# Basic info about the document\n",
|
51
|
-
"{\n",
|
52
|
-
" \"Filename\": os.path.basename(pdf.path),\n",
|
53
|
-
" \"Pages\": len(pdf.pages),\n",
|
54
|
-
" \"Title\": pdf.metadata.get(\"Title\", \"N/A\"),\n",
|
55
|
-
" \"Author\": pdf.metadata.get(\"Author\", \"N/A\")\n",
|
56
|
-
"}"
|
57
|
-
]
|
58
|
-
},
|
59
|
-
{
|
60
|
-
"cell_type": "markdown",
|
61
|
-
"id": "4e425d78",
|
62
|
-
"metadata": {},
|
63
|
-
"source": [
|
64
|
-
"## Extracting Text\n",
|
65
|
-
"\n",
|
66
|
-
"Now that we have loaded the PDF, let's extract the text from the first page:"
|
67
|
-
]
|
68
|
-
},
|
69
|
-
{
|
70
|
-
"cell_type": "code",
|
71
|
-
"execution_count": null,
|
72
|
-
"id": "a6fa636e",
|
73
|
-
"metadata": {},
|
74
|
-
"outputs": [],
|
75
|
-
"source": [
|
76
|
-
"# Get the first page\n",
|
77
|
-
"page = pdf.pages[0]\n",
|
78
|
-
"\n",
|
79
|
-
"# Extract text from the page\n",
|
80
|
-
"text = page.extract_text()\n",
|
81
|
-
"\n",
|
82
|
-
"# Show the first 200 characters of the text\n",
|
83
|
-
"print(text[:200])"
|
84
|
-
]
|
85
|
-
},
|
86
|
-
{
|
87
|
-
"cell_type": "markdown",
|
88
|
-
"id": "03e5318c",
|
89
|
-
"metadata": {},
|
90
|
-
"source": [
|
91
|
-
"## Finding and Extracting Specific Elements\n",
|
92
|
-
"\n",
|
93
|
-
"We can find specific elements using spatial queries and text content:"
|
94
|
-
]
|
95
|
-
},
|
96
|
-
{
|
97
|
-
"cell_type": "code",
|
98
|
-
"execution_count": null,
|
99
|
-
"id": "ca478166",
|
100
|
-
"metadata": {},
|
101
|
-
"outputs": [],
|
102
|
-
"source": [
|
103
|
-
"# Find text elements containing specific words\n",
|
104
|
-
"elements = page.find_all('text:contains(\"Inadequate\")')\n",
|
105
|
-
"\n",
|
106
|
-
"# Show these elements on the page\n",
|
107
|
-
"page.clear_highlights()\n",
|
108
|
-
"elements.highlight(color=\"red\", label=\"Contains 'Inadequate'\")\n",
|
109
|
-
"\n",
|
110
|
-
"# Display the page to see them\n",
|
111
|
-
"page.to_image(width=700)"
|
112
|
-
]
|
113
|
-
},
|
114
|
-
{
|
115
|
-
"cell_type": "markdown",
|
116
|
-
"id": "85311beb",
|
117
|
-
"metadata": {},
|
118
|
-
"source": [
|
119
|
-
"## Working with Layout Regions\n",
|
120
|
-
"\n",
|
121
|
-
"We can analyze the layout of the page to identify different regions:"
|
122
|
-
]
|
123
|
-
},
|
124
|
-
{
|
125
|
-
"cell_type": "code",
|
126
|
-
"execution_count": null,
|
127
|
-
"id": "d2eaa3cb",
|
128
|
-
"metadata": {},
|
129
|
-
"outputs": [],
|
130
|
-
"source": [
|
131
|
-
"# Analyze the page layout\n",
|
132
|
-
"page.analyze_layout(engine='yolo')\n",
|
133
|
-
"\n",
|
134
|
-
"# Find and highlight all detected regions\n",
|
135
|
-
"page.clear_highlights()\n",
|
136
|
-
"page.find_all('region').highlight(group_by='type')\n",
|
137
|
-
"\n",
|
138
|
-
"# Display the page to see the regions\n",
|
139
|
-
"page.to_image(width=900)"
|
140
|
-
]
|
141
|
-
},
|
142
|
-
{
|
143
|
-
"cell_type": "markdown",
|
144
|
-
"id": "5e562020",
|
145
|
-
"metadata": {},
|
146
|
-
"source": [
|
147
|
-
"## Working with Multiple Pages\n",
|
148
|
-
"\n",
|
149
|
-
"You can also work with multiple pages:"
|
150
|
-
]
|
151
|
-
},
|
152
|
-
{
|
153
|
-
"cell_type": "code",
|
154
|
-
"execution_count": null,
|
155
|
-
"id": "ed5bb91e",
|
156
|
-
"metadata": {},
|
157
|
-
"outputs": [],
|
158
|
-
"source": [
|
159
|
-
"# Process all pages\n",
|
160
|
-
"for page in pdf.pages:\n",
|
161
|
-
" page_text = page.extract_text()\n",
|
162
|
-
" print(f\"Page {page.number}\", page_text[:100]) # First 100 chars of each page"
|
163
|
-
]
|
164
|
-
},
|
165
|
-
{
|
166
|
-
"cell_type": "markdown",
|
167
|
-
"id": "e2b04ab1",
|
168
|
-
"metadata": {},
|
169
|
-
"source": [
|
170
|
-
"This tutorial covered the basics of loading PDFs and extracting text. In the next tutorials, we'll explore more advanced features like searching for specific elements, extracting structured content, and working with tables. "
|
171
|
-
]
|
172
|
-
},
|
173
|
-
{
|
174
|
-
"cell_type": "code",
|
175
|
-
"execution_count": null,
|
176
|
-
"id": "3112c5b6",
|
177
|
-
"metadata": {},
|
178
|
-
"outputs": [],
|
179
|
-
"source": [
|
180
|
-
"%%bash\n",
|
181
|
-
"pip install \"natural-pdf[all]\""
|
182
|
-
]
|
183
|
-
}
|
184
|
-
],
|
185
|
-
"metadata": {
|
186
|
-
"jupytext": {
|
187
|
-
"cell_metadata_filter": "-all",
|
188
|
-
"main_language": "python",
|
189
|
-
"notebook_metadata_filter": "-all"
|
190
|
-
}
|
191
|
-
},
|
192
|
-
"nbformat": 4,
|
193
|
-
"nbformat_minor": 5
|
194
|
-
}
|
@@ -1,95 +0,0 @@
|
|
1
|
-
# Loading and Basic Text Extraction
|
2
|
-
|
3
|
-
```python
|
4
|
-
#%pip install "natural-pdf[all]"
|
5
|
-
```
|
6
|
-
|
7
|
-
In this tutorial, we'll learn how to:
|
8
|
-
|
9
|
-
1. Load a PDF document
|
10
|
-
2. Extract text from pages
|
11
|
-
3. Extract specific elements
|
12
|
-
|
13
|
-
## Loading a PDF
|
14
|
-
|
15
|
-
Let's start by loading a PDF file:
|
16
|
-
|
17
|
-
```python
|
18
|
-
from natural_pdf import PDF
|
19
|
-
import os
|
20
|
-
|
21
|
-
# Load a PDF file
|
22
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
23
|
-
|
24
|
-
# Basic info about the document
|
25
|
-
{
|
26
|
-
"Filename": os.path.basename(pdf.path),
|
27
|
-
"Pages": len(pdf.pages),
|
28
|
-
"Title": pdf.metadata.get("Title", "N/A"),
|
29
|
-
"Author": pdf.metadata.get("Author", "N/A")
|
30
|
-
}
|
31
|
-
```
|
32
|
-
|
33
|
-
## Extracting Text
|
34
|
-
|
35
|
-
Now that we have loaded the PDF, let's extract the text from the first page:
|
36
|
-
|
37
|
-
```python
|
38
|
-
# Get the first page
|
39
|
-
page = pdf.pages[0]
|
40
|
-
|
41
|
-
# Extract text from the page
|
42
|
-
text = page.extract_text()
|
43
|
-
|
44
|
-
# Show the first 200 characters of the text
|
45
|
-
print(text[:200])
|
46
|
-
```
|
47
|
-
|
48
|
-
## Finding and Extracting Specific Elements
|
49
|
-
|
50
|
-
We can find specific elements using spatial queries and text content:
|
51
|
-
|
52
|
-
```python
|
53
|
-
# Find text elements containing specific words
|
54
|
-
elements = page.find_all('text:contains("Inadequate")')
|
55
|
-
|
56
|
-
# Show these elements on the page
|
57
|
-
page.clear_highlights()
|
58
|
-
elements.highlight(color="red", label="Contains 'Inadequate'")
|
59
|
-
|
60
|
-
# Display the page to see them
|
61
|
-
page.to_image(width=700)
|
62
|
-
```
|
63
|
-
|
64
|
-
## Working with Layout Regions
|
65
|
-
|
66
|
-
We can analyze the layout of the page to identify different regions:
|
67
|
-
|
68
|
-
```python
|
69
|
-
# Analyze the page layout
|
70
|
-
page.analyze_layout(engine='yolo')
|
71
|
-
|
72
|
-
# Find and highlight all detected regions
|
73
|
-
page.clear_highlights()
|
74
|
-
page.find_all('region').highlight(group_by='type')
|
75
|
-
|
76
|
-
# Display the page to see the regions
|
77
|
-
page.to_image(width=900)
|
78
|
-
```
|
79
|
-
|
80
|
-
## Working with Multiple Pages
|
81
|
-
|
82
|
-
You can also work with multiple pages:
|
83
|
-
|
84
|
-
```python
|
85
|
-
# Process all pages
|
86
|
-
for page in pdf.pages:
|
87
|
-
page_text = page.extract_text()
|
88
|
-
print(f"Page {page.number}", page_text[:100]) # First 100 chars of each page
|
89
|
-
```
|
90
|
-
|
91
|
-
This tutorial covered the basics of loading PDFs and extracting text. In the next tutorials, we'll explore more advanced features like searching for specific elements, extracting structured content, and working with tables.
|
92
|
-
|
93
|
-
```bash
|
94
|
-
pip install "natural-pdf[all]"
|
95
|
-
```
|