natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/layout-analysis/index.md
DELETED
@@ -1,185 +0,0 @@

# Document Layout Analysis

Natural PDF can automatically detect the structure of a document (titles, paragraphs, tables, figures) using layout analysis models. This guide shows how to use this feature.

## Setup

We'll use a sample PDF that includes various layout elements.

```python
from natural_pdf import PDF

pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
page = pdf.pages[0]

page.to_image(width=700)
```

## Running Basic Layout Analysis

Use the `analyze_layout()` method. By default, it uses the YOLO model.

```python
# Analyze the layout using the default engine (YOLO)
# This adds 'region' elements to the page
page.analyze_layout()
```

```python
# Find all detected regions
regions = page.find_all('region')
len(regions)  # Show how many regions were detected
```

```python
first_region = regions[0]
f"First region: type='{first_region.type}', confidence={first_region.confidence:.2f}"
```

## Visualizing Detected Layout

Use `highlight()` or `show()` on the detected regions.

```python
# Highlight all detected regions, colored by type
regions.highlight(group_by='type')
page.to_image(width=700)
```

## Finding Specific Region Types

Use attribute selectors to find regions of a specific type.

```python
# Find all detected titles
titles = page.find_all('region[type=title]')
titles
```

```python
titles.show()
```

```python
page.find_all('region[type=table]').show()
```

## Working with Layout Regions

Detected regions are like any other `Region` object. You can extract text, find elements within them, etc.

```python
page.find('region[type=table]').extract_text(layout=True)
```
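
Because detected regions behave like any other `Region`, you can also search inside them. A small sketch of that idea, assuming a title region was detected on this page:

```python
# Work inside a detected region just like a manually created one
title_region = page.find('region[type=title]')
if title_region:
    # Text elements that fall within the detected title's bounds
    words_in_title = title_region.find_all('text')
    print(len(words_in_title), title_region.extract_text())
```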

## Using Different Layout Models

Natural PDF supports multiple engines (`yolo`, `paddle`, `tatr`, `docling`, `surya`). Specify the engine when calling `analyze_layout()`.

*Note: Using different engines requires installing the corresponding extras (e.g., `natural-pdf[layout_paddle]`). `yolo` is the default.*

```python
page.clear_detected_layout_regions()
page.clear_highlights()

page.analyze_layout(engine="paddle")
page.find_all('region[model=paddle]').highlight(group_by='region_type')
page.to_image(width=700)
```

```python
# Analyze using Table Transformer (TATR) - specialized for tables
page.clear_detected_layout_regions()
page.clear_highlights()

page.analyze_layout(engine="tatr")
page.find_all('region[model=tatr]').highlight(group_by='region_type')
page.to_image(width=700)
```

```python
# Analyze using Docling
page.clear_detected_layout_regions()
page.clear_highlights()

page.analyze_layout(engine="docling")
page.find_all('region[model=docling]').highlight(group_by='region_type')
page.to_image(width=700)
```

```python
# Analyze using Surya
page.clear_detected_layout_regions()
page.clear_highlights()

page.analyze_layout(engine="surya")
page.find_all('region[model=surya]').highlight(group_by='region_type')
page.to_image(width=700)
```

*Note: Calling `analyze_layout` multiple times (even with the same engine) can add duplicate regions. You might want to use `page.clear_detected_layout_regions()` first, or filter by model using `region[model=yolo]`.*

## Controlling Confidence Threshold

Filter detections by their confidence score.

```python
# Re-run YOLO analysis (clearing previous results avoids duplicates)
page.clear_detected_layout_regions()
page.analyze_layout(engine="yolo")

# Find only high-confidence regions (e.g., >= 0.8)
high_conf_regions = page.find_all('region[confidence>=0.8]')
len(high_conf_regions)
```

## Table Structure with TATR

The TATR engine provides detailed table structure elements (`table`, `table-row`, `table-column`, `table-column-header`). This is very useful for precise table extraction.

```python
# Ensure TATR analysis has been run
page.clear_detected_layout_regions()
page.clear_highlights()

page.analyze_layout(engine="tatr")
page.find_all('region[model=tatr]').highlight(group_by='region_type')
page.to_image(width=700)
```

```python
# Find different structural elements from TATR
tables = page.find_all('region[type=table][model=tatr]')
rows = page.find_all('region[type=table-row][model=tatr]')
cols = page.find_all('region[type=table-column][model=tatr]')
hdrs = page.find_all('region[type=table-column-header][model=tatr]')

f"Found: {len(tables)} tables, {len(rows)} rows, {len(cols)} columns, {len(hdrs)} headers (from TATR)"
```

### Enhanced Table Extraction with TATR

When a `region[type=table]` comes from the TATR model, `extract_table()` can use the underlying row/column structure for more robust extraction.

```python
# Find the TATR table region again
tatr_table = page.find('region[type=table][model=tatr]')

# This extraction uses the detected rows/columns
tatr_table.extract_table()
```

If you'd like the standard approach instead of the "intelligent" one, you can ask for pdfplumber.

```python
# This extraction uses pdfplumber's standard algorithm instead of the detected rows/columns
tatr_table.extract_table(method='pdfplumber')
```

## Next Steps

Layout analysis provides regions that you can use for:

- [Table Extraction](../tables/index.ipynb): Especially powerful with TATR regions.
- [Text Extraction](../text-extraction/index.ipynb): Extract text only from specific region types (e.g., paragraphs).
- [Document QA](../document-qa/index.ipynb): Focus question answering on specific detected regions.
docs/ocr/index.md
DELETED
@@ -1,209 +0,0 @@

# OCR Integration

Natural PDF includes OCR (Optical Character Recognition) to extract text from scanned documents or images embedded in PDFs.

## OCR Engine Comparison

Natural PDF supports multiple OCR engines:

| Feature | EasyOCR | PaddleOCR | Surya OCR |
|----------------------|-------------------------------------|------------------------------------------|----------------------------------------|
| **Installation** | `natural-pdf[easyocr]` | `natural-pdf[paddle]` | `natural-pdf[surya]` |
| **Primary Strength** | Good general performance, simpler | Excellent Asian language support, speed | High accuracy, multilingual lines |
| **Speed** | Moderate | Fast | Moderate (GPU recommended) |
| **Memory Usage** | Higher | Efficient | Higher (GPU recommended) |
| **Paragraph Detect** | Yes (via option) | No | No (focuses on lines) |
| **Handwritten** | Better support | Limited | Limited |
| **Small Text** | Moderate | Good | Good |
| **When to Use** | General documents, handwritten text | Asian languages, speed-critical tasks | Highest accuracy needed, line-level |

## Basic OCR Usage

Apply OCR directly to a page or region:

```python
from natural_pdf import PDF

# Assume 'page' is a Page object from a PDF
page = pdf.pages[0]

# Apply OCR using the default engine (or specify one)
ocr_elements = page.apply_ocr(languages=['en'])

# Extract text (will use the results from apply_ocr if run previously)
text = page.extract_text()
print(text)
```

## Configuring OCR

Specify the engine and basic options directly:

```python
# Use PaddleOCR for Chinese and English
ocr_elements = page.apply_ocr(engine='paddle', languages=['zh-cn', 'en'])

# Use EasyOCR with a lower confidence threshold
ocr_elements = page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.3)
```

For advanced, engine-specific settings, use the Options classes:

```python
from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions

# --- Configure PaddleOCR ---
paddle_opts = PaddleOCROptions(
    languages=['en', 'zh-cn'],
    use_gpu=True,         # Explicitly enable GPU if available
    use_angle_cls=False,  # Disable text direction classification (if text is upright)
    det_db_thresh=0.25,   # Lower detection threshold (more boxes, potentially noisy)
    rec_batch_num=16      # Increase recognition batch size for potential speedup on GPU
    # rec_char_dict_path='/path/to/custom_dict.txt'  # Optional: Path to a custom character dictionary
    # See PaddleOCROptions documentation or source code for all parameters
)
ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)

# --- Configure EasyOCR ---
easy_opts = EasyOCROptions(
    languages=['en', 'fr'],
    gpu=True,            # Explicitly enable GPU if available
    paragraph=True,      # Group results into paragraphs (if structure is clear)
    detail=1,            # Ensure bounding boxes are returned (required)
    text_threshold=0.6,  # Confidence threshold for text detection
    link_threshold=0.4,  # Standard EasyOCR parameter
    low_text=0.4,        # Standard EasyOCR parameter
    batch_size=8         # Processing batch size (adjust based on memory)
    # See EasyOCROptions documentation or source code for all parameters
)
ocr_elements = page.apply_ocr(engine='easyocr', options=easy_opts)

# --- Configure Surya OCR ---
# Surya focuses on line detection and recognition
surya_opts = SuryaOCROptions(
    languages=['en', 'de'],  # Specify languages for recognition
    # device='cuda',         # Use GPU ('cuda') or CPU ('cpu') <-- set via env var TORCH_DEVICE
    min_confidence=0.4       # Adjust minimum confidence for results
    # Core Surya options like device, batch size, and thresholds are typically
    # set via environment variables (see note below).
)
ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
```
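
As the Surya comments above note, core Surya settings (device, batch sizes, thresholds) come from environment variables rather than from `SuryaOCROptions`. A minimal sketch of that pattern, using the `TORCH_DEVICE` variable mentioned above (the value shown is illustrative):

```python
import os

# Set Surya's device via the environment *before* running apply_ocr.
# "cpu" is the safe default; use "cuda" if a GPU is available.
os.environ["TORCH_DEVICE"] = "cpu"

ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
```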

## Applying OCR Directly

The `page.apply_ocr(...)` and `region.apply_ocr(...)` methods are the primary way to run OCR:

```python
# Apply OCR to a page and get the OCR elements
ocr_elements = page.apply_ocr(engine='easyocr')
print(f"Found {len(ocr_elements)} text elements via OCR")

# Apply OCR to a specific region
title = page.find('text:contains("Title")')
content_region = title.below(height=300)
region_ocr_elements = content_region.apply_ocr(engine='paddle', languages=['en'])
```

## OCR Engines

Choose the engine best suited for your document and language requirements using the `engine` parameter in `apply_ocr`.
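
If you're not sure which engine fits a given document, a rough way to decide is to run each engine you have installed on a sample page and compare the results. A sketch, assuming the corresponding extras are installed:

```python
# Rough comparison: run each installed engine on the same page and
# see how many text elements it returns.
for engine in ['easyocr', 'paddle', 'surya']:
    try:
        elements = page.apply_ocr(engine=engine, languages=['en'])
        print(f"{engine}: {len(elements)} elements")
    except Exception as exc:  # engine not installed or failed to load
        print(f"{engine}: unavailable ({exc})")
```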

## Finding and Working with OCR Text

After applying OCR, work with the text just like regular text:

```python
# Find all OCR text elements
ocr_text = page.find_all('text[source=ocr]')

# Find high-confidence OCR text
high_conf = page.find_all('text[source=ocr][confidence>=0.8]')

# Extract text only from OCR elements
ocr_text_content = page.find_all('text[source=ocr]').extract_text()

# Filter OCR text by content
names = page.find_all('text[source=ocr]:contains("Smith")', case=False)
```

## Visualizing OCR Results

Visualize OCR results to help debug issues:

```python
# Apply OCR
ocr_elements = page.apply_ocr()

# Highlight all OCR elements, colored by confidence
for element in ocr_elements:
    if element.confidence >= 0.8:
        color = "green"   # High confidence
    elif element.confidence >= 0.5:
        color = "yellow"  # Medium confidence
    else:
        color = "red"     # Low confidence

    element.highlight(color=color, label=f"OCR ({element.confidence:.2f})")

# Get the visualization as an image
image = page.to_image(labels=True)
# Just return the image in a Jupyter cell
image

# Highlight only high-confidence elements
high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
high_conf.highlight(color="green", label="High Confidence OCR")
```

## Detect + LLM OCR

Sometimes you have difficult content where a local model should only detect where the text is, and each detected region is then sent to an LLM for recognition. You can do this with Natural PDF:

```python
from natural_pdf import PDF
from natural_pdf.ocr.utils import direct_ocr_llm
import openai

pdf = PDF("needs-ocr.pdf")
page = pdf.pages[0]

# Detect text regions only (no recognition)
page.apply_ocr('paddle', resolution=120, detect_only=True)

# Build the framework
client = openai.OpenAI(base_url="https://api.anthropic.com/v1/", api_key='sk-XXXXX')
prompt = """OCR this image. Return only the exact text from the image. Include misspellings,
punctuation, etc. Do not surround it with quotation marks. Do not include translations or comments.
The text is from a Greek spreadsheet, so most likely content is Modern Greek or numeric."""

# This returns the cleaned-up text for a single region
def correct(region):
    return direct_ocr_llm(region, client, prompt=prompt, resolution=300, model="claude-3-5-haiku-20241022")

# Run 'correct' on each detected text element
page.correct_ocr(correct)

# You're done!
```

## Debugging OCR

```python
from natural_pdf.utils.packaging import create_correction_task_package

create_correction_task_package(pdf, "original.zip", overwrite=True)
```

This will at *some point* be made official, but for now you can look at `templates/spa` to see the correction package.

## Next Steps

With OCR capabilities, you can explore:

- [Layout Analysis](../layout-analysis/index.ipynb) for automatically detecting document structure
- [Document QA](../document-qa/index.ipynb) for asking questions about your documents
- [Visual Debugging](../visual-debugging/index.ipynb) for visualizing OCR results
docs/pdf-navigation/index.ipynb
DELETED
@@ -1,314 +0,0 @@

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bba1860e",
   "metadata": {},
   "source": [
    "# PDF Navigation\n",
    "\n",
    "This guide covers the basics of working with PDFs in Natural PDF - opening documents, accessing pages, and navigating through content.\n",
    "\n",
    "## Opening a PDF\n",
    "\n",
    "The main entry point to Natural PDF is the `PDF` class:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "56d12ab5",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:38.434157Z",
     "iopub.status.busy": "2025-04-03T14:50:38.433170Z",
     "iopub.status.idle": "2025-04-03T14:50:49.768101Z",
     "shell.execute_reply": "2025-04-03T14:50:49.767384Z"
    }
   },
   "outputs": [],
   "source": [
    "from natural_pdf import PDF\n",
    "\n",
    "# Open a PDF file\n",
    "pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42001.pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c425482a",
   "metadata": {},
   "source": [
    "## Accessing Pages\n",
    "\n",
    "Once you have a PDF object, you can access its pages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a3405aa9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:49.770604Z",
     "iopub.status.busy": "2025-04-03T14:50:49.770419Z",
     "iopub.status.idle": "2025-04-03T14:50:50.700808Z",
     "shell.execute_reply": "2025-04-03T14:50:50.699634Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This PDF has 153 pages\n",
      "Page 1 has 985 characters\n",
      "Page 2 has 778 characters\n",
      "Page 3 has 522 characters\n",
      "Page 4 has 984 characters\n",
      "Page 5 has 778 characters\n",
      "Page 6 has 523 characters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Page 7 has 982 characters\n",
      "Page 8 has 772 characters\n",
      "Page 9 has 522 characters\n",
      "Page 10 has 1008 characters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Page 11 has 796 characters\n",
      "Page 12 has 532 characters\n",
      "Page 13 has 986 characters\n",
      "Page 14 has 780 characters\n",
      "Page 15 has 523 characters\n",
      "Page 16 has 990 characters\n",
      "Page 17 has 782 characters\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Page 18 has 520 characters\n",
      "Page 19 has 1006 characters\n",
      "Page 20 has 795 characters\n"
     ]
    }
   ],
   "source": [
    "# Get the total number of pages\n",
    "num_pages = len(pdf)\n",
    "print(f\"This PDF has {num_pages} pages\")\n",
    "\n",
    "# Get a specific page (0-indexed)\n",
    "first_page = pdf.pages[0]\n",
    "last_page = pdf.pages[-1]\n",
    "\n",
    "# Iterate through the first 20 pages\n",
    "for page in pdf.pages[:20]:\n",
    "    print(f\"Page {page.number} has {len(page.extract_text())} characters\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2eca7327",
   "metadata": {},
   "source": [
    "## Page Properties\n",
    "\n",
    "Each `Page` object has useful properties:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "348f28d7",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:50.713325Z",
     "iopub.status.busy": "2025-04-03T14:50:50.711638Z",
     "iopub.status.idle": "2025-04-03T14:50:50.738737Z",
     "shell.execute_reply": "2025-04-03T14:50:50.726839Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "612 792\n",
      "20\n",
      "19\n"
     ]
    }
   ],
   "source": [
    "# Page dimensions in points (1/72 inch)\n",
    "print(page.width, page.height)\n",
    "\n",
    "# Page number (1-indexed as shown in PDF viewers)\n",
    "print(page.number)\n",
    "\n",
    "# Page index (0-indexed position in the PDF)\n",
    "print(page.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7cf1839",
   "metadata": {},
   "source": [
    "## Working Across Pages\n",
    "\n",
    "Natural PDF makes it easy to work with content across multiple pages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "71a8f1ec",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:50.765495Z",
     "iopub.status.busy": "2025-04-03T14:50:50.764444Z",
     "iopub.status.idle": "2025-04-03T14:50:57.735494Z",
     "shell.execute_reply": "2025-04-03T14:50:57.726489Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<natural_pdf.core.pdf.PDF at 0x1045224d0>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Extract text from all pages\n",
    "all_text = pdf.extract_text()\n",
    "\n",
    "# Find elements across all pages\n",
    "all_headings = pdf.find_all('text[size>=14]:bold')\n",
    "\n",
    "# Add exclusion zones to all pages (like headers/footers)\n",
    "pdf.add_exclusion(\n",
    "    lambda page: page.find('text:contains(\"CONFIDENTIAL\")').above() if page.find('text:contains(\"CONFIDENTIAL\")') else None,\n",
    "    label=\"header\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e18051a4",
   "metadata": {},
   "source": [
    "## The Page Collection\n",
    "\n",
    "The `pdf.pages` object is a `PageCollection` that allows batch operations on pages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e5f1c662",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:57.752240Z",
     "iopub.status.busy": "2025-04-03T14:50:57.751868Z",
     "iopub.status.idle": "2025-04-03T14:50:57.770738Z",
     "shell.execute_reply": "2025-04-03T14:50:57.759415Z"
    }
   },
   "outputs": [],
   "source": [
    "# Extract text from specific pages\n",
    "text = pdf.pages[2:5].extract_text()\n",
    "\n",
    "# Find elements across specific pages\n",
    "elements = pdf.pages[2:5].find_all('text:contains(\"Annual Report\")')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9713e392",
   "metadata": {},
   "source": [
    "## Document Sections Across Pages\n",
    "\n",
    "You can extract sections that span across multiple pages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d5b89a2b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-03T14:50:57.782621Z",
     "iopub.status.busy": "2025-04-03T14:50:57.781776Z",
     "iopub.status.idle": "2025-04-03T14:50:57.811508Z",
     "shell.execute_reply": "2025-04-03T14:50:57.805310Z"
    }
   },
   "outputs": [],
   "source": [
    "# Get sections with headings as section starts\n",
    "sections = pdf.pages.get_sections(\n",
    "    start_elements='text[size>=14]:bold',\n",
    "    new_section_on_page_break=False\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f51594ce",
   "metadata": {},
   "source": [
    "## Next Steps\n",
    "\n",
    "Now that you know how to navigate PDFs, you can:\n",
    "\n",
    "- [Find elements using selectors](../element-selection/index.ipynb)\n",
    "- [Extract text from your documents](../text-extraction/index.ipynb)\n",
    "- [Work with specific regions](../regions/index.ipynb)"
   ]
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "main_language": "python",
   "notebook_metadata_filter": "-all",
   "text_representation": {
    "extension": ".md",
    "format_name": "markdown"
   }
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}