natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -18,139 +18,97 @@ text_without_ocr = page.extract_text()
|
|
18
18
|
f"Without OCR: {len(text_without_ocr)} characters extracted"
|
19
19
|
```
|
20
20
|
|
21
|
-
##
|
21
|
+
## Applying OCR and Finding Elements
|
22
|
+
|
23
|
+
The core method is `page.apply_ocr()`. This runs the OCR process and adds `TextElement` objects to the page. You can specify the engine and languages.
|
24
|
+
|
25
|
+
**Note:** Re-applying OCR to the same page or region will automatically remove any previously generated OCR elements for that area before adding the new ones.
|
22
26
|
|
23
27
|
```python
|
24
|
-
#
|
25
|
-
page.apply_ocr()
|
28
|
+
# Apply OCR using the default engine (EasyOCR) for English
|
29
|
+
page.apply_ocr(languages=['en'])
|
26
30
|
|
27
|
-
# Select all text pieces
|
28
|
-
text_elements = page.find_all('text')
|
29
|
-
f"Found {len(text_elements)} text elements"
|
31
|
+
# Select all text pieces found by OCR
|
32
|
+
text_elements = page.find_all('text[source=ocr]')
|
33
|
+
print(f"Found {len(text_elements)} text elements using default OCR")
|
30
34
|
|
31
35
|
# Visualize the elements
|
32
36
|
text_elements.highlight()
|
33
|
-
```
|
34
37
|
|
35
|
-
|
36
|
-
|
37
|
-
```python
|
38
|
-
# Set OCR configuration for better results
|
39
|
-
page.ocr_config = {
|
40
|
-
'language': 'eng', # English
|
41
|
-
'dpi': 300, # Higher resolution
|
42
|
-
}
|
38
|
+
# Apply OCR using PaddleOCR for English and Chinese
|
39
|
+
page.apply_ocr(engine='paddle', languages=['en', 'ch_sim'])
|
43
40
|
|
44
|
-
#
|
45
|
-
|
41
|
+
# Apply OCR using SuryaOCR for English and German
|
42
|
+
page.apply_ocr(engine='surya', languages=['en', 'de'])
|
46
43
|
|
47
|
-
|
48
|
-
|
44
|
+
text_with_ocr = page.extract_text()
|
45
|
+
print(f"\nExtracted text after OCR:\n{text_with_ocr[:150]}...")
|
49
46
|
```
|
50
47
|
|
51
|
-
##
|
48
|
+
## Advanced OCR Configuration
|
52
49
|
|
53
|
-
|
54
|
-
# Configure for multiple languages
|
55
|
-
page.ocr_config = {
|
56
|
-
'language': 'eng+fra+deu', # English, French, German
|
57
|
-
'dpi': 300
|
58
|
-
}
|
59
|
-
|
60
|
-
# Extract text with multi-language support
|
61
|
-
multilang_text = page.extract_text()
|
62
|
-
multilang_text[:200]
|
63
|
-
```
|
64
|
-
|
65
|
-
## Extracting Tables from Scanned Documents
|
50
|
+
For more control, import and use the specific `Options` class for your chosen engine within the `apply_ocr` call.
|
66
51
|
|
67
52
|
```python
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
53
|
+
from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
|
54
|
+
|
55
|
+
# Re-apply OCR using EasyOCR with specific options
|
56
|
+
easy_opts = EasyOCROptions(
|
57
|
+
paragraph=False,
|
58
|
+
)
|
59
|
+
page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.1, options=easy_opts)
|
60
|
+
|
61
|
+
paddle_opts = PaddleOCROptions(
|
62
|
+
use_angle_cls=False,
|
63
|
+
det_db_thresh=0.3,
|
64
|
+
)
|
65
|
+
page.apply_ocr(engine='paddle', languages=['en'], options=paddle_opts)
|
66
|
+
|
67
|
+
surya_opts = SuryaOCROptions()
|
68
|
+
page.apply_ocr(engine='surya', languages=['en'], min_confidence=0.5, detect_only=True, options=surya_opts)
|
84
69
|
```
|
85
70
|
|
86
|
-
##
|
71
|
+
## Interactive OCR Correction / Debugging
|
87
72
|
|
88
|
-
|
89
|
-
# Look for potential form labels (containing a colon)
|
90
|
-
labels = page.find_all('text:contains(":")')
|
91
|
-
|
92
|
-
# Visualize the labels
|
93
|
-
labels.highlight()
|
94
|
-
|
95
|
-
# Extract form data by looking to the right of each label
|
96
|
-
form_data = {}
|
97
|
-
for label in labels:
|
98
|
-
# Clean the label text
|
99
|
-
field_name = label.text.strip().rstrip(':')
|
100
|
-
|
101
|
-
# Find the value to the right
|
102
|
-
value_element = label.right(width=200)
|
103
|
-
value = value_element.extract_text().strip()
|
104
|
-
|
105
|
-
# Add to our dictionary
|
106
|
-
form_data[field_name] = value
|
107
|
-
|
108
|
-
# Display the extracted data
|
109
|
-
form_data
|
110
|
-
```
|
73
|
+
If OCR results aren't perfect, you can use the bundled interactive web application (SPA) to review and correct them.
|
111
74
|
|
112
|
-
|
75
|
+
1. **Package the data:**
|
76
|
+
After running `apply_ocr` (or `apply_layout`), use `create_correction_task_package` to create a zip file containing the PDF images and detected elements.
|
113
77
|
|
114
|
-
```python
|
115
|
-
|
116
|
-
page.use_ocr = True
|
117
|
-
page.analyze_layout()
|
78
|
+
```python
|
79
|
+
from natural_pdf.utils.packaging import create_correction_task_package
|
118
80
|
|
119
|
-
|
120
|
-
headings = page.find_all('region[type=heading]')
|
121
|
-
paragraphs = page.find_all('region[type=paragraph]')
|
81
|
+
page.apply_ocr()
|
122
82
|
|
123
|
-
|
124
|
-
|
125
|
-
paragraphs.highlight(color="blue", label="Paragraphs")
|
83
|
+
create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
|
84
|
+
```
|
126
85
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
86
|
+
2. **Run the SPA:**
|
87
|
+
In your terminal, navigate to the SPA directory inside the installed `natural_pdf` package and start a simple web server on port 8000 (for example, `python -m http.server 8000`).
|
88
|
+
|
89
|
+
3. **Use the SPA:**
|
90
|
+
Open `http://localhost:8000` in your browser. Drag the `correction_package.zip` file onto the page to load the document. You can then click on text elements to correct the OCR results.
|
132
91
|
|
133
|
-
document_outline
|
134
|
-
```
|
135
92
|
|
136
93
|
## Working with Multiple Pages
|
137
94
|
|
95
|
+
Apply OCR or layout analysis to all pages using the `PDF` object.
|
96
|
+
|
138
97
|
```python
|
139
98
|
# Process all pages in the document
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
all_text
|
99
|
+
|
100
|
+
# Apply OCR to all pages (example using EasyOCR)
|
101
|
+
pdf.apply_ocr(engine='easyocr', languages=['en'])
|
102
|
+
print(f"Applied OCR to {len(pdf.pages)} pages.")
|
103
|
+
|
104
|
+
# Or apply layout analysis to all pages (example using Paddle)
|
105
|
+
# pdf.apply_layout(engine='paddle')
|
106
|
+
# print(f"Applied Layout Analysis to {len(pdf.pages)} pages.")
|
107
|
+
|
108
|
+
# Extract text from all pages (uses OCR results if available)
|
109
|
+
all_text_content = pdf.extract_text(page_separator="\n\n---\n\n")
|
110
|
+
|
111
|
+
print(f"\nCombined text from all pages:\n{all_text_content[:500]}...")
|
154
112
|
```
|
155
113
|
|
156
114
|
## Saving PDFs with Searchable Text
|
@@ -165,9 +123,13 @@ from natural_pdf import PDF
|
|
165
123
|
input_pdf_path = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf"
|
166
124
|
|
167
125
|
pdf = PDF(input_pdf_path)
|
168
|
-
|
126
|
+
# Apply OCR to all pages before saving
|
127
|
+
# Use desired engine and options
|
128
|
+
pdf.apply_ocr(engine='easyocr', languages=['en'])
|
169
129
|
|
170
130
|
pdf.save_searchable("needs-ocr-searchable.pdf")
|
131
|
+
|
132
|
+
print("Saved searchable PDF to needs-ocr-searchable.pdf")
|
171
133
|
```
|
172
134
|
|
173
135
|
This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).
|