natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,95 @@
|
|
1
|
+
# Loading and Basic Text Extraction
|
2
|
+
|
3
|
+
```python
|
4
|
+
#%pip install "natural-pdf[all]"
|
5
|
+
```
|
6
|
+
|
7
|
+
In this tutorial, we'll learn how to:
|
8
|
+
|
9
|
+
1. Load a PDF document
|
10
|
+
2. Extract text from pages
|
11
|
+
3. Extract specific elements
|
12
|
+
|
13
|
+
## Loading a PDF
|
14
|
+
|
15
|
+
Let's start by loading a PDF file:
|
16
|
+
|
17
|
+
```python
|
18
|
+
from natural_pdf import PDF
|
19
|
+
import os
|
20
|
+
|
21
|
+
# Load a PDF file
|
22
|
+
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
23
|
+
|
24
|
+
# Basic info about the document
|
25
|
+
{
|
26
|
+
"Filename": os.path.basename(pdf.path),
|
27
|
+
"Pages": len(pdf.pages),
|
28
|
+
"Title": pdf.metadata.get("Title", "N/A"),
|
29
|
+
"Author": pdf.metadata.get("Author", "N/A")
|
30
|
+
}
|
31
|
+
```
|
32
|
+
|
33
|
+
## Extracting Text
|
34
|
+
|
35
|
+
Now that we have loaded the PDF, let's extract the text from the first page:
|
36
|
+
|
37
|
+
```python
|
38
|
+
# Get the first page
|
39
|
+
page = pdf.pages[0]
|
40
|
+
|
41
|
+
# Extract text from the page
|
42
|
+
text = page.extract_text()
|
43
|
+
|
44
|
+
# Show the first 200 characters of the text
|
45
|
+
print(text[:200])
|
46
|
+
```
|
47
|
+
|
48
|
+
## Finding and Extracting Specific Elements
|
49
|
+
|
50
|
+
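We can find specific elements using spatial queries and text content. For example, the `text:contains(...)` selector below matches text elements that contain a given string: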
|
51
|
+
|
52
|
+
```python
|
53
|
+
# Find text elements containing specific words
|
54
|
+
elements = page.find_all('text:contains("Inadequate")')
|
55
|
+
|
56
|
+
# Show these elements on the page
|
57
|
+
page.clear_highlights()
|
58
|
+
elements.highlight(color="red", label="Contains 'Inadequate'")
|
59
|
+
|
60
|
+
# Display the page to see them
|
61
|
+
page.to_image(width=700)
|
62
|
+
```
|
63
|
+
|
64
|
+
## Working with Layout Regions
|
65
|
+
|
66
|
+
We can analyze the layout of the page to identify different regions:
|
67
|
+
|
68
|
+
```python
|
69
|
+
# Analyze the page layout
|
70
|
+
page.analyze_layout(engine='yolo')
|
71
|
+
|
72
|
+
# Find and highlight all detected regions
|
73
|
+
page.clear_highlights()
|
74
|
+
page.find_all('region').highlight(group_by='type')
|
75
|
+
|
76
|
+
# Display the page to see the regions
|
77
|
+
page.to_image(width=900)
|
78
|
+
```
|
79
|
+
|
80
|
+
## Working with Multiple Pages
|
81
|
+
|
82
|
+
You can also work with multiple pages:
|
83
|
+
|
84
|
+
```python
|
85
|
+
# Process all pages
|
86
|
+
for page in pdf.pages:
|
87
|
+
page_text = page.extract_text()
|
88
|
+
print(f"Page {page.number}", page_text[:100]) # First 100 chars of each page
|
89
|
+
```
|
90
|
+
|
91
|
+
This tutorial covered the basics of loading PDFs and extracting text. In the following tutorials, we'll explore more advanced features such as searching for specific elements, extracting structured content, and working with tables.
|
92
|
+
|
93
|
+
```bash
|
94
|
+
pip install "natural-pdf[all]"
|
95
|
+
```
|