natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,149 +0,0 @@
|
|
1
|
-
# Finding Specific Elements
|
2
|
-
|
3
|
-
Extracting all the text is useful, but often you need specific pieces of information. `natural-pdf` lets you find elements using selectors, similar to CSS.
|
4
|
-
|
5
|
-
Let's find the "Site" and "Date" information from our `01-practice.pdf`:
|
6
|
-
|
7
|
-
```python
|
8
|
-
#%pip install "natural-pdf[all]"
|
9
|
-
```
|
10
|
-
|
11
|
-
|
12
|
-
```python
|
13
|
-
from natural_pdf import PDF
|
14
|
-
|
15
|
-
# Load a PDF
|
16
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
17
|
-
|
18
|
-
# Get the first page (index 0)
|
19
|
-
page = pdf.pages[0]
|
20
|
-
|
21
|
-
# Find the text element containing "Site:"
|
22
|
-
# The ':contains()' pseudo-class looks for text content.
|
23
|
-
site_label = page.find('text:contains("Site:")')
|
24
|
-
|
25
|
-
# Find the text element containing "Date:"
|
26
|
-
date_label = page.find('text:contains("Date:")')
|
27
|
-
|
28
|
-
# Visualize the found elements
|
29
|
-
site_label.highlight(color="red", label="Site Label")
|
30
|
-
date_label.highlight(color="blue", label="Date Label")
|
31
|
-
|
32
|
-
# Access the text content directly
|
33
|
-
{
|
34
|
-
"Site Label": site_label.text,
|
35
|
-
"Date Label": date_label.text
|
36
|
-
}
|
37
|
-
|
38
|
-
# Display the page image to see the visualized elements
|
39
|
-
page.to_image()
|
40
|
-
```
|
41
|
-
|
42
|
-
## Finding Elements by Color
|
43
|
-
|
44
|
-
You can find elements based on their color:
|
45
|
-
|
46
|
-
```python
|
47
|
-
# Find text elements that are red
|
48
|
-
red_text = page.find('text[color~=red]')
|
49
|
-
red_text.highlight(color="red", label="Red Text")
|
50
|
-
print(f"Found red text: {red_text.text}")
|
51
|
-
|
52
|
-
# Find elements with specific RGB colors
|
53
|
-
blue_text = page.find('text[color=rgb(0,0,255)]')
|
54
|
-
```
|
55
|
-
|
56
|
-
## Finding Lines and Shapes
|
57
|
-
|
58
|
-
Find lines and rectangles based on their properties:
|
59
|
-
|
60
|
-
```python
|
61
|
-
# Find horizontal lines
|
62
|
-
horizontal_lines = page.find_all('line[horizontal]')
|
63
|
-
|
64
|
-
# Find thick lines (width >= 2)
|
65
|
-
thick_lines = page.find_all('line[width>=2]')
|
66
|
-
|
67
|
-
# Find rectangles
|
68
|
-
rectangles = page.find_all('rect')
|
69
|
-
|
70
|
-
# Visualize what we found
|
71
|
-
page.clear_highlights()
|
72
|
-
horizontal_lines.highlight(color="blue", label="Horizontal Lines")
|
73
|
-
thick_lines.highlight(color="red", label="Thick Lines")
|
74
|
-
rectangles.highlight(color="green", label="Rectangles")
|
75
|
-
page.to_image()
|
76
|
-
```
|
77
|
-
|
78
|
-
## Finding Elements by Font Properties
|
79
|
-
|
80
|
-
```python
|
81
|
-
# Find text with specific font properties
|
82
|
-
bold_text = page.find_all('text[style~=bold]')
|
83
|
-
large_text = page.find_all('text[size>=12]')
|
84
|
-
|
85
|
-
# Find text with specific font names
|
86
|
-
helvetica_text = page.find_all('text[fontname~=Helvetica]')
|
87
|
-
```
|
88
|
-
|
89
|
-
## Spatial Navigation
|
90
|
-
|
91
|
-
You can find elements based on their position relative to other elements:
|
92
|
-
|
93
|
-
```python
|
94
|
-
# Find text above a specific element
|
95
|
-
above_text = page.find('line[width=2]').above().extract_text()
|
96
|
-
|
97
|
-
# Find text below a specific element
|
98
|
-
below_text = page.find('text:contains("Summary")').below().extract_text()
|
99
|
-
|
100
|
-
# Find text to the right of a specific element
|
101
|
-
nearby_text = page.find('text:contains("Site")').right(width=200).extract_text()
|
102
|
-
```
|
103
|
-
|
104
|
-
## Combining Selectors
|
105
|
-
|
106
|
-
You can combine multiple conditions to find exactly what you need:
|
107
|
-
|
108
|
-
```python
|
109
|
-
# Find large, bold text that contains specific words
|
110
|
-
important_text = page.find_all('text[size>=12][style~=bold]:contains("Critical")')
|
111
|
-
|
112
|
-
# Find red text inside a rectangle
|
113
|
-
highlighted_text = page.find('rect').find_all('text[color~=red]')
|
114
|
-
```
|
115
|
-
|
116
|
-
<div class="admonition note">
|
117
|
-
<p class="admonition-title">Handling Missing Elements</p>
|
118
|
-
|
119
|
-
These examples assume the elements exist in the PDF. In real-world documents, `page.find()` returns `None` when nothing matches, so production code should check the result before using it:
|
120
|
-
|
121
|
-
```py
|
122
|
-
site_label = page.find('text:contains("Site:")')
|
123
|
-
if site_label:
|
124
|
-
# Found it! Proceed...
|
125
|
-
site_label.highlight(color="red", label="Site Label")
|
126
|
-
site_label.text # Display or use the text
|
127
|
-
else:
|
128
|
-
# Didn't find it, handle appropriately...
|
129
|
-
"Warning: 'Site:' label not found."
|
130
|
-
```
|
131
|
-
</div>
|
132
|
-
|
133
|
-
<div class="admonition tip">
|
134
|
-
<p class="admonition-title">Visual Debugging</p>
|
135
|
-
|
136
|
-
When working with complex selectors, it's helpful to visualize what you're finding:
|
137
|
-
|
138
|
-
```py
|
139
|
-
# Clear any existing highlights
|
140
|
-
page.clear_highlights()
|
141
|
-
|
142
|
-
# Find and highlight elements
|
143
|
-
elements = page.find_all('text[color~=red]')
|
144
|
-
elements.highlight(color="red", label="Red Text")
|
145
|
-
|
146
|
-
# Display the page to see what was found
|
147
|
-
page.to_image(width=800)
|
148
|
-
```
|
149
|
-
</div>
|