natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/regions/index.md
DELETED
@@ -1,294 +0,0 @@
|
|
1
|
-
# Working with Regions
|
2
|
-
|
3
|
-
Regions are rectangular areas on a page that define boundaries for operations like text extraction, element finding, or visualization. They're one of Natural PDF's most powerful features for working with specific parts of a document.
|
4
|
-
|
5
|
-
## Setup
|
6
|
-
|
7
|
-
Let's set up a PDF to experiment with regions.
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load the PDF
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
14
|
-
|
15
|
-
# Get the first page
|
16
|
-
page = pdf.pages[0]
|
17
|
-
|
18
|
-
# Display the page
|
19
|
-
page.show(width=700)
|
20
|
-
```
|
21
|
-
|
22
|
-
## Creating Regions
|
23
|
-
|
24
|
-
There are several ways to create regions in Natural PDF.
|
25
|
-
|
26
|
-
### Using `create_region()` with Coordinates
|
27
|
-
|
28
|
-
This is the most direct method - provide the coordinates directly.
|
29
|
-
|
30
|
-
```python
|
31
|
-
# Create a region by specifying (x0, top, x1, bottom) coordinates
|
32
|
-
# Let's create a region in the middle of the page
|
33
|
-
mid_region = page.create_region(
|
34
|
-
x0=100, # Left edge
|
35
|
-
top=200, # Top edge
|
36
|
-
x1=500, # Right edge
|
37
|
-
bottom=400 # Bottom edge
|
38
|
-
)
|
39
|
-
|
40
|
-
# Highlight the region to see it
|
41
|
-
mid_region.highlight(color="blue").show()
|
42
|
-
```
|
43
|
-
|
44
|
-
### Using Element Methods: `above()`, `below()`, `left()`, `right()`
|
45
|
-
|
46
|
-
You can create regions relative to existing elements.
|
47
|
-
|
48
|
-
```python
|
49
|
-
# Find a heading-like element
|
50
|
-
heading = page.find('text[size>=12]:bold')
|
51
|
-
|
52
|
-
# Create a region below this heading element
|
53
|
-
if heading:
|
54
|
-
region_below = heading.below()
|
55
|
-
|
56
|
-
# Highlight the heading and the region below it
|
57
|
-
heading.highlight(color="red")
|
58
|
-
region_below.highlight(color="blue")
|
59
|
-
page.show()
|
60
|
-
```
|
61
|
-
|
62
|
-
```python
|
63
|
-
# Create a region with height limit
|
64
|
-
if heading:
|
65
|
-
# Only include 100 points below the heading
|
66
|
-
small_region_below = heading.below(height=100)
|
67
|
-
|
68
|
-
page.clear_highlights()
|
69
|
-
heading.highlight(color="red")
|
70
|
-
small_region_below.highlight(color="green")
|
71
|
-
page.show()
|
72
|
-
```
|
73
|
-
|
74
|
-
```python
|
75
|
-
# Find a line or other element to create a region above
|
76
|
-
line = page.find('line')
|
77
|
-
if line:
|
78
|
-
# Create a region above the line
|
79
|
-
region_above = line.above()
|
80
|
-
|
81
|
-
page.clear_highlights()
|
82
|
-
line.highlight(color="black")
|
83
|
-
region_above.highlight(color="purple")
|
84
|
-
page.show()
|
85
|
-
```
|
86
|
-
|
87
|
-
### Creating a Region Between Elements with the `until` Argument
|
88
|
-
|
89
|
-
```python
|
90
|
-
# Find two elements to use as boundaries
|
91
|
-
first_heading = page.find('text[size>=11]:bold')
|
92
|
-
next_heading = first_heading.next('text[size>=11]:bold') if first_heading else None
|
93
|
-
|
94
|
-
if first_heading and next_heading:
|
95
|
-
# Create a region from the first heading until the next heading
|
96
|
-
section = first_heading.below(until=next_heading, include_endpoint=False)
|
97
|
-
|
98
|
-
# Highlight both elements and the region between them
|
99
|
-
page.clear_highlights()
|
100
|
-
first_heading.highlight(color="red")
|
101
|
-
next_heading.highlight(color="red")
|
102
|
-
section.highlight(color="yellow")
|
103
|
-
page.show()
|
104
|
-
```
|
105
|
-
|
106
|
-
## Using Regions
|
107
|
-
|
108
|
-
Once you have a region, here's what you can do with it.
|
109
|
-
|
110
|
-
### Extract Text from a Region
|
111
|
-
|
112
|
-
```python
|
113
|
-
# Find a region to work with (e.g., from a title to the next horizontal line)
|
114
|
-
title = page.find('text:contains("Site")') # Adjust if needed
|
115
|
-
if title:
|
116
|
-
# Create a region from the title down to the next horizontal line
|
117
|
-
content_region = title.below(until='line:horizontal', include_endpoint=False)
|
118
|
-
|
119
|
-
# Extract text from just this region
|
120
|
-
region_text = content_region.extract_text()
|
121
|
-
|
122
|
-
# Show the region and the extracted text
|
123
|
-
page.clear_highlights()
|
124
|
-
content_region.highlight(color="green")
|
125
|
-
page.show()
|
126
|
-
|
127
|
-
# Displaying the text (first 300 chars if long)
|
128
|
-
print(region_text[:300] + "..." if len(region_text) > 300 else region_text)
|
129
|
-
```
|
130
|
-
|
131
|
-
### Find Elements Within a Region
|
132
|
-
|
133
|
-
You can use a region as a "filter" to only find elements within its boundaries.
|
134
|
-
|
135
|
-
```python
|
136
|
-
# Create a region in an interesting part of the page
|
137
|
-
test_region = page.create_region(
|
138
|
-
x0=page.width * 0.1,
|
139
|
-
top=page.height * 0.25,
|
140
|
-
x1=page.width * 0.9,
|
141
|
-
bottom=page.height * 0.75
|
142
|
-
)
|
143
|
-
|
144
|
-
# Find all text elements ONLY within this region
|
145
|
-
text_in_region = test_region.find_all('text')
|
146
|
-
|
147
|
-
# Display result
|
148
|
-
page.clear_highlights()
|
149
|
-
test_region.highlight(color="blue")
|
150
|
-
text_in_region.highlight(color="red")
|
151
|
-
page.show()
|
152
|
-
|
153
|
-
len(text_in_region) # Number of text elements found in region
|
154
|
-
```
|
155
|
-
|
156
|
-
### Generate an Image of a Region
|
157
|
-
|
158
|
-
```python
|
159
|
-
# Find a specific region to capture
|
160
|
-
# (Could be a table, figure, or any significant area)
|
161
|
-
region_for_image = page.create_region(
|
162
|
-
x0=100,
|
163
|
-
top=150,
|
164
|
-
x1=page.width - 100,
|
165
|
-
bottom=300
|
166
|
-
)
|
167
|
-
|
168
|
-
# Generate an image of just this region
|
169
|
-
region_for_image.to_image(crop_only=True) # Shows just the region
|
170
|
-
```
|
171
|
-
|
172
|
-
### Adjust and Expand Regions
|
173
|
-
|
174
|
-
```python
|
175
|
-
# Take an existing region and expand it
|
176
|
-
region_a = page.create_region(200, 200, 400, 400)
|
177
|
-
|
178
|
-
# Expand by a certain number of points in each direction
|
179
|
-
expanded = region_a.expand(left=20, right=20, top=20, bottom=20)
|
180
|
-
|
181
|
-
# Visualize original and expanded regions
|
182
|
-
page.clear_highlights()
|
183
|
-
region_a.highlight(color="blue", label="Original")
|
184
|
-
expanded.highlight(color="red", label="Expanded")
|
185
|
-
page.to_image()
|
186
|
-
```
|
187
|
-
|
188
|
-
## Using Exclusion Zones with Regions
|
189
|
-
|
190
|
-
Exclusion zones are regions that you want to ignore during operations like text extraction.
|
191
|
-
|
192
|
-
```python
|
193
|
-
# Create a region for the whole page
|
194
|
-
full_page_region = page.create_region(0, 0, page.width, page.height)
|
195
|
-
|
196
|
-
# Extract text without exclusions as baseline
|
197
|
-
full_text = full_page_region.extract_text()
|
198
|
-
print(f"Full page text length: {len(full_text)} characters")
|
199
|
-
```
|
200
|
-
|
201
|
-
```python
|
202
|
-
# Define an area we want to exclude (like a header)
|
203
|
-
# Let's exclude the top 10% of the page
|
204
|
-
header_zone = page.create_region(0, 0, page.width, page.height * 0.1)
|
205
|
-
|
206
|
-
# Add this as an exclusion for the page
|
207
|
-
page.add_exclusion(header_zone)
|
208
|
-
|
209
|
-
# Visualize the exclusion
|
210
|
-
page.clear_highlights()
|
211
|
-
header_zone.highlight(color="red", label="Excluded")
|
212
|
-
page.show()
|
213
|
-
```
|
214
|
-
|
215
|
-
```python
|
216
|
-
# Now extract text again - the header should be excluded
|
217
|
-
text_with_exclusion = full_page_region.extract_text() # Uses apply_exclusions=True by default
|
218
|
-
|
219
|
-
# Compare text lengths
|
220
|
-
print(f"Original text: {len(full_text)} chars\nText with exclusion: {len(text_with_exclusion)} chars")
|
221
|
-
print(f"Difference: {len(full_text) - len(text_with_exclusion)} chars excluded")
|
222
|
-
```
|
223
|
-
|
224
|
-
```python
|
225
|
-
# When done with this page, clear exclusions
|
226
|
-
page.clear_exclusions()
|
227
|
-
```
|
228
|
-
|
229
|
-
## Document-Level Exclusions
|
230
|
-
|
231
|
-
PDF-level exclusions apply to all pages and use functions to adapt to each page.
|
232
|
-
|
233
|
-
```python
|
234
|
-
# Define a PDF-level exclusion for headers
|
235
|
-
# This will exclude the top 30% of every page
|
236
|
-
pdf.add_exclusion(
|
237
|
-
lambda p: p.create_region(0, 0, p.width, p.height * 0.3),
|
238
|
-
label="Header zone"
|
239
|
-
)
|
240
|
-
|
241
|
-
# Define a PDF-level exclusion for footers
|
242
|
-
# This will exclude the bottom 20% of every page
|
243
|
-
pdf.add_exclusion(
|
244
|
-
lambda p: p.create_region(0, p.height * 0.8, p.width, p.height),
|
245
|
-
label="Footer zone"
|
246
|
-
)
|
247
|
-
|
248
|
-
# PDF-level exclusions are used whenever you extract text
|
249
|
-
# Let's try on the first three pages
|
250
|
-
for page in pdf.pages[:3]:
|
251
|
-
text = page.extract_text()
|
252
|
-
text_original = page.extract_text(use_exclusions=False)
|
253
|
-
print(f"Page {page.number} – Before: {len(text_original)} After: {len(text)}")
|
254
|
-
```
|
255
|
-
|
256
|
-
```python
|
257
|
-
# Clear PDF-level exclusions when done
|
258
|
-
pdf.clear_exclusions()
|
259
|
-
print("Cleared all PDF-level exclusions")
|
260
|
-
```
|
261
|
-
|
262
|
-
## Working with Layout Analysis Regions
|
263
|
-
|
264
|
-
When you run layout analysis, the detected regions (tables, titles, etc.) are also Region objects.
|
265
|
-
|
266
|
-
```python
|
267
|
-
# First, run layout analysis to detect regions
|
268
|
-
page.analyze_layout() # Uses 'yolo' engine by default
|
269
|
-
|
270
|
-
# Find all detected regions
|
271
|
-
detected_regions = page.find_all('region')
|
272
|
-
print(f"Found {len(detected_regions)} layout regions")
|
273
|
-
```
|
274
|
-
|
275
|
-
```python
|
276
|
-
# Highlight all detected regions by type
|
277
|
-
detected_regions.highlight(group_by='region_type').show()
|
278
|
-
```
|
279
|
-
|
280
|
-
```python
|
281
|
-
# Extract text from a specific region type (e.g., title)
|
282
|
-
title_regions = page.find_all('region[type=title]')
|
283
|
-
if title_regions:
|
284
|
-
titles_text = title_regions.extract_text()
|
285
|
-
print(f"Title text: {titles_text}")
|
286
|
-
```
|
287
|
-
|
288
|
-
## Next Steps
|
289
|
-
|
290
|
-
Now that you understand regions, you can:
|
291
|
-
|
292
|
-
- [Extract tables](../tables/index.ipynb) from table regions
|
293
|
-
- [Ask questions](../document-qa/index.ipynb) about specific regions
|
294
|
-
- [Exclude content](../text-extraction/index.md#filtering-out-headers-and-footers) from extraction
|