natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,151 +0,0 @@
|
|
1
|
-
# Working with Regions
|
2
|
-
|
3
|
-
Regions are rectangular areas on a page that let you focus on specific parts of a document. They're perfect for extracting text from defined areas, finding elements within certain boundaries, and working with document sections.
|
4
|
-
|
5
|
-
```python
|
6
|
-
#%pip install "natural-pdf[all]"
|
7
|
-
```
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Load a PDF
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
|
14
|
-
page = pdf.pages[0]
|
15
|
-
|
16
|
-
# Create a region in the top portion of the page
|
17
|
-
top_region = page.create_region(
|
18
|
-
50, # x0 (left)
|
19
|
-
50, # y0 (top)
|
20
|
-
page.width - 50, # x1 (right)
|
21
|
-
200 # y1 (bottom)
|
22
|
-
)
|
23
|
-
|
24
|
-
# Visualize the region
|
25
|
-
top_region.show(color="blue", label="Top Region")
|
26
|
-
|
27
|
-
# Extract text from this region
|
28
|
-
top_region.extract_text()
|
29
|
-
```
|
30
|
-
|
31
|
-
## Creating Regions from Elements
|
32
|
-
|
33
|
-
```python
|
34
|
-
# Find an element to create regions around
|
35
|
-
title = page.find('text:contains("Jungle Health")')
|
36
|
-
|
37
|
-
# Create regions relative to this element
|
38
|
-
below_title = title.below(height=100)
|
39
|
-
right_of_title = title.right(width=200)
|
40
|
-
above_title = title.above(height=50)
|
41
|
-
|
42
|
-
# Visualize these regions
|
43
|
-
below_title.show(color="green", label="Below")
|
44
|
-
right_of_title.show(color="red", label="Right")
|
45
|
-
above_title.show(color="orange", label="Above")
|
46
|
-
|
47
|
-
# Extract text from the region below the title
|
48
|
-
below_title.extract_text()
|
49
|
-
```
|
50
|
-
|
51
|
-
## Finding Elements Within Regions
|
52
|
-
|
53
|
-
```python
|
54
|
-
# Create a region for a specific document section
|
55
|
-
form_region = page.create_region(50, 100, page.width - 50, 300)
|
56
|
-
|
57
|
-
# Find elements only within this region
|
58
|
-
labels = form_region.find_all('text:contains(":")')
|
59
|
-
|
60
|
-
# Visualize the region and the elements found
|
61
|
-
form_region.show(color=(0, 0, 1, 0.2), label="Form Region")
|
62
|
-
labels.show(color="purple", label="Labels")
|
63
|
-
|
64
|
-
# Count the elements found
|
65
|
-
len(labels)
|
66
|
-
```
|
67
|
-
|
68
|
-
## Expanding and Adjusting Regions
|
69
|
-
|
70
|
-
```python
|
71
|
-
# Find an element to work with
|
72
|
-
element = page.find('text:contains("Summary:")')
|
73
|
-
|
74
|
-
# Create a tight region around the element
|
75
|
-
tight_region = page.create_region(
|
76
|
-
element.x0, element.top,
|
77
|
-
element.x1, element.bottom
|
78
|
-
)
|
79
|
-
|
80
|
-
# Expand it to include surrounding content
|
81
|
-
expanded_region = tight_region.expand(
|
82
|
-
left=10, # Expand 10 points to the left
|
83
|
-
right=200, # Expand 200 points to the right
|
84
|
-
top=5, # Expand 5 points above
|
85
|
-
bottom=100 # Expand 100 points below
|
86
|
-
)
|
87
|
-
|
88
|
-
# Visualize both regions
|
89
|
-
tight_region.show(color="red", label="Original")
|
90
|
-
expanded_region.show(color="blue", label="Expanded")
|
91
|
-
|
92
|
-
# Extract the content from the expanded region
|
93
|
-
expanded_region.extract_text()
|
94
|
-
```
|
95
|
-
|
96
|
-
## Creating Bounded Regions
|
97
|
-
|
98
|
-
```python
|
99
|
-
# Find two elements to serve as boundaries
|
100
|
-
start_elem = page.find('text:contains("Summary:")')
|
101
|
-
end_elem = page.find('text:contains("Statute")')
|
102
|
-
|
103
|
-
# Create a region from start to end element
|
104
|
-
bounded_region = start_elem.until(end_elem)
|
105
|
-
|
106
|
-
# Visualize the bounded region
|
107
|
-
bounded_region.show(color="green", label="Bounded Region")
|
108
|
-
|
109
|
-
# Preview the text from this bounded region (truncated to the first 200 characters)
|
110
|
-
bounded_region.extract_text()[:200] + "..." if len(bounded_region.extract_text()) > 200 else bounded_region.extract_text()
|
111
|
-
```
|
112
|
-
|
113
|
-
## Working with Multiple Regions
|
114
|
-
|
115
|
-
```python
|
116
|
-
# Define multiple regions to extract different parts of the document
|
117
|
-
header_region = page.create_region(0, 0, page.width, 100)
|
118
|
-
main_region = page.create_region(100, 100, page.width - 100, page.height - 150)
|
119
|
-
footer_region = page.create_region(0, page.height - 50, page.width, page.height)
|
120
|
-
|
121
|
-
# Visualize all regions
|
122
|
-
header_region.show(color="blue", label="Header")
|
123
|
-
main_region.show(color="green", label="Main Content")
|
124
|
-
footer_region.show(color="red", label="Footer")
|
125
|
-
|
126
|
-
# Extract content from each region
|
127
|
-
document_parts = {
|
128
|
-
"header": header_region.extract_text(),
|
129
|
-
"main": main_region.extract_text()[:100] + "...",
|
130
|
-
"footer": footer_region.extract_text()
|
131
|
-
}
|
132
|
-
|
133
|
-
# Show what we extracted
|
134
|
-
document_parts
|
135
|
-
```
|
136
|
-
|
137
|
-
## Creating an Image of a Region
|
138
|
-
|
139
|
-
```python
|
140
|
-
# Find the table header, then take the region just below it
|
141
|
-
table_header = page.find('text:contains("Statute")')
|
142
|
-
table_region = table_header.below(height=100)
|
143
|
-
|
144
|
-
# Visualize the region
|
145
|
-
table_region.show(color="purple", label="Table Region")
|
146
|
-
|
147
|
-
# Create an image of just this region
|
148
|
-
table_region.to_image(resolution=150)
|
149
|
-
```
|
150
|
-
|
151
|
-
Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need.
|