natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,40 +1,44 @@
|
|
1
1
|
"""
|
2
2
|
Visualization utilities for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
5
|
import io
|
6
|
+
import itertools # Added for cycling
|
6
7
|
import math
|
7
8
|
import random
|
8
|
-
import
|
9
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
10
|
+
|
9
11
|
from PIL import Image, ImageDraw, ImageFont
|
10
12
|
|
11
13
|
# Define a base list of visually distinct colors for highlighting
|
12
14
|
# Format: (R, G, B)
|
13
15
|
_BASE_HIGHLIGHT_COLORS = [
|
14
|
-
(255, 0, 0),
|
15
|
-
(0, 255, 0),
|
16
|
-
(0, 0, 255),
|
17
|
-
(255, 0, 255),
|
18
|
-
(0, 255, 255),
|
19
|
-
(255, 165, 0),
|
20
|
-
(128, 0, 128),
|
21
|
-
(0, 128, 0),
|
22
|
-
(0, 0, 128),
|
23
|
-
(255, 215, 0),
|
24
|
-
(75, 0, 130),
|
16
|
+
(255, 0, 0), # Red
|
17
|
+
(0, 255, 0), # Green
|
18
|
+
(0, 0, 255), # Blue
|
19
|
+
(255, 0, 255), # Magenta
|
20
|
+
(0, 255, 255), # Cyan
|
21
|
+
(255, 165, 0), # Orange
|
22
|
+
(128, 0, 128), # Purple
|
23
|
+
(0, 128, 0), # Dark Green
|
24
|
+
(0, 0, 128), # Navy
|
25
|
+
(255, 215, 0), # Gold
|
26
|
+
(75, 0, 130), # Indigo
|
25
27
|
(240, 128, 128), # Light Coral
|
26
|
-
(32, 178, 170),
|
27
|
-
(138, 43, 226),
|
28
|
-
(160, 82, 45),
|
28
|
+
(32, 178, 170), # Light Sea Green
|
29
|
+
(138, 43, 226), # Blue Violet
|
30
|
+
(160, 82, 45), # Sienna
|
29
31
|
]
|
30
32
|
|
31
33
|
# Default Alpha for highlight fills
|
32
34
|
DEFAULT_FILL_ALPHA = 100
|
33
35
|
|
36
|
+
|
34
37
|
class ColorManager:
|
35
38
|
"""
|
36
39
|
Manages color assignment for highlights, ensuring consistency for labels.
|
37
40
|
"""
|
41
|
+
|
38
42
|
def __init__(self, alpha: int = DEFAULT_FILL_ALPHA):
|
39
43
|
"""
|
40
44
|
Initializes the ColorManager.
|
@@ -52,8 +56,9 @@ class ColorManager:
|
|
52
56
|
"""Applies the instance's alpha to an RGB tuple."""
|
53
57
|
return (*rgb, self._alpha)
|
54
58
|
|
55
|
-
def get_color(
|
56
|
-
|
59
|
+
def get_color(
|
60
|
+
self, label: Optional[str] = None, force_cycle: bool = False
|
61
|
+
) -> Tuple[int, int, int, int]:
|
57
62
|
"""
|
58
63
|
Gets an RGBA color tuple.
|
59
64
|
|
@@ -72,7 +77,7 @@ class ColorManager:
|
|
72
77
|
# Always get the next color, don't store by label
|
73
78
|
rgb = next(self._color_cycle)
|
74
79
|
return self._get_rgba_color(rgb)
|
75
|
-
|
80
|
+
|
76
81
|
if label is not None:
|
77
82
|
if label in self._labels_colors:
|
78
83
|
# Return existing color for this label
|
@@ -99,41 +104,43 @@ class ColorManager:
|
|
99
104
|
self._color_cycle = itertools.cycle(self._available_colors)
|
100
105
|
self._labels_colors = {}
|
101
106
|
|
102
|
-
|
107
|
+
|
108
|
+
# --- Global color state and functions removed ---
|
103
109
|
# HIGHLIGHT_COLORS, _color_cycle, _current_labels_colors, _used_colors_iterator
|
104
110
|
# get_next_highlight_color(), reset_highlight_colors()
|
105
111
|
|
106
|
-
|
107
|
-
|
108
|
-
|
112
|
+
|
113
|
+
def create_legend(
|
114
|
+
labels_colors: Dict[str, Tuple[int, int, int, int]], width: int = 200, item_height: int = 30
|
115
|
+
) -> Image.Image:
|
109
116
|
"""
|
110
117
|
Create a legend image for the highlighted elements.
|
111
|
-
|
118
|
+
|
112
119
|
Args:
|
113
120
|
labels_colors: Dictionary mapping labels to colors
|
114
121
|
width: Width of the legend image
|
115
122
|
item_height: Height of each legend item
|
116
|
-
|
123
|
+
|
117
124
|
Returns:
|
118
125
|
PIL Image with the legend
|
119
126
|
"""
|
120
127
|
# Calculate the height based on the number of labels
|
121
128
|
height = len(labels_colors) * item_height + 10 # 10px padding
|
122
|
-
|
129
|
+
|
123
130
|
# Create a white image
|
124
|
-
legend = Image.new(
|
131
|
+
legend = Image.new("RGBA", (width, height), (255, 255, 255, 255))
|
125
132
|
draw = ImageDraw.Draw(legend)
|
126
|
-
|
133
|
+
|
127
134
|
# Try to load a font, use default if not available
|
128
135
|
try:
|
129
136
|
# Use a commonly available font, adjust size
|
130
|
-
font = ImageFont.truetype("DejaVuSans.ttf", 14)
|
137
|
+
font = ImageFont.truetype("DejaVuSans.ttf", 14)
|
131
138
|
except IOError:
|
132
139
|
try:
|
133
|
-
|
140
|
+
font = ImageFont.truetype("Arial.ttf", 14)
|
134
141
|
except IOError:
|
135
142
|
font = ImageFont.load_default()
|
136
|
-
|
143
|
+
|
137
144
|
# Draw each legend item
|
138
145
|
y = 5 # Start with 5px padding
|
139
146
|
for label, color in labels_colors.items():
|
@@ -141,10 +148,10 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
|
|
141
148
|
# Handle potential case where alpha isn't provided (use default 255)
|
142
149
|
if len(color) == 3:
|
143
150
|
r, g, b = color
|
144
|
-
alpha = 255
|
151
|
+
alpha = 255 # Assume opaque if alpha is missing
|
145
152
|
else:
|
146
153
|
r, g, b, alpha = color
|
147
|
-
|
154
|
+
|
148
155
|
# Calculate the apparent color when drawn on white background
|
149
156
|
# Alpha blending formula: result = (source * alpha) + (dest * (1-alpha))
|
150
157
|
# Where alpha is normalized to 0-1 range
|
@@ -152,72 +159,74 @@ def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
|
|
152
159
|
apparent_r = int(r * alpha_norm + 255 * (1 - alpha_norm))
|
153
160
|
apparent_g = int(g * alpha_norm + 255 * (1 - alpha_norm))
|
154
161
|
apparent_b = int(b * alpha_norm + 255 * (1 - alpha_norm))
|
155
|
-
|
162
|
+
|
156
163
|
# Use solid color that matches the apparent color of the semi-transparent highlight
|
157
164
|
legend_color = (apparent_r, apparent_g, apparent_b, 255)
|
158
|
-
|
165
|
+
|
159
166
|
# Draw the color box
|
160
167
|
draw.rectangle([(10, y), (30, y + item_height - 5)], fill=legend_color)
|
161
|
-
|
168
|
+
|
162
169
|
# Draw the label text
|
163
170
|
draw.text((40, y + (item_height // 2) - 6), label, fill=(0, 0, 0, 255), font=font)
|
164
|
-
|
171
|
+
|
165
172
|
# Move to the next position
|
166
173
|
y += item_height
|
167
|
-
|
174
|
+
|
168
175
|
return legend
|
169
176
|
|
170
|
-
|
171
|
-
|
172
|
-
|
177
|
+
|
178
|
+
def merge_images_with_legend(
|
179
|
+
image: Image.Image, legend: Image.Image, position: str = "right"
|
180
|
+
) -> Image.Image:
|
173
181
|
"""
|
174
182
|
Merge an image with a legend.
|
175
|
-
|
183
|
+
|
176
184
|
Args:
|
177
185
|
image: Main image
|
178
186
|
legend: Legend image
|
179
187
|
position: Position of the legend ('right', 'bottom', 'top', 'left')
|
180
|
-
|
188
|
+
|
181
189
|
Returns:
|
182
190
|
Merged image
|
183
191
|
"""
|
184
192
|
if not legend:
|
185
|
-
return image
|
186
|
-
|
187
|
-
# Determine background color from top-left pixel (safer than assuming white)
|
188
|
-
bg_color = image.getpixel((0,0)) if image.mode == 'RGBA' else (255, 255, 255, 255)
|
193
|
+
return image # Return original image if legend is None or empty
|
189
194
|
|
190
|
-
|
195
|
+
bg_color = (255, 255, 255, 255) # Always use white for the merged background
|
196
|
+
|
197
|
+
if position == "right":
|
191
198
|
# Create a new image with extra width for the legend
|
192
199
|
merged_width = image.width + legend.width
|
193
200
|
merged_height = max(image.height, legend.height)
|
194
|
-
merged = Image.new(
|
201
|
+
merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
|
195
202
|
merged.paste(image, (0, 0))
|
196
|
-
merged.paste(
|
197
|
-
|
203
|
+
merged.paste(
|
204
|
+
legend, (image.width, 0), legend if legend.mode == "RGBA" else None
|
205
|
+
) # Handle transparency
|
206
|
+
elif position == "bottom":
|
198
207
|
# Create a new image with extra height for the legend
|
199
208
|
merged_width = max(image.width, legend.width)
|
200
209
|
merged_height = image.height + legend.height
|
201
|
-
merged = Image.new(
|
210
|
+
merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
|
202
211
|
merged.paste(image, (0, 0))
|
203
|
-
merged.paste(legend, (0, image.height), legend if legend.mode ==
|
204
|
-
elif position ==
|
212
|
+
merged.paste(legend, (0, image.height), legend if legend.mode == "RGBA" else None)
|
213
|
+
elif position == "top":
|
205
214
|
# Create a new image with extra height for the legend
|
206
215
|
merged_width = max(image.width, legend.width)
|
207
216
|
merged_height = image.height + legend.height
|
208
|
-
merged = Image.new(
|
209
|
-
merged.paste(legend, (0, 0), legend if legend.mode ==
|
217
|
+
merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
|
218
|
+
merged.paste(legend, (0, 0), legend if legend.mode == "RGBA" else None)
|
210
219
|
merged.paste(image, (0, legend.height))
|
211
|
-
elif position ==
|
220
|
+
elif position == "left":
|
212
221
|
# Create a new image with extra width for the legend
|
213
222
|
merged_width = image.width + legend.width
|
214
223
|
merged_height = max(image.height, legend.height)
|
215
|
-
merged = Image.new(
|
216
|
-
merged.paste(legend, (0, 0), legend if legend.mode ==
|
224
|
+
merged = Image.new("RGBA", (merged_width, merged_height), bg_color)
|
225
|
+
merged.paste(legend, (0, 0), legend if legend.mode == "RGBA" else None)
|
217
226
|
merged.paste(image, (legend.width, 0))
|
218
227
|
else:
|
219
228
|
# Invalid position, return the original image
|
220
229
|
print(f"Warning: Invalid legend position '{position}'. Returning original image.")
|
221
230
|
merged = image
|
222
|
-
|
223
|
-
return merged
|
231
|
+
|
232
|
+
return merged
|
natural_pdf/widgets/__init__.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from .viewer import SimpleInteractiveViewerWidget as InteractiveViewerWidget
|
2
|
-
|
3
1
|
# Also provide the original implementation for reference
|
4
|
-
from .viewer import InteractiveViewerWidget as _OriginalInteractiveViewerWidget
|
2
|
+
from .viewer import InteractiveViewerWidget as _OriginalInteractiveViewerWidget
|
3
|
+
from .viewer import SimpleInteractiveViewerWidget as InteractiveViewerWidget
|