natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,43 +1,46 @@
|
|
1
1
|
"""
|
2
2
|
Centralized service for managing and rendering highlights in a PDF document.
|
3
3
|
"""
|
4
|
+
|
4
5
|
import io
|
6
|
+
import logging # Added
|
5
7
|
import os
|
6
|
-
import logging # Added
|
7
8
|
from dataclasses import dataclass, field
|
8
|
-
from typing import
|
9
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
9
10
|
|
10
|
-
from PIL import Image, ImageDraw, ImageFont
|
11
11
|
from colour import Color
|
12
|
+
from PIL import Image, ImageDraw, ImageFont
|
12
13
|
|
13
14
|
# Attempt to import Page for type hinting safely
|
14
15
|
try:
|
15
16
|
from .page import Page
|
16
17
|
except ImportError:
|
17
|
-
Page = Any
|
18
|
+
Page = Any # Fallback if circular import issue arises during type checking
|
18
19
|
|
19
20
|
# Import ColorManager and related utils
|
20
21
|
from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
|
21
22
|
|
22
23
|
# Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
|
23
|
-
BORDER_ALPHA = 180
|
24
|
-
DEFAULT_FALLBACK_COLOR = (255, 255, 0)
|
24
|
+
BORDER_ALPHA = 180 # Default alpha for highlight border
|
25
|
+
DEFAULT_FALLBACK_COLOR = (255, 255, 0) # Yellow fallback (RGB only, alpha added by ColorManager)
|
25
26
|
|
26
27
|
# Setup logger
|
27
28
|
logger = logging.getLogger(__name__)
|
28
29
|
|
30
|
+
|
29
31
|
@dataclass
|
30
32
|
class Highlight:
|
31
33
|
"""
|
32
34
|
Represents a single highlight to be drawn.
|
33
35
|
Stores geometric data, color, label, and extracted attributes.
|
34
36
|
"""
|
37
|
+
|
35
38
|
page_index: int
|
36
39
|
bbox: Tuple[float, float, float, float]
|
37
|
-
color: Tuple[int, int, int, int]
|
40
|
+
color: Tuple[int, int, int, int] # Final RGBA color determined by service
|
38
41
|
label: Optional[str] = None
|
39
42
|
polygon: Optional[List[Tuple[float, float]]] = None
|
40
|
-
attributes: Dict[str, Any] = field(default_factory=dict)
|
43
|
+
attributes: Dict[str, Any] = field(default_factory=dict) # Store extracted attribute values
|
41
44
|
|
42
45
|
@property
|
43
46
|
def is_polygon(self) -> bool:
|
@@ -46,14 +49,9 @@ class Highlight:
|
|
46
49
|
|
47
50
|
@property
|
48
51
|
def border_color(self) -> Tuple[int, int, int, int]:
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
self.color[0],
|
53
|
-
self.color[1],
|
54
|
-
self.color[2],
|
55
|
-
BORDER_ALPHA
|
56
|
-
)
|
52
|
+
"""Calculate a slightly darker/more opaque border color."""
|
53
|
+
# Use base color but increase alpha for border
|
54
|
+
return (self.color[0], self.color[1], self.color[2], BORDER_ALPHA)
|
57
55
|
|
58
56
|
|
59
57
|
class HighlightRenderer:
|
@@ -61,6 +59,7 @@ class HighlightRenderer:
|
|
61
59
|
Handles the drawing logic for highlights on a single page image.
|
62
60
|
Instantiated by HighlightingService for each render request.
|
63
61
|
"""
|
62
|
+
|
64
63
|
def __init__(
|
65
64
|
self,
|
66
65
|
page: Page,
|
@@ -69,13 +68,13 @@ class HighlightRenderer:
|
|
69
68
|
scale: float,
|
70
69
|
render_ocr: bool,
|
71
70
|
):
|
72
|
-
self.page = page
|
73
|
-
self.base_image = base_image.convert(
|
71
|
+
self.page = page # Keep page reference for OCR rendering
|
72
|
+
self.base_image = base_image.convert("RGBA") # Ensure RGBA
|
74
73
|
self.highlights = highlights
|
75
74
|
self.scale = scale
|
76
75
|
self.render_ocr = render_ocr
|
77
76
|
self.result_image = self.base_image.copy()
|
78
|
-
self.vertex_size = max(3, int(2 * self.scale))
|
77
|
+
self.vertex_size = max(3, int(2 * self.scale)) # Size of corner markers
|
79
78
|
|
80
79
|
def render(self) -> Image.Image:
|
81
80
|
"""Executes the rendering process."""
|
@@ -88,7 +87,7 @@ class HighlightRenderer:
|
|
88
87
|
"""Draws all highlight shapes, borders, vertices, and attributes."""
|
89
88
|
for highlight in self.highlights:
|
90
89
|
# Create a transparent overlay for this single highlight
|
91
|
-
overlay = Image.new(
|
90
|
+
overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
|
92
91
|
draw = ImageDraw.Draw(overlay)
|
93
92
|
|
94
93
|
scaled_bbox = None
|
@@ -96,7 +95,9 @@ class HighlightRenderer:
|
|
96
95
|
if highlight.is_polygon:
|
97
96
|
scaled_polygon = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
|
98
97
|
# Draw polygon fill and border
|
99
|
-
draw.polygon(
|
98
|
+
draw.polygon(
|
99
|
+
scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
|
100
|
+
)
|
100
101
|
self._draw_vertices(draw, scaled_polygon, highlight.border_color)
|
101
102
|
|
102
103
|
# Calculate scaled bbox for attribute drawing
|
@@ -104,44 +105,63 @@ class HighlightRenderer:
|
|
104
105
|
y_coords = [p[1] for p in scaled_polygon]
|
105
106
|
scaled_bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
|
106
107
|
|
107
|
-
else:
|
108
|
+
else: # Rectangle
|
108
109
|
x0, top, x1, bottom = highlight.bbox
|
109
|
-
x0_s, top_s, x1_s, bottom_s =
|
110
|
+
x0_s, top_s, x1_s, bottom_s = (
|
111
|
+
x0 * self.scale,
|
112
|
+
top * self.scale,
|
113
|
+
x1 * self.scale,
|
114
|
+
bottom * self.scale,
|
115
|
+
)
|
110
116
|
scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
|
111
117
|
# Draw rectangle fill and border
|
112
|
-
draw.rectangle(
|
118
|
+
draw.rectangle(
|
119
|
+
scaled_bbox, fill=highlight.color, outline=highlight.border_color, width=2
|
120
|
+
)
|
113
121
|
|
114
122
|
vertices = [(x0_s, top_s), (x1_s, top_s), (x1_s, bottom_s), (x0_s, bottom_s)]
|
115
123
|
self._draw_vertices(draw, vertices, highlight.border_color)
|
116
124
|
|
117
125
|
# Draw attributes if present on the highlight object
|
118
|
-
if highlight.attributes and scaled_bbox:
|
126
|
+
if highlight.attributes and scaled_bbox: # Ensure bbox is calculated
|
119
127
|
self._draw_attributes(draw, highlight.attributes, scaled_bbox)
|
120
128
|
|
121
129
|
# Composite this highlight's overlay onto the result using alpha blending
|
122
130
|
self.result_image = Image.alpha_composite(self.result_image, overlay)
|
123
131
|
|
124
|
-
def _draw_vertices(
|
132
|
+
def _draw_vertices(
|
133
|
+
self,
|
134
|
+
draw: ImageDraw.Draw,
|
135
|
+
vertices: List[Tuple[float, float]],
|
136
|
+
color: Tuple[int, int, int, int],
|
137
|
+
):
|
125
138
|
"""Draw small markers at each vertex."""
|
126
139
|
for x, y in vertices:
|
127
140
|
# Draw ellipse centered at vertex
|
128
141
|
draw.ellipse(
|
129
|
-
[
|
130
|
-
|
142
|
+
[
|
143
|
+
x - self.vertex_size,
|
144
|
+
y - self.vertex_size,
|
145
|
+
x + self.vertex_size,
|
146
|
+
y + self.vertex_size,
|
147
|
+
],
|
148
|
+
fill=color, # Use border color for vertices
|
131
149
|
)
|
132
150
|
|
133
|
-
def _draw_attributes(
|
151
|
+
def _draw_attributes(
|
152
|
+
self, draw: ImageDraw.Draw, attributes: Dict[str, Any], bbox_scaled: List[float]
|
153
|
+
):
|
134
154
|
"""Draws attribute key-value pairs on the highlight."""
|
135
155
|
try:
|
136
156
|
# Slightly larger font, scaled
|
137
157
|
font_size = max(10, int(8 * self.scale))
|
138
158
|
# Prioritize monospace fonts for better alignment
|
139
|
-
font = ImageFont.truetype("Arial.ttf", font_size)
|
159
|
+
font = ImageFont.truetype("Arial.ttf", font_size) # Fallback sans-serif
|
140
160
|
except IOError:
|
141
161
|
font = ImageFont.load_default()
|
142
|
-
font_size = 10
|
162
|
+
font_size = 10 # Reset size for default font
|
143
163
|
|
144
|
-
line_height = font_size + int(4 * self.scale)
|
164
|
+
line_height = font_size + int(4 * self.scale) # Scaled line spacing
|
145
165
|
bg_padding = int(3 * self.scale)
|
146
166
|
max_width = 0
|
147
167
|
text_lines = []
|
@@ -149,17 +169,19 @@ class HighlightRenderer:
|
|
149
169
|
# Format attribute lines
|
150
170
|
for name, value in attributes.items():
|
151
171
|
if isinstance(value, float):
|
152
|
-
value_str = f"{value:.2f}"
|
172
|
+
value_str = f"{value:.2f}" # Format floats
|
153
173
|
else:
|
154
174
|
value_str = str(value)
|
155
175
|
line = f"{name}: {value_str}"
|
156
176
|
text_lines.append(line)
|
157
177
|
try:
|
158
|
-
|
159
|
-
|
160
|
-
except AttributeError:
|
178
|
+
# Calculate max width for background box
|
179
|
+
max_width = max(max_width, draw.textlength(line, font=font))
|
180
|
+
except AttributeError:
|
181
|
+
pass # Ignore if textlength not available
|
161
182
|
|
162
|
-
if not text_lines:
|
183
|
+
if not text_lines:
|
184
|
+
return # Nothing to draw
|
163
185
|
|
164
186
|
total_height = line_height * len(text_lines)
|
165
187
|
|
@@ -175,8 +197,8 @@ class HighlightRenderer:
|
|
175
197
|
draw.rectangle(
|
176
198
|
[bg_x0, bg_y0, bg_x1, bg_y1],
|
177
199
|
fill=(255, 255, 255, 240),
|
178
|
-
outline=(0, 0, 0, 180),
|
179
|
-
width=1
|
200
|
+
outline=(0, 0, 0, 180), # Light black outline
|
201
|
+
width=1,
|
180
202
|
)
|
181
203
|
|
182
204
|
# Draw text lines (black)
|
@@ -190,21 +212,25 @@ class HighlightRenderer:
|
|
190
212
|
# Use the page reference to get OCR elements
|
191
213
|
try:
|
192
214
|
# Try finding first, then extracting if necessary
|
193
|
-
ocr_elements = self.page.find_all(
|
215
|
+
ocr_elements = self.page.find_all("text[source=ocr]")
|
194
216
|
if not ocr_elements:
|
195
|
-
|
196
|
-
|
197
|
-
|
217
|
+
# Don't run full OCR here, just extract if already run
|
218
|
+
ocr_elements = [
|
219
|
+
el for el in self.page.words if getattr(el, "source", None) == "ocr"
|
220
|
+
]
|
221
|
+
# Alternative: self.page.extract_ocr_elements() - but might be slow
|
198
222
|
|
199
223
|
except Exception as e:
|
200
|
-
logger.warning(
|
201
|
-
|
224
|
+
logger.warning(
|
225
|
+
f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
|
226
|
+
)
|
227
|
+
return # Don't modify image if OCR elements aren't available
|
202
228
|
|
203
229
|
if not ocr_elements:
|
204
230
|
logger.debug(f"No OCR elements found for page {self.page.number} to render.")
|
205
231
|
return
|
206
232
|
|
207
|
-
overlay = Image.new(
|
233
|
+
overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
|
208
234
|
draw = ImageDraw.Draw(overlay)
|
209
235
|
|
210
236
|
# Find a suitable font
|
@@ -212,22 +238,28 @@ class HighlightRenderer:
|
|
212
238
|
default_font = ImageFont.load_default()
|
213
239
|
common_fonts = ["DejaVuSans.ttf", "Arial.ttf", "Helvetica.ttf", "FreeSans.ttf"]
|
214
240
|
for fname in common_fonts:
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
241
|
+
try:
|
242
|
+
ImageFont.truetype(fname, 10) # Test load
|
243
|
+
font_path = fname
|
244
|
+
break
|
245
|
+
except IOError:
|
246
|
+
continue
|
221
247
|
|
222
248
|
for element in ocr_elements:
|
223
249
|
x0, top, x1, bottom = element.bbox
|
224
|
-
x0_s, top_s, x1_s, bottom_s =
|
250
|
+
x0_s, top_s, x1_s, bottom_s = (
|
251
|
+
x0 * self.scale,
|
252
|
+
top * self.scale,
|
253
|
+
x1 * self.scale,
|
254
|
+
bottom * self.scale,
|
255
|
+
)
|
225
256
|
box_w, box_h = x1_s - x0_s, bottom_s - top_s
|
226
257
|
|
227
|
-
if box_h <= 0:
|
258
|
+
if box_h <= 0:
|
259
|
+
continue # Skip zero-height boxes
|
228
260
|
|
229
261
|
# --- Font Size Calculation ---
|
230
|
-
font_size = max(9, int(box_h * 0.85))
|
262
|
+
font_size = max(9, int(box_h * 0.85)) # Min size 9, 85% of box height
|
231
263
|
|
232
264
|
try:
|
233
265
|
sized_font = ImageFont.truetype(font_path, font_size) if font_path else default_font
|
@@ -236,33 +268,36 @@ class HighlightRenderer:
|
|
236
268
|
|
237
269
|
# --- Adjust Font Size if Text Overflows ---
|
238
270
|
try:
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
271
|
+
text_w = draw.textlength(element.text, font=sized_font)
|
272
|
+
if text_w > box_w * 1.1: # Allow 10% overflow
|
273
|
+
ratio = max(0.5, (box_w * 1.0) / text_w) # Don't shrink below 50%
|
274
|
+
font_size = max(9, int(font_size * ratio))
|
275
|
+
if font_path:
|
276
|
+
try:
|
277
|
+
sized_font = ImageFont.truetype(font_path, font_size)
|
278
|
+
except IOError:
|
279
|
+
pass # Keep previous if error
|
280
|
+
except AttributeError:
|
281
|
+
pass # Skip adjustment if textlength fails
|
247
282
|
|
248
283
|
# --- Draw Background and Text ---
|
249
|
-
padding = max(1, int(font_size * 0.05))
|
284
|
+
padding = max(1, int(font_size * 0.05)) # Minimal padding
|
250
285
|
draw.rectangle(
|
251
286
|
[x0_s - padding, top_s - padding, x1_s + padding, bottom_s + padding],
|
252
|
-
fill=(255, 255, 255, 230)
|
287
|
+
fill=(255, 255, 255, 230), # Highly transparent white background
|
253
288
|
)
|
254
289
|
|
255
290
|
# Calculate text position (centered vertically, slightly offset from left)
|
256
291
|
try:
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
292
|
+
if hasattr(sized_font, "getbbox"): # Modern PIL
|
293
|
+
_, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
|
294
|
+
text_h = text_bottom_offset - text_top_offset
|
295
|
+
else: # Older PIL approximation
|
296
|
+
text_h = font_size
|
297
|
+
text_y = top_s + (box_h - text_h) / 2
|
298
|
+
# Adjust for vertical offset in some fonts
|
299
|
+
text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
|
300
|
+
text_x = x0_s + padding # Start near left edge with padding
|
266
301
|
|
267
302
|
except Exception:
|
268
303
|
# Fallback positioning
|
@@ -279,18 +314,18 @@ class HighlightingService:
|
|
279
314
|
Central service to manage highlight data and orchestrate rendering.
|
280
315
|
Holds the state of all highlights across the document.
|
281
316
|
"""
|
317
|
+
|
282
318
|
def __init__(self, pdf_object):
|
283
|
-
self._pdf = pdf_object
|
319
|
+
self._pdf = pdf_object # Reference to the parent PDF object
|
284
320
|
self._highlights_by_page: Dict[int, List[Highlight]] = {}
|
285
|
-
self._color_manager = ColorManager()
|
321
|
+
self._color_manager = ColorManager() # Instantiate the color manager
|
286
322
|
logger.info("HighlightingService initialized with ColorManager.")
|
287
323
|
|
288
324
|
# Removed _get_next_color - logic moved to ColorManager
|
289
325
|
# Removed _color_cycle, _labels_colors - managed by ColorManager
|
290
326
|
|
291
327
|
def _process_color_input(
|
292
|
-
self,
|
293
|
-
color_input: Optional[Union[Tuple, str]]
|
328
|
+
self, color_input: Optional[Union[Tuple, str]]
|
294
329
|
) -> Optional[Tuple[int, int, int, int]]:
|
295
330
|
"""
|
296
331
|
Parses various color input formats into a standard RGBA tuple (0-255).
|
@@ -303,32 +338,37 @@ class HighlightingService:
|
|
303
338
|
# Convert float values (0.0-1.0) to int (0-255)
|
304
339
|
processed = []
|
305
340
|
all_float = all(isinstance(c, float) and 0.0 <= c <= 1.0 for c in color_input[:3])
|
306
|
-
|
341
|
+
|
307
342
|
for i, c in enumerate(color_input):
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
343
|
+
if isinstance(c, float):
|
344
|
+
val = (
|
345
|
+
int(c * 255)
|
346
|
+
if (i < 3 and all_float) or (i == 3 and 0.0 <= c <= 1.0)
|
347
|
+
else int(c)
|
348
|
+
)
|
349
|
+
elif isinstance(c, int):
|
350
|
+
val = c
|
351
|
+
else:
|
352
|
+
logger.warning(f"Invalid color component type: {c} in {color_input}")
|
353
|
+
return None # Invalid type
|
354
|
+
processed.append(max(0, min(255, val))) # Clamp to 0-255
|
316
355
|
|
317
356
|
# Check length and add default alpha if needed
|
318
357
|
if len(processed) == 3:
|
319
|
-
|
320
|
-
|
321
|
-
|
358
|
+
# Use alpha from ColorManager instance
|
359
|
+
processed.append(self._color_manager._alpha)
|
360
|
+
return tuple(processed)
|
322
361
|
elif len(processed) == 4:
|
323
|
-
|
362
|
+
return tuple(processed)
|
324
363
|
else:
|
325
|
-
|
326
|
-
|
364
|
+
logger.warning(f"Invalid color tuple length: {color_input}")
|
365
|
+
return None # Invalid length
|
327
366
|
|
328
367
|
elif isinstance(color_input, str):
|
329
368
|
try:
|
330
369
|
# Convert color name/hex string to RGB tuple (0.0-1.0 floats)
|
331
|
-
from colour import Color
|
370
|
+
from colour import Color # Import here if not at top
|
371
|
+
|
332
372
|
color_obj = Color(color_input)
|
333
373
|
# Convert floats (0.0-1.0) to integers (0-255)
|
334
374
|
r = int(color_obj.red * 255)
|
@@ -342,27 +382,27 @@ class HighlightingService:
|
|
342
382
|
rgba = (r, g, b, self._color_manager._alpha)
|
343
383
|
return rgba
|
344
384
|
except ImportError:
|
345
|
-
|
346
|
-
|
385
|
+
logger.error("Color utility class not found. Cannot process string colors.")
|
386
|
+
return None
|
347
387
|
except ValueError:
|
348
|
-
|
349
|
-
|
388
|
+
logger.warning(f"Invalid color string: '{color_input}'")
|
389
|
+
return None
|
350
390
|
except Exception as e:
|
351
|
-
|
352
|
-
|
391
|
+
logger.error(f"Error processing color string '{color_input}': {e}")
|
392
|
+
return None
|
353
393
|
else:
|
354
|
-
|
355
|
-
|
394
|
+
logger.warning(f"Invalid color input type: {type(color_input)}")
|
395
|
+
return None
|
356
396
|
|
357
397
|
def _determine_highlight_color(
|
358
398
|
self,
|
359
399
|
color_input: Optional[Union[Tuple, str]] = None,
|
360
400
|
label: Optional[str] = None,
|
361
|
-
use_color_cycling: bool = False
|
401
|
+
use_color_cycling: bool = False,
|
362
402
|
) -> Tuple[int, int, int, int]:
|
363
403
|
"""
|
364
404
|
Determines the final RGBA color for a highlight using the ColorManager.
|
365
|
-
|
405
|
+
|
366
406
|
Args:
|
367
407
|
color_input: User-provided color (tuple or string).
|
368
408
|
label: Label associated with the highlight.
|
@@ -383,48 +423,63 @@ class HighlightingService:
|
|
383
423
|
def add(
|
384
424
|
self,
|
385
425
|
page_index: int,
|
386
|
-
bbox: Union[Tuple[float, float, float, float], Any],
|
426
|
+
bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
|
387
427
|
color: Optional[Union[Tuple, str]] = None,
|
388
428
|
label: Optional[str] = None,
|
389
429
|
use_color_cycling: bool = False,
|
390
430
|
element: Optional[Any] = None,
|
391
431
|
include_attrs: Optional[List[str]] = None,
|
392
|
-
existing: str =
|
432
|
+
existing: str = "append",
|
393
433
|
):
|
394
434
|
"""Adds a rectangular highlight."""
|
395
|
-
|
435
|
+
|
396
436
|
processed_bbox: Tuple[float, float, float, float]
|
397
437
|
# Check if bbox is an object with expected attributes (likely a Region)
|
398
438
|
# Assuming Region object has x0, top, x1, bottom attributes based on error context
|
399
|
-
if (
|
400
|
-
hasattr(bbox,
|
401
|
-
|
439
|
+
if (
|
440
|
+
hasattr(bbox, "x0")
|
441
|
+
and hasattr(bbox, "top")
|
442
|
+
and hasattr(bbox, "x1")
|
443
|
+
and hasattr(bbox, "bottom")
|
444
|
+
):
|
445
|
+
try:
|
402
446
|
# Ensure attributes are numeric before creating tuple
|
403
|
-
processed_bbox = (
|
404
|
-
|
405
|
-
|
406
|
-
|
447
|
+
processed_bbox = (
|
448
|
+
float(bbox.x0),
|
449
|
+
float(bbox.top),
|
450
|
+
float(bbox.x1),
|
451
|
+
float(bbox.bottom),
|
452
|
+
)
|
453
|
+
except (ValueError, TypeError):
|
454
|
+
logger.error(
|
455
|
+
f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values."
|
456
|
+
)
|
457
|
+
return
|
407
458
|
elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
459
|
+
try:
|
460
|
+
# Ensure elements are numeric and convert to tuple
|
461
|
+
processed_bbox = tuple(float(v) for v in bbox)
|
462
|
+
except (ValueError, TypeError):
|
463
|
+
logger.error(
|
464
|
+
f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values."
|
465
|
+
)
|
466
|
+
return
|
414
467
|
else:
|
415
|
-
logger.error(
|
416
|
-
|
417
|
-
|
468
|
+
logger.error(
|
469
|
+
f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object."
|
470
|
+
)
|
471
|
+
return # Don't proceed if bbox is invalid
|
472
|
+
|
418
473
|
self._add_internal(
|
419
474
|
page_index=page_index,
|
420
|
-
bbox=processed_bbox,
|
475
|
+
bbox=processed_bbox, # Use the processed tuple
|
421
476
|
polygon=None,
|
422
477
|
color_input=color,
|
423
478
|
label=label,
|
424
479
|
use_color_cycling=use_color_cycling,
|
425
480
|
element=element,
|
426
481
|
include_attrs=include_attrs,
|
427
|
-
existing=existing
|
482
|
+
existing=existing,
|
428
483
|
)
|
429
484
|
|
430
485
|
def add_polygon(
|
@@ -436,7 +491,7 @@ class HighlightingService:
|
|
436
491
|
use_color_cycling: bool = False,
|
437
492
|
element: Optional[Any] = None,
|
438
493
|
include_attrs: Optional[List[str]] = None,
|
439
|
-
existing: str =
|
494
|
+
existing: str = "append",
|
440
495
|
):
|
441
496
|
"""Adds a polygonal highlight."""
|
442
497
|
# Calculate bounding box from polygon for internal storage
|
@@ -447,7 +502,7 @@ class HighlightingService:
|
|
447
502
|
else:
|
448
503
|
logger.warning(f"Invalid polygon provided for page {page_index}. Cannot add highlight.")
|
449
504
|
return
|
450
|
-
|
505
|
+
|
451
506
|
self._add_internal(
|
452
507
|
page_index=page_index,
|
453
508
|
bbox=bbox,
|
@@ -457,7 +512,7 @@ class HighlightingService:
|
|
457
512
|
use_color_cycling=use_color_cycling,
|
458
513
|
element=element,
|
459
514
|
include_attrs=include_attrs,
|
460
|
-
existing=existing
|
515
|
+
existing=existing,
|
461
516
|
)
|
462
517
|
|
463
518
|
def _add_internal(
|
@@ -470,34 +525,32 @@ class HighlightingService:
|
|
470
525
|
use_color_cycling: bool,
|
471
526
|
element: Optional[Any],
|
472
527
|
include_attrs: Optional[List[str]],
|
473
|
-
existing: str
|
528
|
+
existing: str,
|
474
529
|
):
|
475
530
|
"""Internal method to create and store a Highlight object."""
|
476
531
|
if page_index < 0 or page_index >= len(self._pdf.pages):
|
477
|
-
|
478
|
-
|
532
|
+
logger.error(f"Invalid page index {page_index}. Cannot add highlight.")
|
533
|
+
return
|
479
534
|
|
480
535
|
# Handle 'replace' logic - clear highlights for this page *before* adding new one
|
481
|
-
if existing ==
|
536
|
+
if existing == "replace":
|
482
537
|
self.clear_page(page_index)
|
483
538
|
|
484
539
|
# Determine the final color using the ColorManager
|
485
540
|
final_color = self._determine_highlight_color(
|
486
|
-
color_input=color_input,
|
487
|
-
label=label,
|
488
|
-
use_color_cycling=use_color_cycling
|
541
|
+
color_input=color_input, label=label, use_color_cycling=use_color_cycling
|
489
542
|
)
|
490
543
|
|
491
544
|
# Extract attributes from the element if requested
|
492
545
|
attributes_to_draw = {}
|
493
546
|
if element and include_attrs:
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
547
|
+
for attr_name in include_attrs:
|
548
|
+
try:
|
549
|
+
attr_value = getattr(element, attr_name, None)
|
550
|
+
if attr_value is not None:
|
551
|
+
attributes_to_draw[attr_name] = attr_value
|
552
|
+
except AttributeError:
|
553
|
+
logger.warning(f"Attribute '{attr_name}' not found on element {element}")
|
501
554
|
|
502
555
|
# Create the highlight data object
|
503
556
|
highlight = Highlight(
|
@@ -506,7 +559,7 @@ class HighlightingService:
|
|
506
559
|
color=final_color,
|
507
560
|
label=label,
|
508
561
|
polygon=polygon,
|
509
|
-
attributes=attributes_to_draw
|
562
|
+
attributes=attributes_to_draw,
|
510
563
|
)
|
511
564
|
|
512
565
|
# Add to the list for the specific page
|
@@ -542,10 +595,10 @@ class HighlightingService:
|
|
542
595
|
page_index: int,
|
543
596
|
scale: float = 2.0,
|
544
597
|
labels: bool = True,
|
545
|
-
legend_position: str =
|
598
|
+
legend_position: str = "right",
|
546
599
|
render_ocr: bool = False,
|
547
600
|
resolution: Optional[float] = None,
|
548
|
-
**kwargs
|
601
|
+
**kwargs, # Pass other args to pdfplumber.page.to_image if needed
|
549
602
|
) -> Optional[Image.Image]:
|
550
603
|
"""
|
551
604
|
Renders a specific page with its highlights.
|
@@ -569,26 +622,30 @@ class HighlightingService:
|
|
569
622
|
return None
|
570
623
|
|
571
624
|
page = self._pdf[page_index]
|
572
|
-
highlights_on_page = self.get_highlights_for_page(
|
625
|
+
highlights_on_page = self.get_highlights_for_page(
|
626
|
+
page_index
|
627
|
+
) # This list will be empty if clear_page was called
|
573
628
|
|
574
|
-
# --- Get Base Image ---
|
629
|
+
# --- Get Base Image ---
|
575
630
|
try:
|
576
631
|
render_resolution = resolution if resolution is not None else scale * 72
|
577
632
|
img_object = page._page.to_image(resolution=render_resolution, **kwargs)
|
578
633
|
base_image = img_object.annotated
|
579
634
|
if not isinstance(base_image, Image.Image):
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
base_image = base_image.convert(
|
586
|
-
logger.debug(
|
635
|
+
png_data = img_object._repr_png_()
|
636
|
+
if png_data:
|
637
|
+
base_image = Image.open(io.BytesIO(png_data)).convert("RGB")
|
638
|
+
else:
|
639
|
+
raise ValueError("Could not extract base PIL image from pdfplumber.")
|
640
|
+
base_image = base_image.convert("RGBA")
|
641
|
+
logger.debug(
|
642
|
+
f"Base image for page {page_index} rendered with resolution {render_resolution}."
|
643
|
+
)
|
587
644
|
except Exception as e:
|
588
645
|
logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
|
589
646
|
return None
|
590
647
|
|
591
|
-
# --- Render Highlights ---
|
648
|
+
# --- Render Highlights ---
|
592
649
|
rendered_image: Image.Image
|
593
650
|
if highlights_on_page:
|
594
651
|
renderer = HighlightRenderer(
|
@@ -600,32 +657,36 @@ class HighlightingService:
|
|
600
657
|
)
|
601
658
|
rendered_image = renderer.render()
|
602
659
|
else:
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
# --- Add Legend (Based ONLY on this page's highlights) ---
|
660
|
+
if render_ocr:
|
661
|
+
# Still render OCR even if no highlights
|
662
|
+
renderer = HighlightRenderer(page, base_image, [], scale, True)
|
663
|
+
rendered_image = renderer.render()
|
664
|
+
else:
|
665
|
+
rendered_image = base_image # No highlights, no OCR requested
|
666
|
+
|
667
|
+
# --- Add Legend (Based ONLY on this page's highlights) ---
|
611
668
|
if labels:
|
612
669
|
# CHANGE: Create label_colors map only from highlights_on_page
|
613
670
|
labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
|
614
671
|
for hl in highlights_on_page:
|
615
672
|
if hl.label and hl.label not in labels_colors_on_page:
|
616
673
|
labels_colors_on_page[hl.label] = hl.color
|
617
|
-
|
618
|
-
if labels_colors_on_page:
|
674
|
+
|
675
|
+
if labels_colors_on_page: # Only add legend if there are labels on this page
|
619
676
|
legend = create_legend(labels_colors_on_page)
|
620
|
-
if legend:
|
621
|
-
|
622
|
-
|
677
|
+
if legend: # Ensure create_legend didn't return None
|
678
|
+
rendered_image = merge_images_with_legend(
|
679
|
+
rendered_image, legend, legend_position
|
680
|
+
)
|
681
|
+
logger.debug(
|
682
|
+
f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}."
|
683
|
+
)
|
623
684
|
else:
|
624
|
-
|
685
|
+
logger.debug(f"Legend creation returned None for page {page_index}.")
|
625
686
|
else:
|
626
|
-
|
627
|
-
|
628
|
-
return rendered_image
|
687
|
+
logger.debug(f"No labels found on page {page_index}, skipping legend.")
|
688
|
+
|
689
|
+
return rendered_image
|
629
690
|
|
630
691
|
def render_preview(
|
631
692
|
self,
|
@@ -633,10 +694,10 @@ class HighlightingService:
|
|
633
694
|
temporary_highlights: List[Dict],
|
634
695
|
scale: float = 2.0,
|
635
696
|
labels: bool = True,
|
636
|
-
legend_position: str =
|
697
|
+
legend_position: str = "right",
|
637
698
|
render_ocr: bool = False,
|
638
699
|
resolution: Optional[float] = None,
|
639
|
-
**kwargs
|
700
|
+
**kwargs,
|
640
701
|
) -> Optional[Image.Image]:
|
641
702
|
"""
|
642
703
|
Renders a preview image for a specific page containing only the
|
@@ -665,11 +726,16 @@ class HighlightingService:
|
|
665
726
|
try:
|
666
727
|
# Get base image from pdfplumber using the Page object's underlying _page
|
667
728
|
img_object = page._page.to_image(resolution=render_resolution, **kwargs)
|
668
|
-
base_image =
|
729
|
+
base_image = (
|
730
|
+
img_object.annotated
|
731
|
+
if hasattr(img_object, "annotated")
|
732
|
+
else img_object._repr_png_()
|
733
|
+
)
|
669
734
|
if isinstance(base_image, bytes):
|
670
|
-
|
671
|
-
|
672
|
-
|
735
|
+
from io import BytesIO
|
736
|
+
|
737
|
+
base_image = Image.open(BytesIO(base_image))
|
738
|
+
base_image = base_image.convert("RGB") # Ensure consistent format
|
673
739
|
|
674
740
|
# Convert temporary highlight dicts to Highlight objects
|
675
741
|
# Note: Colors/labels should be determined *here* for temporary preview
|
@@ -677,15 +743,15 @@ class HighlightingService:
|
|
677
743
|
for hl_data in temporary_highlights:
|
678
744
|
# Determine the final color using the service logic
|
679
745
|
final_color = self._determine_highlight_color(
|
680
|
-
color_input=hl_data.get(
|
681
|
-
label=hl_data.get(
|
682
|
-
use_color_cycling=hl_data.get(
|
746
|
+
color_input=hl_data.get("color"),
|
747
|
+
label=hl_data.get("label"),
|
748
|
+
use_color_cycling=hl_data.get("use_color_cycling", False),
|
683
749
|
)
|
684
750
|
|
685
751
|
# Extract potential attributes to draw
|
686
752
|
attrs_to_draw = {}
|
687
|
-
element = hl_data.get(
|
688
|
-
include_attrs = hl_data.get(
|
753
|
+
element = hl_data.get("element")
|
754
|
+
include_attrs = hl_data.get("include_attrs")
|
689
755
|
if element and include_attrs:
|
690
756
|
for attr_name in include_attrs:
|
691
757
|
try:
|
@@ -693,18 +759,22 @@ class HighlightingService:
|
|
693
759
|
if attr_value is not None:
|
694
760
|
attrs_to_draw[attr_name] = attr_value
|
695
761
|
except AttributeError:
|
696
|
-
logger.warning(
|
762
|
+
logger.warning(
|
763
|
+
f"Attribute '{attr_name}' not found on element {element}"
|
764
|
+
)
|
697
765
|
|
698
766
|
# Add highlight if geometry exists
|
699
|
-
if hl_data.get(
|
700
|
-
preview_highlights.append(
|
701
|
-
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
767
|
+
if hl_data.get("bbox") or hl_data.get("polygon"):
|
768
|
+
preview_highlights.append(
|
769
|
+
Highlight(
|
770
|
+
page_index=hl_data["page_index"],
|
771
|
+
bbox=hl_data.get("bbox"),
|
772
|
+
polygon=hl_data.get("polygon"),
|
773
|
+
color=final_color, # Use the determined color
|
774
|
+
label=hl_data.get("label"),
|
775
|
+
attributes=attrs_to_draw,
|
776
|
+
)
|
777
|
+
)
|
708
778
|
|
709
779
|
# Render only these highlights
|
710
780
|
renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
|
@@ -716,9 +786,11 @@ class HighlightingService:
|
|
716
786
|
preview_labels = {h.label: h.color for h in preview_highlights if h.label}
|
717
787
|
if preview_labels:
|
718
788
|
legend = create_legend(preview_labels)
|
719
|
-
final_image = merge_images_with_legend(
|
789
|
+
final_image = merge_images_with_legend(
|
790
|
+
rendered_image, legend, position=legend_position
|
791
|
+
)
|
720
792
|
else:
|
721
|
-
final_image = rendered_image
|
793
|
+
final_image = rendered_image # No legend needed
|
722
794
|
else:
|
723
795
|
final_image = rendered_image
|
724
796
|
|
@@ -726,4 +798,4 @@ class HighlightingService:
|
|
726
798
|
logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
|
727
799
|
return None
|
728
800
|
|
729
|
-
return final_image
|
801
|
+
return final_image
|