natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,698 @@
|
|
1
|
+
"""
|
2
|
+
Centralized service for managing and rendering highlights in a PDF document.
|
3
|
+
"""
|
4
|
+
import io
|
5
|
+
import os
|
6
|
+
import logging # Added
|
7
|
+
from dataclasses import dataclass, field
|
8
|
+
from typing import List, Dict, Tuple, Optional, Any, Union
|
9
|
+
|
10
|
+
from PIL import Image, ImageDraw, ImageFont
|
11
|
+
from colour import Color
|
12
|
+
|
13
|
+
# Attempt to import Page for type hinting safely
|
14
|
+
try:
|
15
|
+
from .page import Page
|
16
|
+
except ImportError:
|
17
|
+
Page = Any # Fallback if circular import issue arises during type checking
|
18
|
+
|
19
|
+
# Import ColorManager and related utils
|
20
|
+
from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
|
21
|
+
|
22
|
+
# Drawing constants (candidates for relocation into ColorManager/Renderer).
BORDER_ALPHA = 180  # Alpha channel used for every highlight border
DEFAULT_FALLBACK_COLOR = (255, 255, 0)  # Yellow as plain RGB; alpha is added by ColorManager

# Module-level logger
logger = logging.getLogger(__name__)
|
28
|
+
|
29
|
+
@dataclass
class Highlight:
    """
    Data record describing one highlight to be drawn on a page.

    Holds the page index, the geometry (bounding box plus optional polygon),
    the RGBA fill color, an optional label and the attribute values that
    should be printed next to the highlight.
    """
    page_index: int
    bbox: Tuple[float, float, float, float]
    color: Tuple[int, int, int, int]  # Final RGBA color chosen by the service
    label: Optional[str] = None
    polygon: Optional[List[Tuple[float, float]]] = None
    attributes: Dict[str, Any] = field(default_factory=dict)  # Attribute values to render

    @property
    def is_polygon(self) -> bool:
        """Tell whether polygon coordinates with at least three points are set."""
        points = self.polygon
        if points is None:
            return False
        return len(points) >= 3

    @property
    def border_color(self) -> Tuple[int, int, int, int]:
        """Return the fill color's RGB channels combined with the border alpha."""
        # Same hue as the fill; only the alpha channel is swapped out.
        red, green, blue = self.color[0], self.color[1], self.color[2]
        return (red, green, blue, BORDER_ALPHA)
|
57
|
+
|
58
|
+
|
59
|
+
class HighlightRenderer:
    """
    Draws highlights (and optionally OCR text) onto a single page image.

    HighlightingService creates one renderer per render request. The base
    image is converted to RGBA and copied; all drawing happens on the copy,
    which ``render()`` returns.
    """

    def __init__(
        self,
        page: "Page",
        base_image: "Image.Image",
        highlights: List["Highlight"],
        scale: float,
        render_ocr: bool,
    ):
        """
        Args:
            page: Page object; only used to look up OCR elements.
            base_image: Rendered page image to draw on (left unmodified).
            highlights: Highlights to draw, in drawing order.
            scale: Factor applied to highlight/OCR coordinates to map them
                onto ``base_image`` pixels.
            render_ocr: Whether to overlay OCR text after the highlights.
        """
        self.page = page  # Keep page reference for OCR rendering
        self.base_image = base_image.convert('RGBA')  # Ensure RGBA
        self.highlights = highlights
        self.scale = scale
        self.render_ocr = render_ocr
        self.result_image = self.base_image.copy()
        self.vertex_size = max(3, int(2 * self.scale))  # Radius of corner markers

    def render(self) -> "Image.Image":
        """Draw highlights, then OCR text if requested, and return the result image."""
        self._draw_highlights()
        if self.render_ocr:
            self._render_ocr_text()
        return self.result_image

    def _draw_highlights(self):
        """Draw every highlight's shape, border, vertices and attribute box."""
        for highlight in self.highlights:
            # Each highlight gets its own transparent overlay so that
            # overlapping highlights are alpha-blended with each other.
            overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(overlay)

            if highlight.is_polygon:
                vertices = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
                draw.polygon(vertices, fill=highlight.color, outline=highlight.border_color, width=2)

                # Bounding box of the scaled polygon, used to place attributes
                x_coords = [p[0] for p in vertices]
                y_coords = [p[1] for p in vertices]
                scaled_bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
            else:  # Rectangle
                if highlight.bbox is None:
                    logger.warning(f"Highlight on page {highlight.page_index} has no usable geometry; skipping.")
                    continue
                x0_s, top_s, x1_s, bottom_s = (value * self.scale for value in highlight.bbox)
                # Normalize corner order: newer Pillow releases raise ValueError
                # when the second corner lies before the first.
                left, right = min(x0_s, x1_s), max(x0_s, x1_s)
                upper, lower = min(top_s, bottom_s), max(top_s, bottom_s)
                scaled_bbox = [left, upper, right, lower]
                draw.rectangle(scaled_bbox, fill=highlight.color, outline=highlight.border_color, width=2)
                vertices = [(left, upper), (right, upper), (right, lower), (left, lower)]

            self._draw_vertices(draw, vertices, highlight.border_color)

            # Draw attributes if present on the highlight object
            if highlight.attributes:
                self._draw_attributes(draw, highlight.attributes, scaled_bbox)

            # Composite this highlight's overlay onto the result using alpha blending
            self.result_image = Image.alpha_composite(self.result_image, overlay)

    def _draw_vertices(self, draw: "ImageDraw.ImageDraw", vertices: List[Tuple[float, float]], color: Tuple[int, int, int, int]):
        """Draw a filled circle of radius ``self.vertex_size`` at each vertex."""
        radius = self.vertex_size
        for x, y in vertices:
            draw.ellipse(
                [x - radius, y - radius, x + radius, y + radius],
                fill=color  # Callers pass the border color
            )

    def _draw_attributes(self, draw: "ImageDraw.ImageDraw", attributes: Dict[str, Any], bbox_scaled: List[float]):
        """
        Draw ``name: value`` lines in a white box near the top-right corner.

        Args:
            draw: Drawing context of the highlight's overlay.
            attributes: Attribute names mapped to values; floats are shown
                with two decimals, everything else via ``str()``.
            bbox_scaled: ``[x0, top, x1, bottom]`` of the highlight in pixels.
        """
        try:
            font_size = max(10, int(8 * self.scale))
            font = ImageFont.truetype("Arial.ttf", font_size)
        except IOError:
            # Arial is not available on this system: use Pillow's built-in font
            font = ImageFont.load_default()
            font_size = 10  # Reset size for default font

        line_height = font_size + int(4 * self.scale)  # Scaled line spacing
        bg_padding = int(3 * self.scale)
        max_width = 0
        text_lines = []

        # Format attribute lines and track the widest one for the background box
        for name, value in attributes.items():
            value_str = f"{value:.2f}" if isinstance(value, float) else str(value)
            line = f"{name}: {value_str}"
            text_lines.append(line)
            try:
                max_width = max(max_width, draw.textlength(line, font=font))
            except AttributeError:
                # textlength is unavailable (older Pillow): the box then only
                # spans the padding, but the text itself is still drawn.
                pass

        if not text_lines:
            return  # Nothing to draw

        total_height = line_height * len(text_lines)

        # Position near top-right corner with padding
        x = bbox_scaled[2] - int(2 * self.scale) - max_width
        y = bbox_scaled[1] + int(2 * self.scale)

        # Background rectangle (almost opaque white) with a thin dark outline
        draw.rectangle(
            [x - bg_padding, y - bg_padding, x + max_width + bg_padding, y + total_height + bg_padding],
            fill=(255, 255, 255, 240),
            outline=(0, 0, 0, 180),
            width=1
        )

        # Draw text lines (black)
        current_y = y
        for line in text_lines:
            draw.text((x, current_y), line, fill=(0, 0, 0, 255), font=font)
            current_y += line_height

    def _render_ocr_text(self):
        """
        Overlay the text of the page's OCR elements on the result image.

        Each OCR element's box is covered with a near-opaque white rectangle
        and its text is drawn on top, sized to fit the box. OCR is never run
        here; only already-present elements are used. If they cannot be
        obtained the image is left unchanged.
        """
        try:
            ocr_elements = self.page.find_all('text[source=ocr]')
            if not ocr_elements:
                # Don't run full OCR here, just pick up words already marked as OCR
                ocr_elements = [el for el in self.page.words if getattr(el, 'source', None) == 'ocr']
        except Exception as e:
            logger.warning(f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True)
            return  # Don't modify image if OCR elements aren't available

        if not ocr_elements:
            logger.debug(f"No OCR elements found for page {self.page.number} to render.")
            return

        overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)

        # Find the first common TrueType font that can be loaded
        font_path = None
        default_font = ImageFont.load_default()
        for fname in ["DejaVuSans.ttf", "Arial.ttf", "Helvetica.ttf", "FreeSans.ttf"]:
            try:
                ImageFont.truetype(fname, 10)  # Test load
                font_path = fname
                break
            except IOError:
                continue

        # Cache fonts per size so the font file is not re-read for every word
        font_cache: Dict[int, Any] = {}

        def load_font(size: int):
            """Return the TrueType font at ``size``, or None if it cannot be loaded."""
            if font_path is None:
                return None
            if size not in font_cache:
                try:
                    font_cache[size] = ImageFont.truetype(font_path, size)
                except IOError:
                    font_cache[size] = None
            return font_cache[size]

        for element in ocr_elements:
            x0, top, x1, bottom = element.bbox
            x0_s, top_s, x1_s, bottom_s = x0 * self.scale, top * self.scale, x1 * self.scale, bottom * self.scale
            box_w, box_h = x1_s - x0_s, bottom_s - top_s

            if box_h <= 0:
                continue  # Skip zero-height boxes

            # A missing text would make textlength/text raise TypeError
            text = element.text or ""

            # --- Font Size Calculation ---
            font_size = max(9, int(box_h * 0.85))  # Min size 9, 85% of box height
            sized_font = load_font(font_size) or default_font

            # --- Adjust Font Size if Text Overflows ---
            try:
                text_w = draw.textlength(text, font=sized_font)
                if text_w > box_w * 1.1:  # Allow 10% overflow
                    ratio = max(0.5, (box_w * 1.0) / text_w)  # Don't shrink below 50%
                    font_size = max(9, int(font_size * ratio))
                    sized_font = load_font(font_size) or sized_font  # Keep previous on failure
            except AttributeError:
                pass  # Skip adjustment if textlength is not available

            # --- Draw Background and Text ---
            padding = max(1, int(font_size * 0.05))  # Minimal padding
            draw.rectangle(
                [x0_s - padding, top_s - padding, x1_s + padding, bottom_s + padding],
                fill=(255, 255, 255, 230)  # Mostly opaque white background
            )

            # Text position: vertically centered, starting near the left edge
            try:
                if hasattr(sized_font, "getbbox"):  # Modern PIL
                    _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(text)
                    text_h = text_bottom_offset - text_top_offset
                    # Subtract the glyph box's top offset so the ink is centered
                    text_y = top_s + (box_h - text_h) / 2 - text_top_offset
                else:  # Older PIL approximation
                    text_y = top_s + (box_h - font_size) / 2
                text_x = x0_s + padding
            except Exception:
                # Fallback positioning
                text_x, text_y = x0_s + padding, top_s + padding

            draw.text((text_x, text_y), text, fill=(0, 0, 0, 255), font=sized_font)

        # Composite the OCR text overlay onto the result image
        self.result_image = Image.alpha_composite(self.result_image, overlay)
|
275
|
+
|
276
|
+
|
277
|
+
class HighlightingService:
    """
    Central service to manage highlight data and orchestrate rendering.

    Stores the persistent highlights of every page, keyed by page index,
    and delegates color assignment to a ColorManager and drawing to
    HighlightRenderer.
    """

    def __init__(self, pdf_object):
        """
        Args:
            pdf_object: Parent PDF object; its ``pages`` are used for bounds
                checks and page lookup.
        """
        self._pdf = pdf_object  # Reference to the parent PDF object
        self._highlights_by_page: Dict[int, List["Highlight"]] = {}
        self._color_manager = ColorManager()
        logger.info("HighlightingService initialized with ColorManager.")

    def _process_color_input(
        self,
        color_input: Optional[Union[Tuple, str]]
    ) -> Optional[Tuple[int, int, int, int]]:
        """
        Parse a user-supplied color into an RGBA tuple of ints (0-255).

        Args:
            color_input: ``None``, a 3- or 4-element tuple/list of ints
                and/or floats, or a color name/hex string understood by
                ``colour.Color``.

        Returns:
            The RGBA tuple, or ``None`` when the input is ``None`` or invalid.
            Inputs without an alpha channel get the ColorManager's alpha.
        """
        if color_input is None:
            return None

        if isinstance(color_input, (tuple, list)):
            # RGB floats are only treated as 0.0-1.0 fractions when all three are
            rgb_is_unit_float = all(
                isinstance(c, float) and 0.0 <= c <= 1.0 for c in color_input[:3]
            )
            channels = []
            for index, component in enumerate(color_input):
                if isinstance(component, float):
                    scale_up = (index < 3 and rgb_is_unit_float) or (index == 3 and 0.0 <= component <= 1.0)
                    value = int(component * 255) if scale_up else int(component)
                elif isinstance(component, int):
                    value = component
                else:
                    logger.warning(f"Invalid color component type: {component} in {color_input}")
                    return None
                channels.append(max(0, min(255, value)))  # Clamp to 0-255

            if len(channels) == 3:
                channels.append(self._color_manager._alpha)
            elif len(channels) != 4:
                logger.warning(f"Invalid color tuple length: {color_input}")
                return None
            return tuple(channels)

        if isinstance(color_input, str):
            try:
                # colour.Color exposes the channels as 0.0-1.0 floats
                parsed = Color(color_input)
                red, green, blue = (
                    max(0, min(255, int(channel * 255)))
                    for channel in (parsed.red, parsed.green, parsed.blue)
                )
                return (red, green, blue, self._color_manager._alpha)
            except ValueError:
                logger.warning(f"Invalid color string: '{color_input}'")
                return None
            except Exception as e:
                logger.error(f"Error processing color string '{color_input}': {e}")
                return None

        logger.warning(f"Invalid color input type: {type(color_input)}")
        return None

    def _determine_highlight_color(
        self,
        color_input: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False
    ) -> Tuple[int, int, int, int]:
        """
        Determine the final RGBA color for a highlight.

        Args:
            color_input: User-provided color (tuple or string).
            label: Label associated with the highlight.
            use_color_cycling: Passed to the ColorManager as ``force_cycle``.

        Returns:
            The parsed ``color_input`` when it is valid; otherwise the color
            the ColorManager returns for ``label``.
        """
        explicit_color = self._process_color_input(color_input)
        if explicit_color is not None:
            return explicit_color
        return self._color_manager.get_color(label=label, force_cycle=use_color_cycling)

    def add(
        self,
        page_index: int,
        bbox: Tuple[float, float, float, float],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append'
    ):
        """Add a rectangular highlight; see ``_add_internal`` for the arguments."""
        self._add_internal(
            page_index=page_index,
            bbox=bbox,
            polygon=None,
            color_input=color,
            label=label,
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
            existing=existing
        )

    def add_polygon(
        self,
        page_index: int,
        polygon: List[Tuple[float, float]],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append'
    ):
        """
        Add a polygonal highlight.

        The bounding box stored with the highlight is computed from the
        polygon. Polygons with fewer than three points are rejected with a
        warning and nothing is added.
        """
        if not polygon or len(polygon) < 3:
            logger.warning(f"Invalid polygon provided for page {page_index}. Cannot add highlight.")
            return

        x_coords = [p[0] for p in polygon]
        y_coords = [p[1] for p in polygon]
        bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

        self._add_internal(
            page_index=page_index,
            bbox=bbox,
            polygon=polygon,
            color_input=color,
            label=label,
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
            existing=existing
        )

    @staticmethod
    def _collect_attributes(element: Optional[Any], include_attrs: Optional[List[str]]) -> Dict[str, Any]:
        """
        Read the requested attributes from ``element``.

        Attributes whose value is ``None`` are omitted; attributes that do
        not exist on the element are omitted and reported with a warning.
        Returns an empty dict when there is no element or nothing requested.
        """
        collected: Dict[str, Any] = {}
        # Identity test: an element that happens to be falsy is still valid
        if element is None or not include_attrs:
            return collected

        missing = object()  # Sentinel to tell "absent" apart from "set to None"
        for attr_name in include_attrs:
            attr_value = getattr(element, attr_name, missing)
            if attr_value is missing:
                logger.warning(f"Attribute '{attr_name}' not found on element {element}")
            elif attr_value is not None:
                collected[attr_name] = attr_value
        return collected

    def _add_internal(
        self,
        page_index: int,
        bbox: Tuple[float, float, float, float],
        polygon: Optional[List[Tuple[float, float]]],
        color_input: Optional[Union[Tuple, str]],
        label: Optional[str],
        use_color_cycling: bool,
        element: Optional[Any],
        include_attrs: Optional[List[str]],
        existing: str
    ):
        """
        Create a Highlight and store it for ``page_index``.

        Args:
            page_index: 0-based page index; out-of-range values are logged
                and ignored.
            bbox: Bounding box of the highlight.
            polygon: Optional polygon points.
            color_input: Explicit color, or None to let the ColorManager pick.
            label: Optional label.
            use_color_cycling: Forwarded to the color selection.
            element: Object to read ``include_attrs`` from.
            include_attrs: Attribute names to display with the highlight.
            existing: ``'replace'`` clears the page's highlights first; any
                other value appends.
        """
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index}. Cannot add highlight.")
            return

        # 'replace' clears this page's highlights *before* adding the new one
        if existing == 'replace':
            self.clear_page(page_index)

        final_color = self._determine_highlight_color(
            color_input=color_input,
            label=label,
            use_color_cycling=use_color_cycling
        )

        highlight = Highlight(
            page_index=page_index,
            bbox=bbox,
            color=final_color,
            label=label,
            polygon=polygon,
            attributes=self._collect_attributes(element, include_attrs)
        )

        self._highlights_by_page.setdefault(page_index, []).append(highlight)
        logger.debug(f"Added highlight to page {page_index}: {highlight}")

    def clear_all(self):
        """Clears all highlights from all pages and resets the color manager."""
        self._highlights_by_page = {}
        self._color_manager.reset()
        logger.info("Cleared all highlights and reset ColorManager.")

    def clear_page(self, page_index: int):
        """
        Clears all highlights from a specific page.

        The color manager is deliberately not reset, so label colors stay
        consistent if highlights are added back.
        """
        if page_index in self._highlights_by_page:
            del self._highlights_by_page[page_index]
            logger.debug(f"Cleared highlights for page {page_index}.")

    def get_highlights_for_page(self, page_index: int) -> List["Highlight"]:
        """Returns the stored Highlight objects for a page (empty list if none)."""
        return self._highlights_by_page.get(page_index, [])

    def get_labels_and_colors(self) -> Dict[str, Tuple[int, int, int, int]]:
        """Returns the label-to-color mapping held by the color manager."""
        return self._color_manager.get_label_colors()

    @staticmethod
    def _resolve_render_scale(scale: float, resolution: Optional[float]) -> Tuple[float, float]:
        """
        Return ``(resolution, scale)`` that agree with each other.

        Highlight coordinates are multiplied by the scale while the base image
        is rasterized at the resolution (72 DPI == scale 1.0). When an explicit
        resolution is given, the drawing scale must be derived from it,
        otherwise the highlights would not line up with the image.
        """
        if resolution is None:
            return scale * 72, scale
        return resolution, resolution / 72.0

    def _render_base_image(self, page, render_resolution: float, **kwargs) -> "Image.Image":
        """
        Rasterize ``page`` through its underlying ``_page.to_image`` as RGBA.

        Raises:
            ValueError: If no PIL image can be obtained from the result.
        """
        img_object = page._page.to_image(resolution=render_resolution, **kwargs)
        base_image = getattr(img_object, 'annotated', None)
        if not isinstance(base_image, Image.Image):
            # Fallback: decode the PNG representation instead
            png_data = img_object._repr_png_()
            if not png_data:
                raise ValueError("Could not extract base PIL image from pdfplumber.")
            base_image = Image.open(io.BytesIO(png_data))
        return base_image.convert('RGBA')  # RGBA is required for compositing

    def render_page(
        self,
        page_index: int,
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = 'right',
        render_ocr: bool = False,
        resolution: Optional[float] = None,
        **kwargs  # Passed on to the page's to_image call
    ) -> Optional["Image.Image"]:
        """
        Renders a specific page with its persistent highlights.

        Args:
            page_index: The 0-based index of the page to render.
            scale: Scale factor for rendering; ignored when ``resolution``
                is given.
            labels: Whether to append a legend built from the color
                manager's label colors.
            legend_position: Position of the legend.
            render_ocr: Whether to render OCR text on the image.
            resolution: Optional resolution (DPI) for the base page image.
                Defaults to scale * 72.
            kwargs: Additional keyword arguments for the page's to_image.

        Returns:
            A PIL Image of the rendered page, or None if the page index is
            invalid or the base image cannot be produced.
        """
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index} for rendering.")
            return None

        page = self._pdf.pages[page_index]
        highlights_on_page = self.get_highlights_for_page(page_index)
        render_resolution, render_scale = self._resolve_render_scale(scale, resolution)

        # --- Get Base Image ---
        try:
            base_image = self._render_base_image(page, render_resolution, **kwargs)
            logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
        except Exception as e:
            logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
            return None

        # --- Render Highlights (and/or OCR text) ---
        if highlights_on_page or render_ocr:
            renderer = HighlightRenderer(
                page=page,
                base_image=base_image,
                highlights=highlights_on_page,
                scale=render_scale,
                render_ocr=render_ocr,
            )
            rendered_image = renderer.render()
        else:
            rendered_image = base_image  # No highlights, no OCR requested

        # --- Add Legend ---
        if labels:
            label_colors = self.get_labels_and_colors()
            if label_colors:
                legend = create_legend(label_colors)
                rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
                logger.debug(f"Added legend with {len(label_colors)} labels to page {page_index}.")

        return rendered_image

    def render_preview(
        self,
        page_index: int,
        temporary_highlights: List[Dict],
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = 'right',
        render_ocr: bool = False,
        resolution: Optional[float] = None,
        **kwargs
    ) -> Optional["Image.Image"]:
        """
        Renders a preview of a page showing only the given temporary highlights.

        The stored highlights are neither drawn nor modified.
        NOTE(review): colors are still obtained through the shared
        ColorManager, which may remember label colors - confirm whether
        that is acceptable for previews.

        Args:
            page_index: Index of the page to render.
            temporary_highlights: Highlight dicts; the keys read are
                ``bbox``, ``polygon``, ``color``, ``label``,
                ``use_color_cycling``, ``element``, ``include_attrs`` and
                ``page_index``. Entries without geometry are skipped.
            scale: Scale factor for rendering; ignored when ``resolution``
                is given.
            labels: Whether to append a legend of the preview's labels.
            legend_position: Position of the legend.
            render_ocr: Whether to render OCR text.
            resolution: Resolution (DPI) for base page image rendering.
            **kwargs: Additional args for the page's to_image.

        Returns:
            PIL Image of the preview, or None if rendering fails.
        """
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index} for render_preview.")
            return None

        page = self._pdf.pages[page_index]
        render_resolution, render_scale = self._resolve_render_scale(scale, resolution)

        try:
            base_image = self._render_base_image(page, render_resolution, **kwargs)

            # Convert temporary highlight dicts to Highlight objects
            preview_highlights = []
            for hl_data in temporary_highlights:
                bbox = hl_data.get('bbox')
                polygon = hl_data.get('polygon')
                if not bbox and not polygon:
                    continue  # No geometry, nothing to draw

                if not bbox:
                    # Polygon-only entry: derive the bounding box from it
                    if len(polygon) < 3:
                        logger.warning(f"Invalid polygon in preview highlight for page {page_index}; skipping.")
                        continue
                    x_coords = [p[0] for p in polygon]
                    y_coords = [p[1] for p in polygon]
                    bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

                final_color = self._determine_highlight_color(
                    color_input=hl_data.get('color'),
                    label=hl_data.get('label'),
                    use_color_cycling=hl_data.get('use_color_cycling', False)
                )

                preview_highlights.append(Highlight(
                    page_index=hl_data.get('page_index', page_index),
                    bbox=bbox,
                    polygon=polygon,
                    color=final_color,
                    label=hl_data.get('label'),
                    attributes=self._collect_attributes(
                        hl_data.get('element'), hl_data.get('include_attrs')
                    )
                ))

            # Render only these highlights
            renderer = HighlightRenderer(page, base_image, preview_highlights, render_scale, render_ocr)
            final_image = renderer.render()

            # Legend is built from the temporary highlights only
            if labels:
                preview_labels = {h.label: h.color for h in preview_highlights if h.label}
                if preview_labels:
                    legend = create_legend(preview_labels)
                    final_image = merge_images_with_legend(final_image, legend, position=legend_position)

        except Exception as e:
            logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
            return None

        return final_image
|