natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,698 @@
1
+ """
2
+ Centralized service for managing and rendering highlights in a PDF document.
3
+ """
4
+ import io
5
+ import os
6
+ import logging # Added
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Dict, Tuple, Optional, Any, Union
9
+
10
+ from PIL import Image, ImageDraw, ImageFont
11
+ from colour import Color
12
+
13
+ # Attempt to import Page for type hinting safely
14
+ try:
15
+ from .page import Page
16
+ except ImportError:
17
+ Page = Any # Fallback if circular import issue arises during type checking
18
+
19
+ # Import ColorManager and related utils
20
+ from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
21
+
22
+ # Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
23
+ BORDER_ALPHA = 180 # Default alpha for highlight border
24
+ DEFAULT_FALLBACK_COLOR = (255, 255, 0) # Yellow fallback (RGB only, alpha added by ColorManager)
25
+
26
+ # Setup logger
27
+ logger = logging.getLogger(__name__)
28
+
29
@dataclass
class Highlight:
    """
    Data record for one highlight that is to be painted on a page.

    Carries the page index, the geometry (a bounding box and, optionally,
    polygon vertices), the resolved RGBA color, an optional label and any
    attribute values that should be drawn next to the highlight.
    """
    page_index: int
    bbox: Tuple[float, float, float, float]
    color: Tuple[int, int, int, int]  # Resolved RGBA color chosen by the service
    label: Optional[str] = None
    polygon: Optional[List[Tuple[float, float]]] = None
    attributes: Dict[str, Any] = field(default_factory=dict)  # Attribute values to draw

    @property
    def is_polygon(self) -> bool:
        """Tell whether polygon vertices are present and number at least three."""
        if self.polygon is None:
            return False
        return len(self.polygon) >= 3

    @property
    def border_color(self) -> Tuple[int, int, int, int]:
        """Return the fill color's RGB channels combined with the BORDER_ALPHA opacity."""
        red, green, blue = self.color[0], self.color[1], self.color[2]
        return (red, green, blue, BORDER_ALPHA)
57
+
58
+
59
class HighlightRenderer:
    """
    Handles the drawing logic for highlights on a single page image.
    Instantiated by HighlightingService for each render request.

    The renderer works on an RGBA copy of the base image. Each highlight is
    painted on its own transparent overlay and alpha-composited onto the
    result, so overlapping highlights blend instead of overwriting each other.
    """

    # Font files tried, in order, for attribute labels. Arial stays first so the
    # result is unchanged wherever it is installed; the remaining names give a
    # scalable font on systems that lack Arial.
    _ATTRIBUTE_FONTS = ("Arial.ttf", "DejaVuSans.ttf", "Helvetica.ttf", "FreeSans.ttf")
    # Font files tried, in order, for the OCR text overlay.
    _OCR_FONTS = ("DejaVuSans.ttf", "Arial.ttf", "Helvetica.ttf", "FreeSans.ttf")

    def __init__(
        self,
        page: "Page",
        base_image: "Image.Image",
        highlights: List["Highlight"],
        scale: float,
        render_ocr: bool,
    ):
        """
        Args:
            page: Page the image belongs to; only used to look up OCR elements.
            base_image: Rendered page image; it is converted to RGBA and copied.
            highlights: Highlights to draw, in drawing order.
            scale: Factor that maps highlight/page coordinates to image pixels.
            render_ocr: Whether to paint OCR text on top of the highlights.
        """
        self.page = page  # Keep page reference for OCR rendering
        self.base_image = base_image.convert('RGBA')  # Ensure RGBA
        self.highlights = highlights
        self.scale = scale
        self.render_ocr = render_ocr
        self.result_image = self.base_image.copy()
        self.vertex_size = max(3, int(2 * self.scale))  # Radius of corner markers
        # Per-renderer caches so a font file is probed/loaded once, not once
        # per highlight or per OCR element.
        self._font_names: Dict[Tuple[str, ...], Optional[str]] = {}
        self._font_cache: Dict[Tuple[str, int], Any] = {}

    def render(self) -> "Image.Image":
        """Draw the highlights (and OCR text if requested) and return the result image."""
        self._draw_highlights()
        if self.render_ocr:
            self._render_ocr_text()
        return self.result_image

    def _resolve_font_name(self, candidates: Tuple[str, ...]) -> Optional[str]:
        """Return the first font file in ``candidates`` that loads, or None; result is cached."""
        if candidates not in self._font_names:
            found = None
            for name in candidates:
                try:
                    ImageFont.truetype(name, 10)  # Probe load only
                except OSError:
                    continue
                found = name
                break
            self._font_names[candidates] = found
        return self._font_names[candidates]

    def _truetype(self, name: str, size: int):
        """Load ``name`` at ``size`` through the per-renderer cache; may raise OSError."""
        key = (name, size)
        if key not in self._font_cache:
            self._font_cache[key] = ImageFont.truetype(name, size)
        return self._font_cache[key]

    def _draw_highlights(self):
        """Draws all highlight shapes, borders, vertices, and attributes."""
        for highlight in self.highlights:
            # One transparent overlay per highlight so each is alpha-blended separately
            overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
            draw = ImageDraw.Draw(overlay)

            if highlight.is_polygon:
                vertices = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
                draw.polygon(vertices, fill=highlight.color, outline=highlight.border_color, width=2)

                # Bounding box of the scaled polygon, used to place the attribute label
                x_coords = [p[0] for p in vertices]
                y_coords = [p[1] for p in vertices]
                scaled_bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
            elif highlight.bbox is not None:
                x0, top, x1, bottom = highlight.bbox
                x0_s, top_s, x1_s, bottom_s = (
                    x0 * self.scale, top * self.scale, x1 * self.scale, bottom * self.scale
                )
                scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
                draw.rectangle(scaled_bbox, fill=highlight.color, outline=highlight.border_color, width=2)
                vertices = [(x0_s, top_s), (x1_s, top_s), (x1_s, bottom_s), (x0_s, bottom_s)]
            else:
                # Neither a usable polygon nor a bbox: skip instead of failing the whole render
                logger.warning(f"Highlight on page {highlight.page_index} has no usable geometry; skipping.")
                continue

            self._draw_vertices(draw, vertices, highlight.border_color)

            # Draw attributes if present on the highlight object
            if highlight.attributes:
                self._draw_attributes(draw, highlight.attributes, scaled_bbox)

            # Composite this highlight's overlay onto the result using alpha blending
            self.result_image = Image.alpha_composite(self.result_image, overlay)

    def _draw_vertices(
        self,
        draw: "ImageDraw.ImageDraw",
        vertices: List[Tuple[float, float]],
        color: Tuple[int, int, int, int],
    ):
        """Draw a filled circle of radius ``vertex_size`` centered on each vertex."""
        radius = self.vertex_size
        for x, y in vertices:
            draw.ellipse(
                [x - radius, y - radius, x + radius, y + radius],
                fill=color  # Callers pass the border color
            )

    def _draw_attributes(
        self,
        draw: "ImageDraw.ImageDraw",
        attributes: Dict[str, Any],
        bbox_scaled: List[float],
    ):
        """
        Draws attribute key-value pairs in a white box near the top-right
        corner of the (scaled) highlight bounding box.

        Floats are formatted with two decimals; other values use ``str()``.
        """
        font_size = max(10, int(8 * self.scale))
        font = None
        font_name = self._resolve_font_name(self._ATTRIBUTE_FONTS)
        if font_name is not None:
            try:
                font = self._truetype(font_name, font_size)
            except OSError:
                font = None
        if font is None:
            font = ImageFont.load_default()
            font_size = 10  # Layout below assumes this size for the default font

        line_height = font_size + int(4 * self.scale)  # Scaled line spacing
        bg_padding = int(3 * self.scale)
        max_width = 0
        text_lines = []

        # Format attribute lines and measure the widest one for the background box
        for name, value in attributes.items():
            value_str = f"{value:.2f}" if isinstance(value, float) else str(value)
            line = f"{name}: {value_str}"
            text_lines.append(line)
            try:
                max_width = max(max_width, draw.textlength(line, font=font))
            except (AttributeError, ValueError):
                # textlength is unavailable (AttributeError) or refuses the
                # text, e.g. because it is multi-line (ValueError): keep the
                # width measured so far rather than aborting the render.
                pass

        if not text_lines:
            return  # Nothing to draw

        total_height = line_height * len(text_lines)

        # Position near top-right corner with padding
        x = bbox_scaled[2] - int(2 * self.scale) - max_width
        y = bbox_scaled[1] + int(2 * self.scale)

        # Background rectangle: nearly opaque white with a dark outline
        draw.rectangle(
            [x - bg_padding, y - bg_padding, x + max_width + bg_padding, y + total_height + bg_padding],
            fill=(255, 255, 255, 240),
            outline=(0, 0, 0, 180),
            width=1
        )

        # Draw text lines (black)
        current_y = y
        for line in text_lines:
            draw.text((x, current_y), line, fill=(0, 0, 0, 255), font=font)
            current_y += line_height

    def _render_ocr_text(self):
        """
        Renders OCR text onto the image.

        Looks up elements already marked with source 'ocr' on the page (it does
        not run OCR itself), covers each element's box with a nearly opaque
        white rectangle and draws the element text inside it, shrinking the
        font (down to 50%, minimum size 9) when the text overflows the box.
        """
        try:
            ocr_elements = self.page.find_all('text[source=ocr]')
            if not ocr_elements:
                # Don't run full OCR here, just pick up words already tagged as OCR
                ocr_elements = [el for el in self.page.words if getattr(el, 'source', None) == 'ocr']
        except Exception as e:
            logger.warning(f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True)
            return  # Don't modify image if OCR elements aren't available

        if not ocr_elements:
            logger.debug(f"No OCR elements found for page {self.page.number} to render.")
            return

        overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(overlay)

        default_font = ImageFont.load_default()
        font_path = self._resolve_font_name(self._OCR_FONTS)

        for element in ocr_elements:
            # Treat a missing text as empty so the box is still drawn and
            # the text measuring/drawing calls below receive a string.
            text = "" if element.text is None else element.text

            x0, top, x1, bottom = element.bbox
            x0_s, top_s, x1_s, bottom_s = (
                x0 * self.scale, top * self.scale, x1 * self.scale, bottom * self.scale
            )
            box_w, box_h = x1_s - x0_s, bottom_s - top_s

            if box_h <= 0:
                continue  # Skip zero-height boxes

            # --- Font Size Calculation ---
            font_size = max(9, int(box_h * 0.85))  # Min size 9, 85% of box height

            sized_font = default_font
            if font_path:
                try:
                    sized_font = self._truetype(font_path, font_size)
                except OSError:
                    sized_font = default_font

            # --- Adjust Font Size if Text Overflows ---
            try:
                text_w = draw.textlength(text, font=sized_font)
                # text_w > 0 guards the division below (empty text in a
                # negative-width box would otherwise divide by zero).
                if text_w > 0 and text_w > box_w * 1.1:  # Allow 10% overflow
                    ratio = max(0.5, (box_w * 1.0) / text_w)  # Don't shrink below 50%
                    font_size = max(9, int(font_size * ratio))
                    if font_path:
                        try:
                            sized_font = self._truetype(font_path, font_size)
                        except OSError:
                            pass  # Keep previous font if the smaller size fails to load
            except (AttributeError, ValueError):
                pass  # Skip adjustment if the text cannot be measured

            # --- Draw Background and Text ---
            padding = max(1, int(font_size * 0.05))  # Minimal padding
            draw.rectangle(
                [x0_s - padding, top_s - padding, x1_s + padding, bottom_s + padding],
                fill=(255, 255, 255, 230)  # Nearly opaque white background
            )

            # Text position: centered vertically, starting near the left edge
            try:
                if hasattr(sized_font, "getbbox"):
                    _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(text)
                    text_h = text_bottom_offset - text_top_offset
                else:
                    # No getbbox on this font object: approximate with the font size
                    text_top_offset = 0
                    text_h = font_size
                # Subtract the glyph top offset so the visible ink is what gets centered
                text_y = top_s + (box_h - text_h) / 2 - text_top_offset
                text_x = x0_s + padding
            except Exception:
                # Fallback positioning
                text_x, text_y = x0_s + padding, top_s + padding

            draw.text((text_x, text_y), text, fill=(0, 0, 0, 255), font=sized_font)

        # Composite the OCR text overlay onto the result image
        self.result_image = Image.alpha_composite(self.result_image, overlay)
275
+
276
+
277
class HighlightingService:
    """
    Central service to manage highlight data and orchestrate rendering.
    Holds the state of all highlights across the document, keyed by page
    index, and delegates color assignment to a ColorManager.
    """

    # Sentinel distinguishing "attribute missing" from "attribute is None"
    _MISSING = object()

    def __init__(self, pdf_object):
        """
        Args:
            pdf_object: Parent PDF object; its ``pages`` sequence is used for
                index validation and page lookup.
        """
        self._pdf = pdf_object  # Reference to the parent PDF object
        self._highlights_by_page: Dict[int, List["Highlight"]] = {}
        self._color_manager = ColorManager()  # Assigns label/cycle colors
        logger.info("HighlightingService initialized with ColorManager.")

    def _process_color_input(
        self,
        color_input: Optional[Union[Tuple, List, str]]
    ) -> Optional[Tuple[int, int, int, int]]:
        """
        Parses various color input formats into a standard RGBA tuple (0-255).

        Accepted inputs:
            * a 3- or 4-item tuple/list of numbers. RGB components are read as
              normalized (0.0-1.0) when at least one of them is a float and all
              of them lie in [0, 1]; otherwise as 0-255 values. A float alpha in
              [0, 1] is scaled to 0-255. Values are rounded and clamped.
            * a color name / hex string understood by ``colour.Color``.
        When no alpha is supplied, the ColorManager's alpha is used.

        Returns:
            The RGBA tuple, or None if the input is None or invalid.
        """
        if color_input is None:
            return None

        if isinstance(color_input, (tuple, list)):
            if len(color_input) not in (3, 4):
                logger.warning(f"Invalid color tuple length: {color_input}")
                return None
            for c in color_input:
                if not isinstance(c, (int, float)):
                    logger.warning(f"Invalid color component type: {c} in {color_input}")
                    return None

            rgb = color_input[:3]
            # Mixed input such as (1.0, 0, 0) is still a normalized color
            normalized = any(isinstance(c, float) for c in rgb) and all(0 <= c <= 1 for c in rgb)

            processed = []
            try:
                for i, c in enumerate(color_input):
                    if i < 3:
                        scale_up = normalized
                    else:
                        scale_up = isinstance(c, float) and 0.0 <= c <= 1.0
                    # round() rather than int(): truncation maps 0.5 to 127 and
                    # can land one below the intended channel value.
                    val = round(c * 255) if scale_up else int(c)
                    processed.append(max(0, min(255, val)))  # Clamp to 0-255
            except (ValueError, OverflowError):
                # NaN / infinite components cannot be converted to int
                logger.warning(f"Invalid color component value in {color_input}")
                return None

            if len(processed) == 3:
                processed.append(self._color_manager._alpha)
            return tuple(processed)

        if isinstance(color_input, str):
            try:
                color_obj = Color(color_input)
                channels = (color_obj.red, color_obj.green, color_obj.blue)
                r, g, b = (max(0, min(255, round(ch * 255))) for ch in channels)
                return (r, g, b, self._color_manager._alpha)
            except ValueError:
                logger.warning(f"Invalid color string: '{color_input}'")
                return None
            except Exception as e:
                logger.error(f"Error processing color string '{color_input}': {e}")
                return None

        logger.warning(f"Invalid color input type: {type(color_input)}")
        return None

    def _determine_highlight_color(
        self,
        color_input: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False
    ) -> Tuple[int, int, int, int]:
        """
        Determines the final RGBA color for a highlight.

        Args:
            color_input: User-provided color (tuple/list or string).
            label: Label associated with the highlight.
            use_color_cycling: Passed to the ColorManager as ``force_cycle``.

        Returns:
            The explicit color when ``color_input`` parses successfully,
            otherwise the color returned by the ColorManager for the label.
        """
        explicit_color = self._process_color_input(color_input)
        if explicit_color is not None:
            return explicit_color
        return self._color_manager.get_color(label=label, force_cycle=use_color_cycling)

    def _extract_attributes(self, element: Optional[Any], include_attrs: Optional[List[str]]) -> Dict[str, Any]:
        """
        Collects the requested attribute values from ``element``.

        Attributes whose value is None are left out; attributes that do not
        exist on the element are left out and reported with a warning.
        """
        extracted: Dict[str, Any] = {}
        if element is None or not include_attrs:
            return extracted
        for attr_name in include_attrs:
            attr_value = getattr(element, attr_name, self._MISSING)
            if attr_value is self._MISSING:
                logger.warning(f"Attribute '{attr_name}' not found on element {element}")
            elif attr_value is not None:
                extracted[attr_name] = attr_value
        return extracted

    def add(
        self,
        page_index: int,
        bbox: Tuple[float, float, float, float],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append'
    ):
        """
        Adds a rectangular highlight.

        Args:
            page_index: Index of the page the highlight belongs to.
            bbox: (x0, top, x1, bottom) of the highlight.
            color: Explicit color (tuple/list or string); None lets the
                ColorManager choose.
            label: Optional label for the highlight.
            use_color_cycling: Passed to the ColorManager as ``force_cycle``.
            element: Object to read ``include_attrs`` from.
            include_attrs: Names of element attributes to draw with the highlight.
            existing: 'replace' clears the page's highlights first; any other
                value appends.
        """
        self._add_internal(
            page_index=page_index,
            bbox=bbox,
            polygon=None,
            color_input=color,
            label=label,
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
            existing=existing
        )

    def add_polygon(
        self,
        page_index: int,
        polygon: List[Tuple[float, float]],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append'
    ):
        """
        Adds a polygonal highlight.

        The polygon needs at least three points; otherwise a warning is logged
        and nothing is added. The bounding box stored with the highlight is
        computed from the polygon. Other arguments are as for ``add``.
        """
        if not polygon or len(polygon) < 3:
            logger.warning(f"Invalid polygon provided for page {page_index}. Cannot add highlight.")
            return

        x_coords = [p[0] for p in polygon]
        y_coords = [p[1] for p in polygon]
        bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

        self._add_internal(
            page_index=page_index,
            bbox=bbox,
            polygon=polygon,
            color_input=color,
            label=label,
            use_color_cycling=use_color_cycling,
            element=element,
            include_attrs=include_attrs,
            existing=existing
        )

    def _add_internal(
        self,
        page_index: int,
        bbox: Tuple[float, float, float, float],
        polygon: Optional[List[Tuple[float, float]]],
        color_input: Optional[Union[Tuple, str]],
        label: Optional[str],
        use_color_cycling: bool,
        element: Optional[Any],
        include_attrs: Optional[List[str]],
        existing: str
    ):
        """Internal method to create and store a Highlight object."""
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index}. Cannot add highlight.")
            return

        # 'replace' clears this page's highlights *before* the new one is added
        if existing == 'replace':
            self.clear_page(page_index)

        final_color = self._determine_highlight_color(
            color_input=color_input,
            label=label,
            use_color_cycling=use_color_cycling
        )

        highlight = Highlight(
            page_index=page_index,
            bbox=bbox,
            color=final_color,
            label=label,
            polygon=polygon,
            attributes=self._extract_attributes(element, include_attrs)
        )

        self._highlights_by_page.setdefault(page_index, []).append(highlight)
        logger.debug(f"Added highlight to page {page_index}: {highlight}")

    def clear_all(self):
        """Clears all highlights from all pages and resets the color manager."""
        self._highlights_by_page = {}
        self._color_manager.reset()
        logger.info("Cleared all highlights and reset ColorManager.")

    def clear_page(self, page_index: int):
        """
        Clears all highlights from a specific page.

        The color manager is deliberately not reset here.
        """
        if page_index in self._highlights_by_page:
            del self._highlights_by_page[page_index]
            logger.debug(f"Cleared highlights for page {page_index}.")

    def get_highlights_for_page(self, page_index: int) -> List["Highlight"]:
        """Returns the Highlight objects stored for a page (empty list if none)."""
        return self._highlights_by_page.get(page_index, [])

    def get_labels_and_colors(self) -> Dict[str, Tuple[int, int, int, int]]:
        """Returns the label-to-color mapping reported by the ColorManager."""
        return self._color_manager.get_label_colors()

    def _render_base_image(self, page: "Page", resolution: float, **kwargs) -> "Image.Image":
        """
        Renders ``page`` through its underlying pdfplumber page and returns a PIL image.

        Uses the ``annotated`` attribute of the pdfplumber result when it is a
        PIL image, otherwise falls back to decoding its PNG representation.

        Raises:
            ValueError: If no image could be obtained from the pdfplumber result.
        """
        img_object = page._page.to_image(resolution=resolution, **kwargs)
        base_image = getattr(img_object, 'annotated', None)
        if not isinstance(base_image, Image.Image):
            png_data = img_object._repr_png_()
            if not png_data:
                raise ValueError("Could not extract base PIL image from pdfplumber.")
            base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
        return base_image

    def render_page(
        self,
        page_index: int,
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = 'right',
        render_ocr: bool = False,
        resolution: Optional[float] = None,
        **kwargs  # Pass other args to pdfplumber.page.to_image if needed
    ) -> Optional["Image.Image"]:
        """
        Renders a specific page with its highlights.

        Args:
            page_index: The 0-based index of the page to render.
            scale: Scale factor for rendering highlights.
            labels: Whether to include a legend for highlights.
            legend_position: Position of the legend.
            render_ocr: Whether to render OCR text on the image.
            resolution: Optional resolution (DPI) for the base page image.
                        Defaults to scale * 72. When given, highlights are
                        scaled by resolution / 72 so they stay aligned with
                        the base image.
            kwargs: Additional keyword arguments for pdfplumber's page.to_image.

        Returns:
            A PIL Image object of the rendered page, or None if the page index
            is invalid or the base image cannot be rendered.
        """
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index} for rendering.")
            return None

        page = self._pdf.pages[page_index]
        highlights_on_page = self.get_highlights_for_page(page_index)

        # --- Get Base Image ---
        if resolution is not None:
            render_resolution = resolution
            draw_scale = resolution / 72.0  # Keep highlights aligned with the image
        else:
            render_resolution = scale * 72
            draw_scale = scale
        try:
            base_image = self._render_base_image(page, render_resolution, **kwargs).convert('RGBA')
            logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
        except Exception as e:
            logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
            return None

        # --- Render Highlights (and OCR text, even when there are no highlights) ---
        if highlights_on_page or render_ocr:
            renderer = HighlightRenderer(
                page=page,
                base_image=base_image,
                highlights=highlights_on_page,
                scale=draw_scale,
                render_ocr=render_ocr,
            )
            rendered_image = renderer.render()
        else:
            rendered_image = base_image  # No highlights, no OCR requested

        # --- Add Legend ---
        # NOTE(review): the legend lists every label known to the ColorManager,
        # not only labels used on this page - confirm this is intended.
        if labels:
            label_colors = self.get_labels_and_colors()
            if label_colors:
                legend = create_legend(label_colors)
                rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
                logger.debug(f"Added legend with {len(label_colors)} labels to page {page_index}.")

        return rendered_image

    def render_preview(
        self,
        page_index: int,
        temporary_highlights: List[Dict],
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = 'right',
        render_ocr: bool = False,
        resolution: Optional[float] = None,
        **kwargs
    ) -> Optional["Image.Image"]:
        """
        Renders a preview image for a specific page containing only the
        provided temporary highlights. The stored highlights are not modified.

        Args:
            page_index: Index of the page to render.
            temporary_highlights: List of highlight data dicts. Keys read here:
                'bbox', 'polygon', 'color', 'label', 'use_color_cycling',
                'element', 'include_attrs' and 'page_index' (defaults to the
                ``page_index`` argument).
            scale: Scale factor for rendering.
            labels: Whether to include a legend.
            legend_position: Position of the legend.
            render_ocr: Whether to render OCR text.
            resolution: Resolution for base page image rendering. When given,
                highlights are scaled by resolution / 72.
            **kwargs: Additional args for pdfplumber's to_image.

        Returns:
            PIL Image of the preview, or None if rendering fails.
        """
        if page_index < 0 or page_index >= len(self._pdf.pages):
            logger.error(f"Invalid page index {page_index} for render_preview.")
            return None

        page = self._pdf.pages[page_index]
        if resolution is not None:
            render_resolution = resolution
            draw_scale = resolution / 72.0  # Keep highlights aligned with the image
        else:
            render_resolution = scale * 72
            draw_scale = scale

        try:
            base_image = self._render_base_image(page, render_resolution, **kwargs)
            base_image = base_image.convert("RGB")  # Ensure consistent format

            # Convert temporary highlight dicts to Highlight objects
            preview_highlights = []
            for hl_data in temporary_highlights:
                # NOTE(review): this goes through the shared ColorManager; if
                # get_color records labels, previews may affect the persistent
                # legend - confirm against ColorManager.
                final_color = self._determine_highlight_color(
                    color_input=hl_data.get('color'),
                    label=hl_data.get('label'),
                    use_color_cycling=hl_data.get('use_color_cycling', False)
                )

                bbox = hl_data.get('bbox')
                polygon = hl_data.get('polygon')
                if not bbox:
                    if not polygon:
                        continue  # No geometry at all
                    if len(polygon) < 3:
                        logger.warning(f"Invalid polygon in preview highlight for page {page_index}; skipping.")
                        continue
                    # Polygon-only highlight: derive the bbox the Highlight record expects
                    x_coords = [p[0] for p in polygon]
                    y_coords = [p[1] for p in polygon]
                    bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))

                preview_highlights.append(Highlight(
                    page_index=hl_data.get('page_index', page_index),
                    bbox=bbox,
                    polygon=polygon,
                    color=final_color,
                    label=hl_data.get('label'),
                    attributes=self._extract_attributes(hl_data.get('element'), hl_data.get('include_attrs'))
                ))

            # Render only these highlights
            renderer = HighlightRenderer(page, base_image, preview_highlights, draw_scale, render_ocr)
            rendered_image = renderer.render()

            # Legend is built only from the temporary highlights
            final_image = rendered_image
            if labels:
                preview_labels = {h.label: h.color for h in preview_highlights if h.label}
                if preview_labels:
                    legend = create_legend(preview_labels)
                    final_image = merge_images_with_legend(rendered_image, legend, position=legend_position)

        except Exception as e:
            logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
            return None

        return final_image