natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,43 +1,46 @@
1
1
  """
2
2
  Centralized service for managing and rendering highlights in a PDF document.
3
3
  """
4
+
4
5
  import io
6
+ import logging # Added
5
7
  import os
6
- import logging # Added
7
8
  from dataclasses import dataclass, field
8
- from typing import List, Dict, Tuple, Optional, Any, Union
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
10
 
10
- from PIL import Image, ImageDraw, ImageFont
11
11
  from colour import Color
12
+ from PIL import Image, ImageDraw, ImageFont
12
13
 
13
14
  # Attempt to import Page for type hinting safely
14
15
  try:
15
16
  from .page import Page
16
17
  except ImportError:
17
- Page = Any # Fallback if circular import issue arises during type checking
18
+ Page = Any # Fallback if circular import issue arises during type checking
18
19
 
19
20
  # Import ColorManager and related utils
20
21
  from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
21
22
 
22
23
  # Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
23
- BORDER_ALPHA = 180 # Default alpha for highlight border
24
- DEFAULT_FALLBACK_COLOR = (255, 255, 0) # Yellow fallback (RGB only, alpha added by ColorManager)
24
+ BORDER_ALPHA = 180 # Default alpha for highlight border
25
+ DEFAULT_FALLBACK_COLOR = (255, 255, 0) # Yellow fallback (RGB only, alpha added by ColorManager)
25
26
 
26
27
  # Setup logger
27
28
  logger = logging.getLogger(__name__)
28
29
 
30
+
29
31
  @dataclass
30
32
  class Highlight:
31
33
  """
32
34
  Represents a single highlight to be drawn.
33
35
  Stores geometric data, color, label, and extracted attributes.
34
36
  """
37
+
35
38
  page_index: int
36
39
  bbox: Tuple[float, float, float, float]
37
- color: Tuple[int, int, int, int] # Final RGBA color determined by service
40
+ color: Tuple[int, int, int, int] # Final RGBA color determined by service
38
41
  label: Optional[str] = None
39
42
  polygon: Optional[List[Tuple[float, float]]] = None
40
- attributes: Dict[str, Any] = field(default_factory=dict) # Store extracted attribute values
43
+ attributes: Dict[str, Any] = field(default_factory=dict) # Store extracted attribute values
41
44
 
42
45
  @property
43
46
  def is_polygon(self) -> bool:
@@ -46,14 +49,9 @@ class Highlight:
46
49
 
47
50
  @property
48
51
  def border_color(self) -> Tuple[int, int, int, int]:
49
- """Calculate a slightly darker/more opaque border color."""
50
- # Use base color but increase alpha for border
51
- return (
52
- self.color[0],
53
- self.color[1],
54
- self.color[2],
55
- BORDER_ALPHA
56
- )
52
+ """Calculate a slightly darker/more opaque border color."""
53
+ # Use base color but increase alpha for border
54
+ return (self.color[0], self.color[1], self.color[2], BORDER_ALPHA)
57
55
 
58
56
 
59
57
  class HighlightRenderer:
@@ -61,6 +59,7 @@ class HighlightRenderer:
61
59
  Handles the drawing logic for highlights on a single page image.
62
60
  Instantiated by HighlightingService for each render request.
63
61
  """
62
+
64
63
  def __init__(
65
64
  self,
66
65
  page: Page,
@@ -69,13 +68,13 @@ class HighlightRenderer:
69
68
  scale: float,
70
69
  render_ocr: bool,
71
70
  ):
72
- self.page = page # Keep page reference for OCR rendering
73
- self.base_image = base_image.convert('RGBA') # Ensure RGBA
71
+ self.page = page # Keep page reference for OCR rendering
72
+ self.base_image = base_image.convert("RGBA") # Ensure RGBA
74
73
  self.highlights = highlights
75
74
  self.scale = scale
76
75
  self.render_ocr = render_ocr
77
76
  self.result_image = self.base_image.copy()
78
- self.vertex_size = max(3, int(2 * self.scale)) # Size of corner markers
77
+ self.vertex_size = max(3, int(2 * self.scale)) # Size of corner markers
79
78
 
80
79
  def render(self) -> Image.Image:
81
80
  """Executes the rendering process."""
@@ -88,7 +87,7 @@ class HighlightRenderer:
88
87
  """Draws all highlight shapes, borders, vertices, and attributes."""
89
88
  for highlight in self.highlights:
90
89
  # Create a transparent overlay for this single highlight
91
- overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
90
+ overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
92
91
  draw = ImageDraw.Draw(overlay)
93
92
 
94
93
  scaled_bbox = None
@@ -96,7 +95,9 @@ class HighlightRenderer:
96
95
  if highlight.is_polygon:
97
96
  scaled_polygon = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
98
97
  # Draw polygon fill and border
99
- draw.polygon(scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2)
98
+ draw.polygon(
99
+ scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
100
+ )
100
101
  self._draw_vertices(draw, scaled_polygon, highlight.border_color)
101
102
 
102
103
  # Calculate scaled bbox for attribute drawing
@@ -104,44 +105,63 @@ class HighlightRenderer:
104
105
  y_coords = [p[1] for p in scaled_polygon]
105
106
  scaled_bbox = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
106
107
 
107
- else: # Rectangle
108
+ else: # Rectangle
108
109
  x0, top, x1, bottom = highlight.bbox
109
- x0_s, top_s, x1_s, bottom_s = x0 * self.scale, top * self.scale, x1 * self.scale, bottom * self.scale
110
+ x0_s, top_s, x1_s, bottom_s = (
111
+ x0 * self.scale,
112
+ top * self.scale,
113
+ x1 * self.scale,
114
+ bottom * self.scale,
115
+ )
110
116
  scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
111
117
  # Draw rectangle fill and border
112
- draw.rectangle(scaled_bbox, fill=highlight.color, outline=highlight.border_color, width=2)
118
+ draw.rectangle(
119
+ scaled_bbox, fill=highlight.color, outline=highlight.border_color, width=2
120
+ )
113
121
 
114
122
  vertices = [(x0_s, top_s), (x1_s, top_s), (x1_s, bottom_s), (x0_s, bottom_s)]
115
123
  self._draw_vertices(draw, vertices, highlight.border_color)
116
124
 
117
125
  # Draw attributes if present on the highlight object
118
- if highlight.attributes and scaled_bbox: # Ensure bbox is calculated
126
+ if highlight.attributes and scaled_bbox: # Ensure bbox is calculated
119
127
  self._draw_attributes(draw, highlight.attributes, scaled_bbox)
120
128
 
121
129
  # Composite this highlight's overlay onto the result using alpha blending
122
130
  self.result_image = Image.alpha_composite(self.result_image, overlay)
123
131
 
124
- def _draw_vertices(self, draw: ImageDraw.Draw, vertices: List[Tuple[float, float]], color: Tuple[int, int, int, int]):
132
+ def _draw_vertices(
133
+ self,
134
+ draw: ImageDraw.Draw,
135
+ vertices: List[Tuple[float, float]],
136
+ color: Tuple[int, int, int, int],
137
+ ):
125
138
  """Draw small markers at each vertex."""
126
139
  for x, y in vertices:
127
140
  # Draw ellipse centered at vertex
128
141
  draw.ellipse(
129
- [x - self.vertex_size, y - self.vertex_size, x + self.vertex_size, y + self.vertex_size],
130
- fill=color # Use border color for vertices
142
+ [
143
+ x - self.vertex_size,
144
+ y - self.vertex_size,
145
+ x + self.vertex_size,
146
+ y + self.vertex_size,
147
+ ],
148
+ fill=color, # Use border color for vertices
131
149
  )
132
150
 
133
- def _draw_attributes(self, draw: ImageDraw.Draw, attributes: Dict[str, Any], bbox_scaled: List[float]):
151
+ def _draw_attributes(
152
+ self, draw: ImageDraw.Draw, attributes: Dict[str, Any], bbox_scaled: List[float]
153
+ ):
134
154
  """Draws attribute key-value pairs on the highlight."""
135
155
  try:
136
156
  # Slightly larger font, scaled
137
157
  font_size = max(10, int(8 * self.scale))
138
158
  # Prioritize monospace fonts for better alignment
139
- font = ImageFont.truetype("Arial.ttf", font_size) # Fallback sans-serif
159
+ font = ImageFont.truetype("Arial.ttf", font_size) # Fallback sans-serif
140
160
  except IOError:
141
161
  font = ImageFont.load_default()
142
- font_size = 10 # Reset size for default font
162
+ font_size = 10 # Reset size for default font
143
163
 
144
- line_height = font_size + int(4 * self.scale) # Scaled line spacing
164
+ line_height = font_size + int(4 * self.scale) # Scaled line spacing
145
165
  bg_padding = int(3 * self.scale)
146
166
  max_width = 0
147
167
  text_lines = []
@@ -149,17 +169,19 @@ class HighlightRenderer:
149
169
  # Format attribute lines
150
170
  for name, value in attributes.items():
151
171
  if isinstance(value, float):
152
- value_str = f"{value:.2f}" # Format floats
172
+ value_str = f"{value:.2f}" # Format floats
153
173
  else:
154
174
  value_str = str(value)
155
175
  line = f"{name}: {value_str}"
156
176
  text_lines.append(line)
157
177
  try:
158
- # Calculate max width for background box
159
- max_width = max(max_width, draw.textlength(line, font=font))
160
- except AttributeError: pass # Ignore if textlength not available
178
+ # Calculate max width for background box
179
+ max_width = max(max_width, draw.textlength(line, font=font))
180
+ except AttributeError:
181
+ pass # Ignore if textlength not available
161
182
 
162
- if not text_lines: return # Nothing to draw
183
+ if not text_lines:
184
+ return # Nothing to draw
163
185
 
164
186
  total_height = line_height * len(text_lines)
165
187
 
@@ -175,8 +197,8 @@ class HighlightRenderer:
175
197
  draw.rectangle(
176
198
  [bg_x0, bg_y0, bg_x1, bg_y1],
177
199
  fill=(255, 255, 255, 240),
178
- outline=(0, 0, 0, 180), # Light black outline
179
- width=1
200
+ outline=(0, 0, 0, 180), # Light black outline
201
+ width=1,
180
202
  )
181
203
 
182
204
  # Draw text lines (black)
@@ -190,21 +212,25 @@ class HighlightRenderer:
190
212
  # Use the page reference to get OCR elements
191
213
  try:
192
214
  # Try finding first, then extracting if necessary
193
- ocr_elements = self.page.find_all('text[source=ocr]')
215
+ ocr_elements = self.page.find_all("text[source=ocr]")
194
216
  if not ocr_elements:
195
- # Don't run full OCR here, just extract if already run
196
- ocr_elements = [el for el in self.page.words if getattr(el, 'source', None) == 'ocr']
197
- # Alternative: self.page.extract_ocr_elements() - but might be slow
217
+ # Don't run full OCR here, just extract if already run
218
+ ocr_elements = [
219
+ el for el in self.page.words if getattr(el, "source", None) == "ocr"
220
+ ]
221
+ # Alternative: self.page.extract_ocr_elements() - but might be slow
198
222
 
199
223
  except Exception as e:
200
- logger.warning(f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True)
201
- return # Don't modify image if OCR elements aren't available
224
+ logger.warning(
225
+ f"Could not get OCR elements for page {self.page.number}: {e}", exc_info=True
226
+ )
227
+ return # Don't modify image if OCR elements aren't available
202
228
 
203
229
  if not ocr_elements:
204
230
  logger.debug(f"No OCR elements found for page {self.page.number} to render.")
205
231
  return
206
232
 
207
- overlay = Image.new('RGBA', self.base_image.size, (0, 0, 0, 0))
233
+ overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
208
234
  draw = ImageDraw.Draw(overlay)
209
235
 
210
236
  # Find a suitable font
@@ -212,22 +238,28 @@ class HighlightRenderer:
212
238
  default_font = ImageFont.load_default()
213
239
  common_fonts = ["DejaVuSans.ttf", "Arial.ttf", "Helvetica.ttf", "FreeSans.ttf"]
214
240
  for fname in common_fonts:
215
- try:
216
- ImageFont.truetype(fname, 10) # Test load
217
- font_path = fname
218
- break
219
- except IOError:
220
- continue
241
+ try:
242
+ ImageFont.truetype(fname, 10) # Test load
243
+ font_path = fname
244
+ break
245
+ except IOError:
246
+ continue
221
247
 
222
248
  for element in ocr_elements:
223
249
  x0, top, x1, bottom = element.bbox
224
- x0_s, top_s, x1_s, bottom_s = x0 * self.scale, top * self.scale, x1 * self.scale, bottom * self.scale
250
+ x0_s, top_s, x1_s, bottom_s = (
251
+ x0 * self.scale,
252
+ top * self.scale,
253
+ x1 * self.scale,
254
+ bottom * self.scale,
255
+ )
225
256
  box_w, box_h = x1_s - x0_s, bottom_s - top_s
226
257
 
227
- if box_h <= 0: continue # Skip zero-height boxes
258
+ if box_h <= 0:
259
+ continue # Skip zero-height boxes
228
260
 
229
261
  # --- Font Size Calculation ---
230
- font_size = max(9, int(box_h * 0.85)) # Min size 9, 85% of box height
262
+ font_size = max(9, int(box_h * 0.85)) # Min size 9, 85% of box height
231
263
 
232
264
  try:
233
265
  sized_font = ImageFont.truetype(font_path, font_size) if font_path else default_font
@@ -236,33 +268,36 @@ class HighlightRenderer:
236
268
 
237
269
  # --- Adjust Font Size if Text Overflows ---
238
270
  try:
239
- text_w = draw.textlength(element.text, font=sized_font)
240
- if text_w > box_w * 1.1: # Allow 10% overflow
241
- ratio = max(0.5, (box_w * 1.0) / text_w) # Don't shrink below 50%
242
- font_size = max(9, int(font_size * ratio))
243
- if font_path:
244
- try: sized_font = ImageFont.truetype(font_path, font_size)
245
- except IOError: pass # Keep previous if error
246
- except AttributeError: pass # Skip adjustment if textlength fails
271
+ text_w = draw.textlength(element.text, font=sized_font)
272
+ if text_w > box_w * 1.1: # Allow 10% overflow
273
+ ratio = max(0.5, (box_w * 1.0) / text_w) # Don't shrink below 50%
274
+ font_size = max(9, int(font_size * ratio))
275
+ if font_path:
276
+ try:
277
+ sized_font = ImageFont.truetype(font_path, font_size)
278
+ except IOError:
279
+ pass # Keep previous if error
280
+ except AttributeError:
281
+ pass # Skip adjustment if textlength fails
247
282
 
248
283
  # --- Draw Background and Text ---
249
- padding = max(1, int(font_size * 0.05)) # Minimal padding
284
+ padding = max(1, int(font_size * 0.05)) # Minimal padding
250
285
  draw.rectangle(
251
286
  [x0_s - padding, top_s - padding, x1_s + padding, bottom_s + padding],
252
- fill=(255, 255, 255, 230) # Highly transparent white background
287
+ fill=(255, 255, 255, 230), # Highly transparent white background
253
288
  )
254
289
 
255
290
  # Calculate text position (centered vertically, slightly offset from left)
256
291
  try:
257
- if hasattr(sized_font, "getbbox"): # Modern PIL
258
- _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
259
- text_h = text_bottom_offset - text_top_offset
260
- else: # Older PIL approximation
261
- text_h = font_size
262
- text_y = top_s + (box_h - text_h) / 2
263
- # Adjust for vertical offset in some fonts
264
- text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
265
- text_x = x0_s + padding # Start near left edge with padding
292
+ if hasattr(sized_font, "getbbox"): # Modern PIL
293
+ _, text_top_offset, _, text_bottom_offset = sized_font.getbbox(element.text)
294
+ text_h = text_bottom_offset - text_top_offset
295
+ else: # Older PIL approximation
296
+ text_h = font_size
297
+ text_y = top_s + (box_h - text_h) / 2
298
+ # Adjust for vertical offset in some fonts
299
+ text_y -= text_top_offset if hasattr(sized_font, "getbbox") else 0
300
+ text_x = x0_s + padding # Start near left edge with padding
266
301
 
267
302
  except Exception:
268
303
  # Fallback positioning
@@ -279,18 +314,18 @@ class HighlightingService:
279
314
  Central service to manage highlight data and orchestrate rendering.
280
315
  Holds the state of all highlights across the document.
281
316
  """
317
+
282
318
  def __init__(self, pdf_object):
283
- self._pdf = pdf_object # Reference to the parent PDF object
319
+ self._pdf = pdf_object # Reference to the parent PDF object
284
320
  self._highlights_by_page: Dict[int, List[Highlight]] = {}
285
- self._color_manager = ColorManager() # Instantiate the color manager
321
+ self._color_manager = ColorManager() # Instantiate the color manager
286
322
  logger.info("HighlightingService initialized with ColorManager.")
287
323
 
288
324
  # Removed _get_next_color - logic moved to ColorManager
289
325
  # Removed _color_cycle, _labels_colors - managed by ColorManager
290
326
 
291
327
  def _process_color_input(
292
- self,
293
- color_input: Optional[Union[Tuple, str]]
328
+ self, color_input: Optional[Union[Tuple, str]]
294
329
  ) -> Optional[Tuple[int, int, int, int]]:
295
330
  """
296
331
  Parses various color input formats into a standard RGBA tuple (0-255).
@@ -303,32 +338,37 @@ class HighlightingService:
303
338
  # Convert float values (0.0-1.0) to int (0-255)
304
339
  processed = []
305
340
  all_float = all(isinstance(c, float) and 0.0 <= c <= 1.0 for c in color_input[:3])
306
-
341
+
307
342
  for i, c in enumerate(color_input):
308
- if isinstance(c, float):
309
- val = int(c * 255) if (i < 3 and all_float) or (i==3 and 0.0 <= c <= 1.0) else int(c)
310
- elif isinstance(c, int):
311
- val = c
312
- else:
313
- logger.warning(f"Invalid color component type: {c} in {color_input}")
314
- return None # Invalid type
315
- processed.append(max(0, min(255, val))) # Clamp to 0-255
343
+ if isinstance(c, float):
344
+ val = (
345
+ int(c * 255)
346
+ if (i < 3 and all_float) or (i == 3 and 0.0 <= c <= 1.0)
347
+ else int(c)
348
+ )
349
+ elif isinstance(c, int):
350
+ val = c
351
+ else:
352
+ logger.warning(f"Invalid color component type: {c} in {color_input}")
353
+ return None # Invalid type
354
+ processed.append(max(0, min(255, val))) # Clamp to 0-255
316
355
 
317
356
  # Check length and add default alpha if needed
318
357
  if len(processed) == 3:
319
- # Use alpha from ColorManager instance
320
- processed.append(self._color_manager._alpha)
321
- return tuple(processed)
358
+ # Use alpha from ColorManager instance
359
+ processed.append(self._color_manager._alpha)
360
+ return tuple(processed)
322
361
  elif len(processed) == 4:
323
- return tuple(processed)
362
+ return tuple(processed)
324
363
  else:
325
- logger.warning(f"Invalid color tuple length: {color_input}")
326
- return None # Invalid length
364
+ logger.warning(f"Invalid color tuple length: {color_input}")
365
+ return None # Invalid length
327
366
 
328
367
  elif isinstance(color_input, str):
329
368
  try:
330
369
  # Convert color name/hex string to RGB tuple (0.0-1.0 floats)
331
- from colour import Color # Import here if not at top
370
+ from colour import Color # Import here if not at top
371
+
332
372
  color_obj = Color(color_input)
333
373
  # Convert floats (0.0-1.0) to integers (0-255)
334
374
  r = int(color_obj.red * 255)
@@ -342,27 +382,27 @@ class HighlightingService:
342
382
  rgba = (r, g, b, self._color_manager._alpha)
343
383
  return rgba
344
384
  except ImportError:
345
- logger.error("Color utility class not found. Cannot process string colors.")
346
- return None
385
+ logger.error("Color utility class not found. Cannot process string colors.")
386
+ return None
347
387
  except ValueError:
348
- logger.warning(f"Invalid color string: '{color_input}'")
349
- return None
388
+ logger.warning(f"Invalid color string: '{color_input}'")
389
+ return None
350
390
  except Exception as e:
351
- logger.error(f"Error processing color string '{color_input}': {e}")
352
- return None
391
+ logger.error(f"Error processing color string '{color_input}': {e}")
392
+ return None
353
393
  else:
354
- logger.warning(f"Invalid color input type: {type(color_input)}")
355
- return None
394
+ logger.warning(f"Invalid color input type: {type(color_input)}")
395
+ return None
356
396
 
357
397
  def _determine_highlight_color(
358
398
  self,
359
399
  color_input: Optional[Union[Tuple, str]] = None,
360
400
  label: Optional[str] = None,
361
- use_color_cycling: bool = False
401
+ use_color_cycling: bool = False,
362
402
  ) -> Tuple[int, int, int, int]:
363
403
  """
364
404
  Determines the final RGBA color for a highlight using the ColorManager.
365
-
405
+
366
406
  Args:
367
407
  color_input: User-provided color (tuple or string).
368
408
  label: Label associated with the highlight.
@@ -383,48 +423,63 @@ class HighlightingService:
383
423
  def add(
384
424
  self,
385
425
  page_index: int,
386
- bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
426
+ bbox: Union[Tuple[float, float, float, float], Any], # Relax input type hint
387
427
  color: Optional[Union[Tuple, str]] = None,
388
428
  label: Optional[str] = None,
389
429
  use_color_cycling: bool = False,
390
430
  element: Optional[Any] = None,
391
431
  include_attrs: Optional[List[str]] = None,
392
- existing: str = 'append'
432
+ existing: str = "append",
393
433
  ):
394
434
  """Adds a rectangular highlight."""
395
-
435
+
396
436
  processed_bbox: Tuple[float, float, float, float]
397
437
  # Check if bbox is an object with expected attributes (likely a Region)
398
438
  # Assuming Region object has x0, top, x1, bottom attributes based on error context
399
- if (hasattr(bbox, 'x0') and hasattr(bbox, 'top') and
400
- hasattr(bbox, 'x1') and hasattr(bbox, 'bottom')):
401
- try:
439
+ if (
440
+ hasattr(bbox, "x0")
441
+ and hasattr(bbox, "top")
442
+ and hasattr(bbox, "x1")
443
+ and hasattr(bbox, "bottom")
444
+ ):
445
+ try:
402
446
  # Ensure attributes are numeric before creating tuple
403
- processed_bbox = (float(bbox.x0), float(bbox.top), float(bbox.x1), float(bbox.bottom))
404
- except (ValueError, TypeError):
405
- logger.error(f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values.")
406
- return
447
+ processed_bbox = (
448
+ float(bbox.x0),
449
+ float(bbox.top),
450
+ float(bbox.x1),
451
+ float(bbox.bottom),
452
+ )
453
+ except (ValueError, TypeError):
454
+ logger.error(
455
+ f"Invalid attribute types in bbox object for page {page_index}: {bbox}. Expected numeric values."
456
+ )
457
+ return
407
458
  elif isinstance(bbox, (list, tuple)) and len(bbox) == 4:
408
- try:
409
- # Ensure elements are numeric and convert to tuple
410
- processed_bbox = tuple(float(v) for v in bbox)
411
- except (ValueError, TypeError):
412
- logger.error(f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values.")
413
- return
459
+ try:
460
+ # Ensure elements are numeric and convert to tuple
461
+ processed_bbox = tuple(float(v) for v in bbox)
462
+ except (ValueError, TypeError):
463
+ logger.error(
464
+ f"Invalid values in bbox sequence for page {page_index}: {bbox}. Expected numeric values."
465
+ )
466
+ return
414
467
  else:
415
- logger.error(f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object.")
416
- return # Don't proceed if bbox is invalid
417
-
468
+ logger.error(
469
+ f"Invalid bbox type or structure provided for page {page_index}: {type(bbox)} - {bbox}. Expected tuple/list of 4 numbers or Region-like object."
470
+ )
471
+ return # Don't proceed if bbox is invalid
472
+
418
473
  self._add_internal(
419
474
  page_index=page_index,
420
- bbox=processed_bbox, # Use the processed tuple
475
+ bbox=processed_bbox, # Use the processed tuple
421
476
  polygon=None,
422
477
  color_input=color,
423
478
  label=label,
424
479
  use_color_cycling=use_color_cycling,
425
480
  element=element,
426
481
  include_attrs=include_attrs,
427
- existing=existing
482
+ existing=existing,
428
483
  )
429
484
 
430
485
  def add_polygon(
@@ -436,7 +491,7 @@ class HighlightingService:
436
491
  use_color_cycling: bool = False,
437
492
  element: Optional[Any] = None,
438
493
  include_attrs: Optional[List[str]] = None,
439
- existing: str = 'append'
494
+ existing: str = "append",
440
495
  ):
441
496
  """Adds a polygonal highlight."""
442
497
  # Calculate bounding box from polygon for internal storage
@@ -447,7 +502,7 @@ class HighlightingService:
447
502
  else:
448
503
  logger.warning(f"Invalid polygon provided for page {page_index}. Cannot add highlight.")
449
504
  return
450
-
505
+
451
506
  self._add_internal(
452
507
  page_index=page_index,
453
508
  bbox=bbox,
@@ -457,7 +512,7 @@ class HighlightingService:
457
512
  use_color_cycling=use_color_cycling,
458
513
  element=element,
459
514
  include_attrs=include_attrs,
460
- existing=existing
515
+ existing=existing,
461
516
  )
462
517
 
463
518
  def _add_internal(
@@ -470,34 +525,32 @@ class HighlightingService:
470
525
  use_color_cycling: bool,
471
526
  element: Optional[Any],
472
527
  include_attrs: Optional[List[str]],
473
- existing: str
528
+ existing: str,
474
529
  ):
475
530
  """Internal method to create and store a Highlight object."""
476
531
  if page_index < 0 or page_index >= len(self._pdf.pages):
477
- logger.error(f"Invalid page index {page_index}. Cannot add highlight.")
478
- return
532
+ logger.error(f"Invalid page index {page_index}. Cannot add highlight.")
533
+ return
479
534
 
480
535
  # Handle 'replace' logic - clear highlights for this page *before* adding new one
481
- if existing == 'replace':
536
+ if existing == "replace":
482
537
  self.clear_page(page_index)
483
538
 
484
539
  # Determine the final color using the ColorManager
485
540
  final_color = self._determine_highlight_color(
486
- color_input=color_input,
487
- label=label,
488
- use_color_cycling=use_color_cycling
541
+ color_input=color_input, label=label, use_color_cycling=use_color_cycling
489
542
  )
490
543
 
491
544
  # Extract attributes from the element if requested
492
545
  attributes_to_draw = {}
493
546
  if element and include_attrs:
494
- for attr_name in include_attrs:
495
- try:
496
- attr_value = getattr(element, attr_name, None)
497
- if attr_value is not None:
498
- attributes_to_draw[attr_name] = attr_value
499
- except AttributeError:
500
- logger.warning(f"Attribute '{attr_name}' not found on element {element}")
547
+ for attr_name in include_attrs:
548
+ try:
549
+ attr_value = getattr(element, attr_name, None)
550
+ if attr_value is not None:
551
+ attributes_to_draw[attr_name] = attr_value
552
+ except AttributeError:
553
+ logger.warning(f"Attribute '{attr_name}' not found on element {element}")
501
554
 
502
555
  # Create the highlight data object
503
556
  highlight = Highlight(
@@ -506,7 +559,7 @@ class HighlightingService:
506
559
  color=final_color,
507
560
  label=label,
508
561
  polygon=polygon,
509
- attributes=attributes_to_draw
562
+ attributes=attributes_to_draw,
510
563
  )
511
564
 
512
565
  # Add to the list for the specific page
@@ -542,10 +595,10 @@ class HighlightingService:
542
595
  page_index: int,
543
596
  scale: float = 2.0,
544
597
  labels: bool = True,
545
- legend_position: str = 'right',
598
+ legend_position: str = "right",
546
599
  render_ocr: bool = False,
547
600
  resolution: Optional[float] = None,
548
- **kwargs # Pass other args to pdfplumber.page.to_image if needed
601
+ **kwargs, # Pass other args to pdfplumber.page.to_image if needed
549
602
  ) -> Optional[Image.Image]:
550
603
  """
551
604
  Renders a specific page with its highlights.
@@ -569,26 +622,30 @@ class HighlightingService:
569
622
  return None
570
623
 
571
624
  page = self._pdf[page_index]
572
- highlights_on_page = self.get_highlights_for_page(page_index) # This list will be empty if clear_page was called
625
+ highlights_on_page = self.get_highlights_for_page(
626
+ page_index
627
+ ) # This list will be empty if clear_page was called
573
628
 
574
- # --- Get Base Image ---
629
+ # --- Get Base Image ---
575
630
  try:
576
631
  render_resolution = resolution if resolution is not None else scale * 72
577
632
  img_object = page._page.to_image(resolution=render_resolution, **kwargs)
578
633
  base_image = img_object.annotated
579
634
  if not isinstance(base_image, Image.Image):
580
- png_data = img_object._repr_png_()
581
- if png_data:
582
- base_image = Image.open(io.BytesIO(png_data)).convert('RGB')
583
- else:
584
- raise ValueError("Could not extract base PIL image from pdfplumber.")
585
- base_image = base_image.convert('RGBA')
586
- logger.debug(f"Base image for page {page_index} rendered with resolution {render_resolution}.")
635
+ png_data = img_object._repr_png_()
636
+ if png_data:
637
+ base_image = Image.open(io.BytesIO(png_data)).convert("RGB")
638
+ else:
639
+ raise ValueError("Could not extract base PIL image from pdfplumber.")
640
+ base_image = base_image.convert("RGBA")
641
+ logger.debug(
642
+ f"Base image for page {page_index} rendered with resolution {render_resolution}."
643
+ )
587
644
  except Exception as e:
588
645
  logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
589
646
  return None
590
647
 
591
- # --- Render Highlights ---
648
+ # --- Render Highlights ---
592
649
  rendered_image: Image.Image
593
650
  if highlights_on_page:
594
651
  renderer = HighlightRenderer(
@@ -600,32 +657,36 @@ class HighlightingService:
600
657
  )
601
658
  rendered_image = renderer.render()
602
659
  else:
603
- if render_ocr:
604
- # Still render OCR even if no highlights
605
- renderer = HighlightRenderer(page, base_image, [], scale, True)
606
- rendered_image = renderer.render()
607
- else:
608
- rendered_image = base_image # No highlights, no OCR requested
609
-
610
- # --- Add Legend (Based ONLY on this page's highlights) ---
660
+ if render_ocr:
661
+ # Still render OCR even if no highlights
662
+ renderer = HighlightRenderer(page, base_image, [], scale, True)
663
+ rendered_image = renderer.render()
664
+ else:
665
+ rendered_image = base_image # No highlights, no OCR requested
666
+
667
+ # --- Add Legend (Based ONLY on this page's highlights) ---
611
668
  if labels:
612
669
  # CHANGE: Create label_colors map only from highlights_on_page
613
670
  labels_colors_on_page: Dict[str, Tuple[int, int, int, int]] = {}
614
671
  for hl in highlights_on_page:
615
672
  if hl.label and hl.label not in labels_colors_on_page:
616
673
  labels_colors_on_page[hl.label] = hl.color
617
-
618
- if labels_colors_on_page: # Only add legend if there are labels on this page
674
+
675
+ if labels_colors_on_page: # Only add legend if there are labels on this page
619
676
  legend = create_legend(labels_colors_on_page)
620
- if legend: # Ensure create_legend didn't return None
621
- rendered_image = merge_images_with_legend(rendered_image, legend, legend_position)
622
- logger.debug(f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}.")
677
+ if legend: # Ensure create_legend didn't return None
678
+ rendered_image = merge_images_with_legend(
679
+ rendered_image, legend, legend_position
680
+ )
681
+ logger.debug(
682
+ f"Added legend with {len(labels_colors_on_page)} labels for page {page_index}."
683
+ )
623
684
  else:
624
- logger.debug(f"Legend creation returned None for page {page_index}.")
685
+ logger.debug(f"Legend creation returned None for page {page_index}.")
625
686
  else:
626
- logger.debug(f"No labels found on page {page_index}, skipping legend.")
627
-
628
- return rendered_image
687
+ logger.debug(f"No labels found on page {page_index}, skipping legend.")
688
+
689
+ return rendered_image
629
690
 
630
691
  def render_preview(
631
692
  self,
@@ -633,10 +694,10 @@ class HighlightingService:
633
694
  temporary_highlights: List[Dict],
634
695
  scale: float = 2.0,
635
696
  labels: bool = True,
636
- legend_position: str = 'right',
697
+ legend_position: str = "right",
637
698
  render_ocr: bool = False,
638
699
  resolution: Optional[float] = None,
639
- **kwargs
700
+ **kwargs,
640
701
  ) -> Optional[Image.Image]:
641
702
  """
642
703
  Renders a preview image for a specific page containing only the
@@ -665,11 +726,16 @@ class HighlightingService:
665
726
  try:
666
727
  # Get base image from pdfplumber using the Page object's underlying _page
667
728
  img_object = page._page.to_image(resolution=render_resolution, **kwargs)
668
- base_image = img_object.annotated if hasattr(img_object, 'annotated') else img_object._repr_png_()
729
+ base_image = (
730
+ img_object.annotated
731
+ if hasattr(img_object, "annotated")
732
+ else img_object._repr_png_()
733
+ )
669
734
  if isinstance(base_image, bytes):
670
- from io import BytesIO
671
- base_image = Image.open(BytesIO(base_image))
672
- base_image = base_image.convert("RGB") # Ensure consistent format
735
+ from io import BytesIO
736
+
737
+ base_image = Image.open(BytesIO(base_image))
738
+ base_image = base_image.convert("RGB") # Ensure consistent format
673
739
 
674
740
  # Convert temporary highlight dicts to Highlight objects
675
741
  # Note: Colors/labels should be determined *here* for temporary preview
@@ -677,15 +743,15 @@ class HighlightingService:
677
743
  for hl_data in temporary_highlights:
678
744
  # Determine the final color using the service logic
679
745
  final_color = self._determine_highlight_color(
680
- color_input=hl_data.get('color'),
681
- label=hl_data.get('label'),
682
- use_color_cycling=hl_data.get('use_color_cycling', False)
746
+ color_input=hl_data.get("color"),
747
+ label=hl_data.get("label"),
748
+ use_color_cycling=hl_data.get("use_color_cycling", False),
683
749
  )
684
750
 
685
751
  # Extract potential attributes to draw
686
752
  attrs_to_draw = {}
687
- element = hl_data.get('element')
688
- include_attrs = hl_data.get('include_attrs')
753
+ element = hl_data.get("element")
754
+ include_attrs = hl_data.get("include_attrs")
689
755
  if element and include_attrs:
690
756
  for attr_name in include_attrs:
691
757
  try:
@@ -693,18 +759,22 @@ class HighlightingService:
693
759
  if attr_value is not None:
694
760
  attrs_to_draw[attr_name] = attr_value
695
761
  except AttributeError:
696
- logger.warning(f"Attribute '{attr_name}' not found on element {element}")
762
+ logger.warning(
763
+ f"Attribute '{attr_name}' not found on element {element}"
764
+ )
697
765
 
698
766
  # Add highlight if geometry exists
699
- if hl_data.get('bbox') or hl_data.get('polygon'):
700
- preview_highlights.append(Highlight(
701
- page_index=hl_data['page_index'],
702
- bbox=hl_data.get('bbox'),
703
- polygon=hl_data.get('polygon'),
704
- color=final_color, # Use the determined color
705
- label=hl_data.get('label'),
706
- attributes=attrs_to_draw
707
- ))
767
+ if hl_data.get("bbox") or hl_data.get("polygon"):
768
+ preview_highlights.append(
769
+ Highlight(
770
+ page_index=hl_data["page_index"],
771
+ bbox=hl_data.get("bbox"),
772
+ polygon=hl_data.get("polygon"),
773
+ color=final_color, # Use the determined color
774
+ label=hl_data.get("label"),
775
+ attributes=attrs_to_draw,
776
+ )
777
+ )
708
778
 
709
779
  # Render only these highlights
710
780
  renderer = HighlightRenderer(page, base_image, preview_highlights, scale, render_ocr)
@@ -716,9 +786,11 @@ class HighlightingService:
716
786
  preview_labels = {h.label: h.color for h in preview_highlights if h.label}
717
787
  if preview_labels:
718
788
  legend = create_legend(preview_labels)
719
- final_image = merge_images_with_legend(rendered_image, legend, position=legend_position)
789
+ final_image = merge_images_with_legend(
790
+ rendered_image, legend, position=legend_position
791
+ )
720
792
  else:
721
- final_image = rendered_image # No legend needed
793
+ final_image = rendered_image # No legend needed
722
794
  else:
723
795
  final_image = rendered_image
724
796
 
@@ -726,4 +798,4 @@ class HighlightingService:
726
798
  logger.error(f"Error rendering preview for page {page_index}: {e}", exc_info=True)
727
799
  return None
728
800
 
729
- return final_image
801
+ return final_image