natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,761 @@
1
+ """
2
+ Base Element class for natural-pdf.
3
+ """
4
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union, Tuple
5
+ from PIL import Image
6
+
7
+ if TYPE_CHECKING:
8
+ from natural_pdf.core.page import Page
9
+ from natural_pdf.elements.region import Region
10
+ from natural_pdf.elements.base import Element, DirectionalMixin
11
+
12
+
13
class DirectionalMixin:
    """
    Mixin class providing directional methods for both Element and Region classes.

    The methods here read ``self.x0``, ``self.top``, ``self.x1``, ``self.bottom`` and
    ``self.page`` (its ``width``, ``height`` and ``find_all``), so the host class
    must provide those attributes.
    """

    def _direction(self, direction: str, size: Optional[float] = None,
                   cross_size: str = "full", include_element: bool = False,
                   until: Optional[str] = None, include_endpoint: bool = True,
                   **kwargs) -> Union['Element', 'Region']:
        """
        Protected helper method to create a region in a specified direction relative to this element/region.

        Args:
            direction: 'left', 'right', 'above', or 'below'
            size: Size in the primary direction (width for horizontal, height for vertical).
                When None the region extends to the page edge.
            cross_size: Size in the cross direction ('full' or 'element'). Any value other
                than 'full' is handled as 'element'.
            include_element: Whether to include this element/region's area in the result
            until: Optional selector string to specify a boundary element
            include_endpoint: Whether to include the boundary element found by 'until'
            **kwargs: Additional parameters for the 'until' selector search

        Returns:
            Region object. ``source_element`` and ``includes_source`` are set on it, and
            ``boundary_element`` is set when an 'until' match was found.

        Raises:
            ValueError: If ``direction`` is not one of the four supported values.
        """
        # Reject unknown directions up front: without this check any other string
        # would fall through both flags below and silently behave like 'above'.
        if direction not in ('left', 'right', 'above', 'below'):
            raise ValueError(
                f"Invalid direction {direction!r}; expected 'left', 'right', 'above' or 'below'"
            )

        is_horizontal = direction in ('left', 'right')
        is_positive = direction in ('right', 'below')  # right/below are positive directions
        pixel_offset = 1  # Offset for excluding elements/endpoints

        # 1. Determine initial boundaries based on direction and include_element
        if is_horizontal:
            # Initial cross-boundaries (vertical)
            y0 = 0 if cross_size == "full" else self.top
            y1 = self.page.height if cross_size == "full" else self.bottom

            # Initial primary boundaries (horizontal)
            if is_positive:  # right
                x0_initial = self.x0 if include_element else self.x1 + pixel_offset
                x1_initial = self.x1  # This edge moves
            else:  # left
                x0_initial = self.x0  # This edge moves
                x1_initial = self.x1 if include_element else self.x0 - pixel_offset
        else:  # Vertical
            # Initial cross-boundaries (horizontal)
            x0 = 0 if cross_size == "full" else self.x0
            x1 = self.page.width if cross_size == "full" else self.x1

            # Initial primary boundaries (vertical)
            if is_positive:  # below
                y0_initial = self.top if include_element else self.bottom + pixel_offset
                y1_initial = self.bottom  # This edge moves
            else:  # above
                y0_initial = self.top  # This edge moves
                y1_initial = self.bottom if include_element else self.top - pixel_offset

        # 2. Calculate the final primary boundary, considering 'size' or page limits.
        #    The moving edge is clamped to the page (0 .. page width/height).
        if is_horizontal:
            if is_positive:  # right
                x1_final = min(self.page.width,
                               x1_initial + (size if size is not None else (self.page.width - x1_initial)))
                x0_final = x0_initial
            else:  # left
                x0_final = max(0, x0_initial - (size if size is not None else x0_initial))
                x1_final = x1_initial
        else:  # Vertical
            if is_positive:  # below
                y1_final = min(self.page.height,
                               y1_initial + (size if size is not None else (self.page.height - y1_initial)))
                y0_final = y0_initial
            else:  # above
                y0_final = max(0, y0_initial - (size if size is not None else y0_initial))
                y1_final = y1_initial

        # 3. Handle 'until' selector if provided
        target = None
        if until:
            all_matches = self.page.find_all(until, **kwargs)
            matches_in_direction = []

            # Keep only matches lying entirely on the requested side, closest first
            if direction == 'above':
                matches_in_direction = [m for m in all_matches if m.bottom <= self.top]
                matches_in_direction.sort(key=lambda e: e.bottom, reverse=True)
            elif direction == 'below':
                matches_in_direction = [m for m in all_matches if m.top >= self.bottom]
                matches_in_direction.sort(key=lambda e: e.top)
            elif direction == 'left':
                matches_in_direction = [m for m in all_matches if m.x1 <= self.x0]
                matches_in_direction.sort(key=lambda e: e.x1, reverse=True)
            elif direction == 'right':
                matches_in_direction = [m for m in all_matches if m.x0 >= self.x1]
                matches_in_direction.sort(key=lambda e: e.x0)

            if matches_in_direction:
                target = matches_in_direction[0]

                # Adjust the primary boundary based on the target; this overrides
                # the 'size'-based edge computed in step 2.
                if is_horizontal:
                    if is_positive:  # right
                        x1_final = target.x1 if include_endpoint else target.x0 - pixel_offset
                    else:  # left
                        x0_final = target.x0 if include_endpoint else target.x1 + pixel_offset
                else:  # Vertical
                    if is_positive:  # below
                        y1_final = target.bottom if include_endpoint else target.top - pixel_offset
                    else:  # above
                        y0_final = target.top if include_endpoint else target.bottom + pixel_offset

                # Adjust cross boundaries if cross_size is 'element'
                if cross_size == "element":
                    if is_horizontal:  # Adjust y0, y1
                        target_y0 = target.top if include_endpoint else target.bottom  # Use opposite boundary if excluding
                        target_y1 = target.bottom if include_endpoint else target.top
                        y0 = min(y0, target_y0)
                        y1 = max(y1, target_y1)
                    else:  # Adjust x0, x1
                        target_x0 = target.x0 if include_endpoint else target.x1  # Use opposite boundary if excluding
                        target_x1 = target.x1 if include_endpoint else target.x0
                        x0 = min(x0, target_x0)
                        x1 = max(x1, target_x1)

        # 4. Finalize bbox coordinates
        if is_horizontal:
            bbox = (x0_final, y0, x1_final, y1)
        else:
            bbox = (x0, y0_final, x1, y1_final)

        # Ensure valid coordinates (x0 <= x1, y0 <= y1)
        final_bbox = (
            min(bbox[0], bbox[2]),
            min(bbox[1], bbox[3]),
            max(bbox[0], bbox[2]),
            max(bbox[1], bbox[3]),
        )

        # 5. Create and return the Region (imported here rather than at module level)
        from natural_pdf.elements.region import Region
        result = Region(self.page, final_bbox)
        result.source_element = self
        result.includes_source = include_element
        # Store the boundary element if one was found. Compare against None so the
        # decision does not depend on the matched object's truthiness.
        if target is not None:
            result.boundary_element = target

        return result

    def above(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region above this element/region.

        Args:
            height: Height of the region above, in points
            width: Width mode - "full" for full page width or "element" for element width
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify an upper boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters, forwarded to the 'until' selector search

        Returns:
            Region object representing the area above
        """
        return self._direction(
            direction='above',
            size=height,
            cross_size=width,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def below(self, height: Optional[float] = None, width: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region below this element/region.

        Args:
            height: Height of the region below, in points
            width: Width mode - "full" for full page width or "element" for element width
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a lower boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters, forwarded to the 'until' selector search

        Returns:
            Region object representing the area below
        """
        return self._direction(
            direction='below',
            size=height,
            cross_size=width,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def left(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
             until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region to the left of this element/region.

        Args:
            width: Width of the region to the left, in points
            height: Height mode - "full" for full page height or "element" for element height
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a left boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters, forwarded to the 'until' selector search

        Returns:
            Region object representing the area to the left
        """
        return self._direction(
            direction='left',
            size=width,
            cross_size=height,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )

    def right(self, width: Optional[float] = None, height: str = "full", include_element: bool = False,
              until: Optional[str] = None, include_endpoint: bool = True, **kwargs) -> Union['Element', 'Region']:
        """
        Select region to the right of this element/region.

        Args:
            width: Width of the region to the right, in points
            height: Height mode - "full" for full page height or "element" for element height
            include_element: Whether to include this element/region in the result (default: False)
            until: Optional selector string to specify a right boundary element
            include_endpoint: Whether to include the boundary element in the region (default: True)
            **kwargs: Additional parameters, forwarded to the 'until' selector search

        Returns:
            Region object representing the area to the right
        """
        return self._direction(
            direction='right',
            size=width,
            cross_size=height,
            include_element=include_element,
            until=until,
            include_endpoint=include_endpoint,
            **kwargs
        )
259
+
260
+
261
class Element(DirectionalMixin):
    """
    Base class for all PDF elements.

    This class provides common properties and methods for all PDF elements,
    such as text, rectangles, lines, etc.
    """

    def __init__(self, obj: Dict[str, Any], page: 'Page'):
        """
        Initialize base element.

        Args:
            obj: The underlying pdfplumber object
            page: The parent Page object
        """
        self._obj = obj
        self._page = page

    @property
    def type(self) -> str:
        """Element type (the 'object_type' entry of the underlying object, or 'unknown')."""
        return self._obj.get('object_type', 'unknown')

    @property
    def bbox(self) -> Tuple[float, float, float, float]:
        """Bounding box (x0, top, x1, bottom)."""
        return (self.x0, self.top, self.x1, self.bottom)

    @property
    def x0(self) -> float:
        """Left x-coordinate (smallest polygon x when polygon data is present)."""
        if self.has_polygon:
            return min(pt[0] for pt in self.polygon)
        return self._obj.get('x0', 0)

    @property
    def top(self) -> float:
        """Top y-coordinate (smallest polygon y when polygon data is present)."""
        if self.has_polygon:
            return min(pt[1] for pt in self.polygon)
        return self._obj.get('top', 0)

    @property
    def x1(self) -> float:
        """Right x-coordinate (largest polygon x when polygon data is present)."""
        if self.has_polygon:
            return max(pt[0] for pt in self.polygon)
        return self._obj.get('x1', 0)

    @property
    def bottom(self) -> float:
        """Bottom y-coordinate (largest polygon y when polygon data is present)."""
        if self.has_polygon:
            return max(pt[1] for pt in self.polygon)
        return self._obj.get('bottom', 0)

    @property
    def width(self) -> float:
        """Element width."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Element height."""
        return self.bottom - self.top

    @property
    def has_polygon(self) -> bool:
        """
        Check if this element has polygon coordinates.

        True when the underlying object carries a 'polygon' entry with at least
        three points, or when a ``_polygon`` attribute has been set on the instance.
        """
        return ('polygon' in self._obj and self._obj['polygon'] and len(self._obj['polygon']) >= 3) or hasattr(self, '_polygon')

    @property
    def polygon(self) -> List[Tuple[float, float]]:
        """Get polygon coordinates if available, otherwise return rectangle corners."""
        if hasattr(self, '_polygon') and self._polygon:
            return self._polygon
        elif 'polygon' in self._obj and self._obj['polygon']:
            return self._obj['polygon']
        else:
            # Create rectangle corners as fallback. The raw object values are read
            # directly (not self.x0 etc.), since those properties may call back
            # into this one.
            return [
                (self._obj.get('x0', 0), self._obj.get('top', 0)),     # top-left
                (self._obj.get('x1', 0), self._obj.get('top', 0)),     # top-right
                (self._obj.get('x1', 0), self._obj.get('bottom', 0)),  # bottom-right
                (self._obj.get('x0', 0), self._obj.get('bottom', 0))   # bottom-left
            ]

    def is_point_inside(self, x: float, y: float) -> bool:
        """
        Check if a point is inside this element using ray casting algorithm for polygons.

        Args:
            x: X-coordinate to check
            y: Y-coordinate to check

        Returns:
            True if the point is inside the element
        """
        if not self.has_polygon:
            # Use simple rectangle check (edges inclusive)
            return (self.x0 <= x <= self.x1) and (self.top <= y <= self.bottom)

        # Ray casting algorithm for complex polygons
        poly = self.polygon
        n = len(poly)
        inside = False

        p1x, p1y = poly[0]
        for i in range(1, n + 1):
            p2x, p2y = poly[i % n]  # wraps around to close the polygon
            if y > min(p1y, p2y) and y <= max(p1y, p2y) and x <= max(p1x, p2x):
                # The y-range test above can only pass when p1y != p2y, so
                # xinters is always assigned before it is read below.
                if p1y != p2y:
                    xinters = (y - p1y) * (p2x - p1x) / (p2y - p1y) + p1x
                if p1x == p2x or x <= xinters:
                    inside = not inside
            p1x, p1y = p2x, p2y

        return inside

    @property
    def page(self) -> 'Page':
        """Get the parent page."""
        return self._page

    def next(self, selector: Optional[str] = None, limit: int = 10, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
        """
        Find next element in reading order.

        The order used is whatever ``page.find_all('*')`` returns.

        Args:
            selector: Optional selector to filter by
            limit: Maximum number of following elements to search through (default: 10);
                a falsy value disables the limit. Only used when a selector is given.
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional parameters passed to page.filter_elements

        Returns:
            Next element or None if not found
        """
        # Get all elements in reading order
        all_elements = self.page.find_all('*', apply_exclusions=apply_exclusions)

        # Find our index in the list.
        # Compare by object identity since bbox could match multiple elements.
        idx = next((i for i, elem in enumerate(all_elements) if elem is self), None)
        if idx is None:
            # If not found, it might have been filtered out by exclusions
            return None

        # Search for next matching element
        if selector:
            # Filter elements after this one
            candidates = all_elements[idx+1:]
            # Limit search range for performance
            candidates = candidates[:limit] if limit else candidates

            # Find matching elements
            matches = self.page.filter_elements(candidates, selector, **kwargs)
            return matches[0] if matches else None
        elif idx + 1 < len(all_elements):
            # No selector, just return the next element
            return all_elements[idx + 1]

        return None

    def prev(self, selector: Optional[str] = None, limit: int = 10, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
        """
        Find previous element in reading order.

        The order used is whatever ``page.find_all('*')`` returns.

        Args:
            selector: Optional selector to filter by
            limit: Maximum number of preceding elements to search through (default: 10);
                a falsy value disables the limit. Only used when a selector is given.
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional parameters passed to page.filter_elements

        Returns:
            Previous element or None if not found
        """
        # Get all elements in reading order
        all_elements = self.page.find_all('*', apply_exclusions=apply_exclusions)

        # Find our index in the list.
        # Compare by object identity since bbox could match multiple elements.
        idx = next((i for i, elem in enumerate(all_elements) if elem is self), None)
        if idx is None:
            # If not found, it might have been filtered out by exclusions
            return None

        # Search for previous matching element
        if selector:
            # Filter elements before this one
            candidates = all_elements[:idx]
            # Reverse to start from closest to this element
            candidates = candidates[::-1]
            # Limit search range for performance
            candidates = candidates[:limit] if limit else candidates

            # Find matching elements
            matches = self.page.filter_elements(candidates, selector, **kwargs)
            return matches[0] if matches else None
        elif idx > 0:
            # No selector, just return the previous element
            return all_elements[idx - 1]

        return None

    def nearest(self, selector: str, max_distance: Optional[float] = None, apply_exclusions: bool = True, **kwargs) -> Optional['Element']:
        """
        Find nearest element matching selector.

        Distance is measured between bounding-box centers (Euclidean).

        Args:
            selector: CSS-like selector string
            max_distance: Maximum distance to search (default: None = unlimited)
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional parameters

        Returns:
            Nearest element or None if not found
        """
        # Find matching elements
        matches = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        if not matches:
            return None

        # Center point of this element
        self_center_x = (self.x0 + self.x1) / 2
        self_center_y = (self.top + self.bottom) / 2

        # Calculate distances to each match
        distances = []
        for match in matches:
            if match is self:  # Skip self
                continue

            match_center_x = (match.x0 + match.x1) / 2
            match_center_y = (match.top + match.bottom) / 2

            # Euclidean distance
            distance = ((match_center_x - self_center_x) ** 2 +
                        (match_center_y - self_center_y) ** 2) ** 0.5

            # Filter by max_distance if specified
            if max_distance is None or distance <= max_distance:
                distances.append((match, distance))

        if not distances:
            return None

        # min() returns the first of equally distant candidates, the same element
        # a stable sort would place first, without sorting the whole list.
        return min(distances, key=lambda pair: pair[1])[0]

    def until(self, selector: str, include_endpoint: bool = True, width: str = "element", **kwargs) -> 'Region':
        """
        Select content from this element until matching selector.

        The endpoint is the result of ``page.find(selector)``; it may lie below,
        above, or beside this element.

        Args:
            selector: CSS-like selector string
            include_endpoint: Whether to include the endpoint element in the region (default: True)
            width: Width mode - "element" to use element widths or "full" for full page width
            **kwargs: Additional selection parameters

        Returns:
            Region object representing the selected content. When no endpoint is
            found the region covers just this element's bbox.
        """
        from natural_pdf.elements.region import Region

        # Find the target element
        target = self.page.find(selector, **kwargs)
        if not target:
            # If target not found, return a region with just this element
            return Region(self.page, self.bbox)

        # Use full page width if requested
        if width == "full":
            x0 = 0
            x1 = self.page.width
            # Determine vertical bounds based on element positions
            if target.top >= self.bottom:  # Target is below this element
                top = self.top
                bottom = target.bottom if include_endpoint else target.top - 1  # Subtract 1 pixel when excluding
            else:  # Target is above this element
                top = target.top if include_endpoint else target.bottom + 1  # Add 1 pixel when excluding
                bottom = self.bottom
            return Region(self.page, (x0, top, x1, bottom))

        # Otherwise use element-based width
        # Determine the correct order for creating the region
        # If the target is below this element (normal reading order)
        if target.top >= self.bottom:
            x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
            x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
            top = self.top
            bottom = target.bottom if include_endpoint else target.top - 1  # Subtract 1 pixel when excluding
        # If the target is above this element (reverse reading order)
        elif target.bottom <= self.top:
            x0 = min(self.x0, target.x0 if include_endpoint else target.x1)
            x1 = max(self.x1, target.x1 if include_endpoint else target.x0)
            top = target.top if include_endpoint else target.bottom + 1  # Add 1 pixel when excluding
            bottom = self.bottom
        # If they're side by side, use the horizontal version
        elif target.x0 >= self.x1:  # Target is to the right
            x0 = self.x0
            x1 = target.x1 if include_endpoint else target.x0
            top = min(self.top, target.top if include_endpoint else target.bottom)
            bottom = max(self.bottom, target.bottom if include_endpoint else target.top)
        else:  # Target is to the left
            x0 = target.x0 if include_endpoint else target.x1
            x1 = self.x1
            top = min(self.top, target.top if include_endpoint else target.bottom)
            bottom = max(self.bottom, target.bottom if include_endpoint else target.top)

        region = Region(self.page, (x0, top, x1, bottom))
        region.source_element = self
        region.end_element = target
        return region

    # Note: select_until method removed in favor of until()

    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
        """
        Extract text from this element.

        Args:
            preserve_whitespace: Whether to keep blank characters (default: True)
            use_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Extracted text as string (always empty in this base implementation)
        """
        # Default implementation - override in subclasses
        return ""

    # Note: extract_text_compat method removed

    def highlight(self,
                  label: Optional[str] = None,
                  color: Optional[Union[Tuple, str]] = None,  # Allow string color
                  use_color_cycling: bool = False,
                  include_attrs: Optional[List[str]] = None,
                  existing: str = 'append') -> 'Element':
        """
        Highlight this element on the page.

        Args:
            label: Optional label for the highlight
            color: Color tuple/string for the highlight, or None to use automatic color
            use_color_cycling: Force color cycling even with no label (default: False)
            include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
            existing: How to handle existing highlights - 'append' (default) or 'replace'

        Returns:
            Self for method chaining
        """
        # Access the correct highlighter service
        highlighter = self.page._highlighter

        # Prepare common arguments
        highlight_args = {
            "page_index": self.page.index,
            "color": color,
            "label": label,
            "use_color_cycling": use_color_cycling,
            "element": self,  # Pass the element itself so attributes can be accessed
            "include_attrs": include_attrs,
            "existing": existing
        }

        # Call the appropriate service method based on geometry
        if self.has_polygon:
            highlight_args["polygon"] = self.polygon
            highlighter.add_polygon(**highlight_args)
        else:
            highlight_args["bbox"] = self.bbox
            highlighter.add(**highlight_args)

        return self

    def show(self,
             scale: float = 2.0,
             labels: bool = True,
             legend_position: str = 'right',
             color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
             label: Optional[str] = None) -> Optional['Image.Image']:
        """
        Show the page with only this element highlighted temporarily.

        Args:
            scale: Scale factor for rendering
            labels: Whether to include a legend for the highlight
            legend_position: Position of the legend
            color: Color to highlight this element (default: red)
            label: Optional label for this element in the legend

        Returns:
            PIL Image of the page with only this element highlighted, or None if error.
        """
        # No module-level logger exists in this file, so obtain one here; the
        # diagnostic paths below would otherwise fail with NameError.
        import logging
        logger = logging.getLogger(__name__)

        if not hasattr(self, 'page') or not self.page:
            logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
            return None
        if not hasattr(self.page, '_highlighter') or not self.page._highlighter:
            logger.warning(f"Cannot show element, page lacks highlighter service: {self}")
            return None

        service = self.page._highlighter

        # Determine the label if not provided
        display_label = label if label is not None else f"{self.__class__.__name__}"

        # Prepare temporary highlight data for just this element
        temp_highlight_data = {
            "page_index": self.page.index,
            "bbox": self.bbox if not self.has_polygon else None,
            "polygon": self.polygon if self.has_polygon else None,
            "color": color,  # Use provided or default color
            "label": display_label,
            "use_color_cycling": False  # Explicitly false for single preview
        }

        # Check if we actually got geometry data
        if temp_highlight_data['bbox'] is None and temp_highlight_data['polygon'] is None:
            logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
            return None

        # Use render_preview to show only this highlight. Rendering is best-effort:
        # any failure is logged with its traceback and reported as None.
        try:
            return service.render_preview(
                page_index=self.page.index,
                temporary_highlights=[temp_highlight_data],
                scale=scale,
                labels=labels,
                legend_position=legend_position
            )
        except Exception as e:
            logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
            return None

    def save(self,
             filename: str,
             scale: float = 2.0,
             labels: bool = True,
             legend_position: str = 'right') -> 'Element':
        """
        Save the page image to a file.

        This delegates to ``page.save_image``; it does not add a highlight for
        this element itself.

        Args:
            filename: Path to save the image to
            scale: Scale factor for rendering
            labels: Whether to include a legend for labels
            legend_position: Position of the legend

        Returns:
            Self for method chaining
        """
        # Save the highlighted image
        self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
        return self

    # Note: save_image method removed in favor of save()

    def __repr__(self) -> str:
        """String representation of the element."""
        return f"<{self.__class__.__name__} bbox={self.bbox}>"

    def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional['Element']:
        """
        Find first element within this element's bounds matching the selector.
        Creates a temporary region to perform the search.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to apply exclusion regions
            **kwargs: Additional parameters for element filtering

        Returns:
            First matching element or None
        """
        # Create a temporary region from this element's bounds
        from natural_pdf.elements.region import Region
        temp_region = Region(self.page, self.bbox)
        return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)

    def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> 'ElementCollection':
        """
        Find all elements within this element's bounds matching the selector.
        Creates a temporary region to perform the search.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to apply exclusion regions
            **kwargs: Additional parameters for element filtering

        Returns:
            ElementCollection with matching elements
        """
        # Create a temporary region from this element's bounds
        from natural_pdf.elements.region import Region
        temp_region = Region(self.page, self.bbox)
        return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)