natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1444 @@
1
+ import pdfplumber
2
+ import os
3
+ import logging
4
+ import tempfile
5
+ from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
6
+ from PIL import Image
7
+ import base64
8
+ import io
9
+ import json
10
+
11
+ from natural_pdf.elements.collections import ElementCollection
12
+
13
+ if TYPE_CHECKING:
14
+ import pdfplumber
15
+ from natural_pdf.core.pdf import PDF
16
+ from natural_pdf.elements.collections import ElementCollection
17
+ from natural_pdf.core.highlighting_service import HighlightingService
18
+ from natural_pdf.elements.base import Element
19
+
20
+ from natural_pdf.elements.region import Region
21
+ from natural_pdf.elements.text import TextElement
22
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager
23
+ from natural_pdf.analyzers.layout.layout_options import LayoutOptions
24
+ from natural_pdf.ocr import OCROptions
25
+ from natural_pdf.ocr import OCRManager
26
+ from natural_pdf.core.element_manager import ElementManager
27
+ from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
28
+ from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
29
+ from natural_pdf.analyzers.text_options import TextStyleOptions
30
+ from natural_pdf.widgets import InteractiveViewerWidget
31
+ from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+ class Page:
36
+ """
37
+ Enhanced Page wrapper built on top of pdfplumber.Page.
38
+
39
+ This class provides a fluent interface for working with PDF pages,
40
+ with improved selection, navigation, extraction, and question-answering capabilities.
41
+ """
42
+
43
def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
    """
    Wrap a pdfplumber page and wire it to its parent PDF's shared services.

    Args:
        page: pdfplumber page object being wrapped
        parent: Parent PDF object that owns this page
        index: Index of this page in the PDF (0-based)
        font_attrs: Font attributes forwarded to the ElementManager
            (used when grouping characters into words).
    """
    # Core references; these must exist before anything reads self.number.
    self._page, self._parent, self._index = page, parent, index
    self._text_styles = None  # Filled lazily with text style analysis results
    self._exclusions = []  # Exclusion regions and/or callables producing them

    # Registered regions: unnamed ones in a list, named ones keyed by name.
    self._regions = {
        'detected': [],
        'named': {},
    }

    # Element access is delegated to an ElementManager bound to this page.
    self._element_mgr = ElementManager(self, font_attrs)

    # --- OCR manager: reuse the parent's instance when it has a valid one ---
    ocr_available = (
        OCRManager
        and hasattr(parent, '_ocr_manager')
        and isinstance(parent._ocr_manager, OCRManager)
    )
    self._ocr_manager = parent._ocr_manager if ocr_available else None
    if ocr_available:
        logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
    elif OCRManager:
        logger.warning(f"Page {self.number}: OCRManager instance not found on parent PDF object.")

    # --- Layout manager: same pattern as the OCR manager ---
    layout_available = (
        LayoutManager
        and hasattr(parent, '_layout_manager')
        and isinstance(parent._layout_manager, LayoutManager)
    )
    self._layout_manager = parent._layout_manager if layout_available else None
    if layout_available:
        logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
    elif LayoutManager:
        logger.warning(f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail.")

    # Layout analyzer starts unset; nothing in this method creates it.
    self._layout_analyzer = None
88
+
89
+ @property
90
+ def pdf(self) -> 'PDF':
91
+ """Provides public access to the parent PDF object."""
92
+ return self._parent
93
+
94
+ @property
95
+ def number(self) -> int:
96
+ """Get page number (1-based)."""
97
+ return self._page.page_number
98
+
99
+ @property
100
+ def index(self) -> int:
101
+ """Get page index (0-based)."""
102
+ return self._index
103
+
104
+ @property
105
+ def width(self) -> float:
106
+ """Get page width."""
107
+ return self._page.width
108
+
109
+ @property
110
+ def height(self) -> float:
111
+ """Get page height."""
112
+ return self._page.height
113
+
114
+ # --- Highlighting Service Accessor ---
115
+ @property
116
+ def _highlighter(self) -> 'HighlightingService':
117
+ """Provides access to the parent PDF's HighlightingService."""
118
+ if not hasattr(self._parent, 'highlighter'):
119
+ # This should ideally not happen if PDF.__init__ works correctly
120
+ raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
121
+ return self._parent.highlighter
122
+
123
+ def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
124
+ """
125
+ Add an exclusion to the page. Text from these regions will be excluded from extraction.
126
+
127
+ Args:
128
+ exclusion_func_or_region: Either a Region object or a function that takes a Page
129
+ and returns a Region to exclude
130
+
131
+ Returns:
132
+ Self for method chaining
133
+ """
134
+ self._exclusions.append(exclusion_func_or_region)
135
+ return self
136
+
137
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
    """
    Register a region on this page.

    The region's ``source`` is always set to 'named'. A region given a
    truthy name is stored in the named-region mapping (replacing any
    existing entry with that name); otherwise it is appended to the
    'detected' list. In both cases it is also handed to the element manager.

    Args:
        region: Region object to add
        name: Optional name for the region

    Returns:
        Self for method chaining

    Raises:
        TypeError: If ``region`` is not a Region instance.
    """
    if not isinstance(region, Region):
        raise TypeError("region must be a Region object")

    region.source = 'named'

    if not name:
        # Unnamed regions are still registered, just without a lookup key.
        self._regions['detected'].append(region)
    else:
        region.name = name
        self._regions['named'][name] = region

    # Make the region reachable through selector queries.
    self._element_mgr.add_region(region)
    return self
167
+
168
+ def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
169
+ """
170
+ Add multiple regions to the page.
171
+
172
+ Args:
173
+ regions: List of Region objects to add
174
+ prefix: Optional prefix for automatic naming (regions will be named prefix_1, prefix_2, etc.)
175
+
176
+ Returns:
177
+ Self for method chaining
178
+ """
179
+ if prefix:
180
+ # Add with automatic sequential naming
181
+ for i, region in enumerate(regions):
182
+ self.add_region(region, name=f"{prefix}_{i+1}")
183
+ else:
184
+ # Add without names
185
+ for region in regions:
186
+ self.add_region(region)
187
+
188
+ return self
189
+
190
+ def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
191
+ """
192
+ Get all exclusion regions for this page.
193
+
194
+ Args:
195
+ include_callable: Whether to evaluate callable exclusion functions
196
+ debug: Enable verbose debug logging for exclusion evaluation
197
+
198
+ Returns:
199
+ List of Region objects to exclude
200
+ """
201
+ regions = []
202
+
203
+ # Track exclusion results for debugging
204
+ if debug:
205
+ print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
206
+
207
+ for i, exclusion in enumerate(self._exclusions):
208
+ # Get exclusion label if it's a tuple from PDF level
209
+ exclusion_label = f"exclusion {i}"
210
+ original_exclusion = exclusion
211
+
212
+ # Check if it's a tuple from PDF.add_exclusion
213
+ if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
214
+ # This is likely from PDF.add_exclusion with (func, label)
215
+ exclusion_func, label = exclusion
216
+ if label:
217
+ exclusion_label = label
218
+ exclusion = exclusion_func
219
+
220
+ # Process callable exclusion functions
221
+ if callable(exclusion) and include_callable:
222
+ # It's a function, call it with this page
223
+ try:
224
+ if debug:
225
+ print(f" - Evaluating callable {exclusion_label}...")
226
+
227
+ # Create a temporary copy of exclusions to avoid recursion
228
+ original_exclusions = self._exclusions
229
+ self._exclusions = [] # Temporarily clear exclusions
230
+
231
+ # Call the function
232
+ region = exclusion(self)
233
+
234
+ # Restore exclusions
235
+ self._exclusions = original_exclusions
236
+
237
+ if region:
238
+ regions.append(region)
239
+ if debug:
240
+ print(f" ✓ Added region: {region}")
241
+ else:
242
+ if debug:
243
+ print(f" ✗ Function returned None, no region added")
244
+
245
+ except Exception as e:
246
+ error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
247
+ print(error_msg)
248
+ # Print more detailed traceback for debugging
249
+ import traceback
250
+ print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
251
+
252
+ # Process direct Region objects
253
+ elif not callable(exclusion):
254
+ # It's already a Region object
255
+ regions.append(exclusion)
256
+ if debug:
257
+ print(f" - Added direct region: {exclusion}")
258
+
259
+ if debug:
260
+ print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
261
+
262
+ return regions
263
+
264
+ def _filter_elements_by_exclusions(self, elements: List['Element'], debug_exclusions: bool = False) -> List['Element']:
265
+ """
266
+ Filters a list of elements, removing those within the page's exclusion regions.
267
+
268
+ Args:
269
+ elements: The list of elements to filter.
270
+ debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
271
+
272
+ Returns:
273
+ A new list containing only the elements not falling within any exclusion region.
274
+ """
275
+ if not self._exclusions:
276
+ if debug_exclusions:
277
+ print(f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements.")
278
+ return elements
279
+
280
+ # Get all exclusion regions, including evaluating callable functions
281
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)
282
+
283
+ if not exclusion_regions:
284
+ if debug_exclusions:
285
+ print(f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements.")
286
+ return elements
287
+
288
+ if debug_exclusions:
289
+ print(f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements.")
290
+
291
+ filtered_elements = []
292
+ excluded_count = 0
293
+ for element in elements:
294
+ exclude = False
295
+ for region in exclusion_regions:
296
+ # Use the region's method to check if the element is inside
297
+ if region._is_element_in_region(element):
298
+ exclude = True
299
+ excluded_count += 1
300
+ break # No need to check other regions for this element
301
+ if not exclude:
302
+ filtered_elements.append(element)
303
+
304
+ if debug_exclusions:
305
+ print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}.")
306
+
307
+ return filtered_elements
308
+
309
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element on this page that matches the selector.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to skip elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        Element object or None if not found
    """
    from natural_pdf.selectors.parser import parse_selector
    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector machinery through kwargs.
    kwargs.update(regex=regex, case=case)

    # _apply_selector returns matches without any exclusion filtering.
    matches = self._apply_selector(parsed, **kwargs)
    if not matches:
        return None

    if apply_exclusions and self._exclusions:
        survivors = self._filter_elements_by_exclusions(matches.elements)
        return survivors[0] if survivors else None

    return matches.first
343
+
344
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
    """
    Return every element on this page that matches the selector.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to skip elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        ElementCollection with matching elements
    """
    from natural_pdf.selectors.parser import parse_selector
    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector machinery through kwargs.
    kwargs.update(regex=regex, case=case)

    # _apply_selector returns matches without any exclusion filtering.
    matches = self._apply_selector(parsed, **kwargs)

    needs_filtering = apply_exclusions and self._exclusions and matches
    if not needs_filtering:
        return matches

    kept = self._filter_elements_by_exclusions(matches.elements)
    return ElementCollection(kept)
375
+
376
def _apply_selector(self, selector_obj: Dict, **kwargs) -> 'ElementCollection':
    """
    Evaluate a parsed selector against this page's elements.

    Exclusions are not applied here; callers (find, find_all) handle them.

    Args:
        selector_obj: Parsed selector dictionary
        **kwargs: Additional filter parameters including 'regex' and 'case';
            'near_threshold' (default 50) and 'reading_order' (default True)
            are also read here.

    Returns:
        ElementCollection of matching elements (unfiltered by exclusions)
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    manager = self._element_mgr

    # Map the selector's element type to the manager attribute to search.
    # Unlisted types (including 'any') fall back to every element.
    source_attr = {
        'text': 'words',
        'word': 'words',
        'char': 'chars',
        'rect': 'rects',
        'rectangle': 'rects',
        'line': 'lines',
        'region': 'regions',
    }.get(selector_obj.get('type', 'any').lower())
    if source_attr is None:
        candidates = manager.get_all_elements()
    else:
        candidates = getattr(manager, source_attr)

    # Attribute/pseudo-class matching is delegated to the selector module.
    accepts = selector_to_filter_func(selector_obj, **kwargs)
    matching_elements = [candidate for candidate in candidates if accepts(candidate)]

    def center_distance(first, second):
        """Distance between bbox centres, or inf when either bbox is incomplete."""
        needed = ('x0', 'x1', 'top', 'bottom')
        if not all(hasattr(obj, attr) for obj in (first, second) for attr in needed):
            return float('inf')  # Cannot calculate distance
        first_cx = (first.x0 + first.x1) / 2
        first_cy = (first.top + first.bottom) / 2
        second_cx = (second.x0 + second.x1) / 2
        second_cy = (second.top + second.bottom) / 2
        return ((first_cx - second_cx) ** 2 + (first_cy - second_cy) ** 2) ** 0.5

    # Spatial pseudo-classes compare each match against a reference element.
    for pseudo in selector_obj.get('pseudo_classes', []):
        relation = pseudo.get('name')
        if relation not in ('above', 'below', 'near', 'left-of', 'right-of'):
            continue

        target = pseudo.get('args', '')
        ref_selector = parse_selector(target) if isinstance(target, str) else target
        # The reference is resolved recursively, also without exclusions.
        anchors = self._apply_selector(ref_selector, **kwargs)
        if not anchors:
            return ElementCollection([])

        ref_element = anchors.first
        if not ref_element:
            continue

        if relation == 'near':
            threshold = kwargs.get('near_threshold', 50)

            def keep(el):
                return center_distance(el, ref_element) <= threshold
        elif relation == 'above':
            def keep(el):
                return hasattr(el, 'bottom') and hasattr(ref_element, 'top') and el.bottom <= ref_element.top
        elif relation == 'below':
            def keep(el):
                return hasattr(el, 'top') and hasattr(ref_element, 'bottom') and el.top >= ref_element.bottom
        elif relation == 'left-of':
            def keep(el):
                return hasattr(el, 'x1') and hasattr(ref_element, 'x0') and el.x1 <= ref_element.x0
        else:  # 'right-of'
            def keep(el):
                return hasattr(el, 'x0') and hasattr(ref_element, 'x1') and el.x0 >= ref_element.x1

        # keep() is used immediately, so it sees this iteration's reference.
        matching_elements = [el for el in matching_elements if keep(el)]

    # Reading order here means top-to-bottom, then left-to-right.
    if kwargs.get('reading_order', True):
        sortable = all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matching_elements)
        if sortable:
            matching_elements.sort(key=lambda el: (el.top, el.x0))
        else:
            logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")

    return ElementCollection(matching_elements)
470
+
471
def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
    """
    Build a Region on this page from explicit coordinates.

    Args:
        x0: Left x-coordinate
        top: Top y-coordinate
        x1: Right x-coordinate
        bottom: Bottom y-coordinate

    Returns:
        Region object for the specified coordinates
    """
    from natural_pdf.elements.region import Region
    bounds = (x0, top, x1, bottom)
    return Region(self, bounds)
486
+
487
+ def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
488
+ width: str = "full") -> Any:
489
+ """
490
+ Create a region on this page with more intuitive named parameters.
491
+
492
+ Args:
493
+ left: Left x-coordinate (default: 0)
494
+ top: Top y-coordinate (default: 0)
495
+ right: Right x-coordinate (default: page width)
496
+ bottom: Bottom y-coordinate (default: page height)
497
+ width: Width mode - "full" for full page width or "element" for element width
498
+
499
+ Returns:
500
+ Region object for the specified coordinates
501
+
502
+ Examples:
503
+ >>> page.region(top=100, bottom=200) # Full width from y=100 to y=200
504
+ >>> page.region(left=50, right=150, top=100, bottom=200) # Specific rectangle
505
+ """
506
+ # Handle defaults
507
+ left = 0 if left is None else left
508
+ top = 0 if top is None else top
509
+ right = self.width if right is None else right
510
+ bottom = self.height if bottom is None else bottom
511
+
512
+ # Handle width parameter
513
+ if width == "full":
514
+ left = 0
515
+ right = self.width
516
+ elif width != "element":
517
+ raise ValueError("Width must be 'full' or 'element'")
518
+
519
+ from natural_pdf.elements.region import Region
520
+ region = Region(self, (left, top, right, bottom))
521
+ return region
522
+
523
+ def get_elements(self, apply_exclusions=True, debug_exclusions: bool = False) -> List['Element']:
524
+ """
525
+ Get all elements on this page.
526
+
527
+ Args:
528
+ apply_exclusions: Whether to apply exclusion regions (default: True).
529
+ debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
530
+
531
+ Returns:
532
+ List of all elements on the page, potentially filtered by exclusions.
533
+ """
534
+ # Get all elements from the element manager
535
+ all_elements = self._element_mgr.get_all_elements()
536
+
537
+ # Apply exclusions if requested
538
+ if apply_exclusions and self._exclusions:
539
+ return self._filter_elements_by_exclusions(all_elements, debug_exclusions=debug_exclusions)
540
+ else:
541
+ if debug_exclusions:
542
+ print(f"Page {self.index}: get_elements returning all {len(all_elements)} elements (exclusions not applied).")
543
+ return all_elements
544
+
545
def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
    """
    Keep only the elements of a list that satisfy a selector.

    Args:
        elements: List of elements to filter
        selector: CSS-like selector string
        **kwargs: Additional filter parameters; 'reading_order' (default
            True) controls whether the result is sorted by (top, x0).

    Returns:
        List of elements that match the selector
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    # Turn the selector text into a predicate over single elements.
    accepts = selector_to_filter_func(parse_selector(selector), **kwargs)
    selected = [candidate for candidate in elements if accepts(candidate)]

    if not kwargs.get('reading_order', True):
        return selected

    # Sorting needs both coordinates on every element; otherwise just warn.
    sortable = all(hasattr(el, 'top') and hasattr(el, 'x0') for el in selected)
    if sortable:
        selected.sort(key=lambda el: (el.top, el.x0))
    else:
        logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")

    return selected
576
+
577
def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
    """
    Select content from the top of the page down to the first selector match.

    Args:
        selector: CSS-like selector string
        include_endpoint: Whether the region extends to the bottom of the
            matched element (True) or stops at its top (False)
        **kwargs: Additional selection parameters passed to ``find``

    Returns:
        Region object representing the selected content. When nothing
        matches, a region covering the full page is returned (and no
        ``end_element`` is set on it).

    Examples:
        >>> page.until('text:contains("Conclusion")') # Select from top to conclusion
        >>> page.until('line[width>=2]', include_endpoint=False) # Select up to thick line
    """
    target = self.find(selector, **kwargs)

    from natural_pdf.elements.region import Region

    if not target:
        # No endpoint found: fall back to the whole page.
        return Region(self, (0, 0, self.width, self.height))

    # Targets lacking positional attributes fall back to the page extremes.
    target_top = getattr(target, 'top', 0)
    target_bottom = getattr(target, 'bottom', self.height)

    lower_edge = target_bottom if include_endpoint else target_top
    region = Region(self, (0, 0, self.width, lower_edge))
    region.end_element = target
    return region
615
+
616
+
617
+ def crop(self, bbox=None, **kwargs) -> Any:
618
+ """
619
+ Crop the page to the specified bounding box.
620
+
621
+ This is a direct wrapper around pdfplumber's crop method.
622
+
623
+ Args:
624
+ bbox: Bounding box (x0, top, x1, bottom) or None
625
+ **kwargs: Additional parameters (top, bottom, left, right)
626
+
627
+ Returns:
628
+ Cropped page object (pdfplumber.Page)
629
+ """
630
+ # Returns the pdfplumber page object, not a natural-pdf Page
631
+ return self._page.crop(bbox, **kwargs)
632
+
633
+ def extract_text(self,
634
+ preserve_whitespace=True,
635
+ use_exclusions=True,
636
+ debug_exclusions=False, **kwargs) -> str:
637
+ """
638
+ Extract text from this page, respecting any exclusion regions.
639
+
640
+ Args:
641
+ preserve_whitespace: Whether to keep blank characters (default: True)
642
+ use_exclusions: Whether to apply exclusion regions (default: True)
643
+ debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
644
+ **kwargs: Additional extraction parameters passed to pdfplumber
645
+
646
+ Returns:
647
+ Extracted text as string
648
+ """
649
+ if not use_exclusions or not self._exclusions:
650
+ # If no exclusions or exclusions disabled, use regular extraction
651
+ if debug_exclusions:
652
+ print(f"Page {self.index}: Extracting text via pdfplumber (exclusions not applied).")
653
+ # Note: pdfplumber still uses keep_blank_chars parameter
654
+ return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)
655
+
656
+ # --- Exclusion Logic ---
657
+ # 1. Get all potentially relevant text elements (words)
658
+ all_text_elements = self.words # Use the words property
659
+ if debug_exclusions:
660
+ print(f"Page {self.index}: Starting text extraction with {len(all_text_elements)} words before exclusion.")
661
+
662
+ # 2. Filter elements using the centralized method
663
+ filtered_elements = self._filter_elements_by_exclusions(all_text_elements, debug_exclusions=debug_exclusions)
664
+
665
+ # 3. Extract text from the filtered elements
666
+ collection = ElementCollection(filtered_elements)
667
+ # Ensure elements are sorted for logical text flow (might be redundant if self.words is sorted)
668
+ if all(hasattr(el, 'top') and hasattr(el, 'x0') for el in collection.elements):
669
+ collection.sort(key=lambda el: (el.top, el.x0))
670
+
671
+ # Join text, handling potential missing text attributes gracefully
672
+ result = " ".join(getattr(el, 'text', '') for el in collection.elements)
673
+
674
+ if debug_exclusions:
675
+ print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied.")
676
+
677
+ return result
678
+
679
+ def extract_table(self, table_settings={}) -> List[Any]:
680
+ """
681
+ Extract the largest table from this page.
682
+
683
+ Args:
684
+ table_settings: Additional extraction parameters
685
+
686
+ Returns:
687
+ List of extracted tables (or None if no table found)
688
+ """
689
+ # pdfplumber returns None if no table found
690
+ return self._page.extract_table(table_settings)
691
+
692
+ def extract_tables(self, table_settings={}) -> List[Any]:
693
+ """
694
+ Extract tables from this page.
695
+
696
+ Args:
697
+ table_settings: Additional extraction parameters
698
+
699
+ Returns:
700
+ List of extracted tables
701
+ """
702
+ # pdfplumber returns list of tables
703
+ return self._page.extract_tables(table_settings)
704
+
705
+ def _load_elements(self):
706
+ """Load all elements from the page via ElementManager."""
707
+ self._element_mgr.load_elements()
708
+
709
def _create_char_elements(self):
    """DEPRECATED shim: logs a warning and returns self._element_mgr.chars."""
    logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
    char_elements = self._element_mgr.chars
    return char_elements
713
+
714
def _process_font_information(self, char_dict):
    """DEPRECATED no-op: logs a warning; the ElementManager handles this now."""
    logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
    # Nothing to do here; the argument is intentionally ignored.
    return None
719
+
720
def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
    """DEPRECATED shim: logs a warning and returns self._element_mgr.words.

    The ``keep_spaces`` and ``font_attrs`` arguments are ignored.
    """
    logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
    word_elements = self._element_mgr.words
    return word_elements
724
+
725
def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
    """DEPRECATED no-op: logs a warning; the ElementManager handles this now."""
    logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
    # All arguments are intentionally ignored.
    return None
729
+
730
def _check_font_attributes_match(self, char, prev_char, font_attrs):
    """DEPRECATED no-op: logs a warning; the ElementManager handles this now."""
    logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
    # All arguments are intentionally ignored.
    return None
734
+
735
def _create_word_element(self, chars, font_attrs):
    """DEPRECATED no-op: logs a warning; the ElementManager handles this now."""
    logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
    # All arguments are intentionally ignored.
    return None
739
+
740
@property
def chars(self) -> List[Any]:
    """Character elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.chars
744
+
745
@property
def words(self) -> List[Any]:
    """Word elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.words
749
+
750
@property
def rects(self) -> List[Any]:
    """Rectangle elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.rects
754
+
755
@property
def lines(self) -> List[Any]:
    """Line elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.lines
759
+
760
def highlight(self,
              bbox: Optional[Tuple[float, float, float, float]] = None,
              color: Optional[Union[Tuple, str]] = None,
              label: Optional[str] = None,
              use_color_cycling: bool = False,
              element: Optional[Any] = None,
              include_attrs: Optional[List[str]] = None,
              existing: str = 'append') -> 'Page':
    """
    Register a rectangular highlight on this page with the highlighting service.

    Args:
        bbox: (x0, top, x1, bottom) to highlight. When None, the rectangle
            (0, 0, page width, page height) is used instead.
        color: Colour for the highlight, forwarded unchanged.
        label: Optional label for the highlight, forwarded unchanged.
        use_color_cycling: Forwarded unchanged to the highlighting service.
        element: Optional source element, forwarded unchanged.
        include_attrs: Attribute names of ``element`` to display, forwarded unchanged.
        existing: Policy for already-present highlights ('append' or 'replace'),
            forwarded unchanged.

    Returns:
        This page, so calls can be chained.
    """
    if bbox is None:
        # No explicit box: cover the whole page.
        area = (0, 0, self.width, self.height)
    else:
        area = bbox

    highlighter = self._highlighter
    highlighter.add(
        page_index=self.index,
        bbox=area,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=element,
        include_attrs=include_attrs,
        existing=existing,
    )
    return self
796
+
797
def highlight_polygon(
        self,
        polygon: List[Tuple[float, float]],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append') -> 'Page':
    """
    Register a polygon highlight on this page with the highlighting service.

    Args:
        polygon: Sequence of (x, y) vertices, forwarded unchanged.
        color: Colour for the highlight, forwarded unchanged.
        label: Optional label for the highlight, forwarded unchanged.
        use_color_cycling: Forwarded unchanged to the highlighting service.
        element: Optional source element, forwarded unchanged.
        include_attrs: Attribute names of ``element`` to display, forwarded unchanged.
        existing: Policy for already-present highlights ('append' or 'replace'),
            forwarded unchanged.

    Returns:
        This page, so calls can be chained.
    """
    highlighter = self._highlighter
    highlighter.add_polygon(
        page_index=self.index,
        polygon=polygon,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=element,
        include_attrs=include_attrs,
        existing=existing,
    )
    return self
833
+
834
def show(self,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> Optional[Image.Image]:
    """
    Render this page with its highlights by calling ``to_image``.

    Args:
        scale: Scale factor passed to ``to_image``.
        width: Optional output width passed to ``to_image``.
        labels: Whether a legend for labels is requested.
        legend_position: Where the legend should be placed.
        render_ocr: Whether OCR text should be rendered.

    Returns:
        Whatever ``to_image`` returns: the rendered image, or None on failure.
    """
    # Highlights are always requested here; everything else is passed through.
    rendered = self.to_image(
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position,
        render_ocr=render_ocr,
        include_highlights=True,
    )
    return rendered
861
+
862
def save_image(self,
               filename: str,
               scale: float = 2.0,
               width: Optional[int] = None,
               labels: bool = True,
               legend_position: str = 'right',
               render_ocr: bool = False,
               include_highlights: bool = True,  # Allow saving without highlights
               resolution: Optional[float] = None,
               **kwargs) -> 'Page':
    """
    Render this page through ``to_image`` and have it written to ``filename``.

    The image returned by ``to_image`` is discarded; the file is written by
    ``to_image`` itself because ``filename`` is passed as its ``path``.

    Args:
        filename: Destination path for the image.
        scale: Scale factor passed to ``to_image``.
        width: Optional output width.
        labels: Whether a legend is requested.
        legend_position: Where the legend should be placed.
        render_ocr: Whether OCR text should be rendered.
        include_highlights: Whether highlights should be rendered.
        resolution: Resolution for the base image, passed through.
        **kwargs: Extra keyword arguments passed through to ``to_image``.

    Returns:
        This page, so calls can be chained.
    """
    render = self.to_image
    render(
        path=filename,
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position,
        render_ocr=render_ocr,
        include_highlights=include_highlights,
        resolution=resolution,
        **kwargs,
    )
    return self
902
+
903
def clear_highlights(self) -> 'Page':
    """
    Remove the highlights registered for this page only.

    Calls the highlighting service's ``clear_page`` with this page's index.

    Returns:
        This page, so calls can be chained.
    """
    page_idx = self.index
    self._highlighter.clear_page(page_idx)
    return self
912
+
913
def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
    """
    Run a fresh TextStyleAnalyzer over this page and return its result.

    According to the analyzer's contract (not visible here), processed text
    elements gain ``style_label``, ``style_key`` and ``style_properties``
    attributes.

    Args:
        options: Optional TextStyleOptions passed to the analyzer; None is
            passed through as-is so the analyzer can apply its own defaults.

    Returns:
        The collection returned by ``TextStyleAnalyzer.analyze``.
    """
    # A new analyzer is built on every call; no options are given to its constructor.
    return TextStyleAnalyzer().analyze(self, options=options)
940
+
941
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False,
             resolution: Optional[float] = None,
             include_highlights: bool = True,
             **kwargs) -> Optional[Image.Image]:
    """
    Generate a PIL image of the page, using HighlightingService if needed.

    Args:
        path: Optional path to save the image to. A bare filename (no
            directory component) is saved relative to the working directory.
        scale: Scale factor for rendering highlights.
        width: Optional width for the output image; the height is derived
            from the rendered image's aspect ratio.
        labels: Whether to include a legend for highlights.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text on highlights.
        resolution: Resolution in DPI for the base page image. Without
            highlights it defaults to ``scale * 72``.
        include_highlights: Whether to render highlights.
        **kwargs: Additional parameters forwarded to the renderer.

    Returns:
        PIL Image of the page, or None if rendering fails. Resize and save
        failures are logged and do not change the return value.
    """
    image = None
    try:
        if include_highlights:
            # Delegate rendering to the central service
            image = self._highlighter.render_page(
                page_index=self.index,
                scale=scale,
                labels=labels,
                legend_position=legend_position,
                render_ocr=render_ocr,
                resolution=resolution,
                **kwargs
            )
        else:
            # No highlights needed: render the base page straight from pdfplumber
            render_resolution = resolution if resolution is not None else scale * 72
            img_object = self._page.to_image(resolution=render_resolution, **kwargs)
            # Prefer the PIL image on `.annotated`; otherwise fall back to PNG bytes
            image = img_object.annotated if hasattr(img_object, 'annotated') else img_object._repr_png_()
            if isinstance(image, bytes):
                from io import BytesIO
                image = Image.open(BytesIO(image)).convert('RGB')  # Convert to RGB for consistency
    except Exception as e:
        logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
        return None  # Return None on error

    if image is None:
        return None

    # Resize the final image if width is provided
    if width is not None and width > 0 and image.width > 0:
        aspect_ratio = image.height / image.width
        height = int(width * aspect_ratio)
        try:
            image = image.resize((width, height), Image.Resampling.LANCZOS)
        except Exception as resize_error:
            logger.warning(f"Could not resize image: {resize_error}")

    # Save the image if path is provided
    if path:
        try:
            # os.makedirs("") raises, so only create a directory when the
            # path actually names one; otherwise a bare filename such as
            # "page.png" would never be saved.
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            image.save(path)
            logger.debug(f"Saved page image to: {path}")
        except Exception as save_error:
            logger.error(f"Failed to save image to {path}: {save_error}")

    return image
1018
+
1019
def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
    """DEPRECATED shim: log a deprecation warning, then delegate to the ElementManager method of the same purpose."""
    notice = "_create_text_elements_from_ocr is deprecated. Use self._element_mgr version."
    logger.warning(notice)
    manager = self._element_mgr
    return manager.create_text_elements_from_ocr(ocr_results, image_width, image_height)
1023
+
1024
def apply_ocr(
    self,
    engine: Optional[str] = None,
    options: Optional[OCROptions] = None,
    languages: Optional[List[str]] = None,
    min_confidence: Optional[float] = None,
    device: Optional[str] = None,
) -> List[TextElement]:
    """
    Run OCR on this single page by delegating to the parent's ``apply_ocr_to_pages``.

    All arguments are forwarded unchanged; the page list passed to the parent
    contains only this page's index.

    Returns:
        The page's word elements whose ``source`` attribute equals 'ocr'.
        An empty list is returned when the parent lacks ``apply_ocr_to_pages``
        or when the delegated call raises (the error is logged).
    """
    if not hasattr(self._parent, 'apply_ocr_to_pages'):
        logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
        return []

    logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
    try:
        self._parent.apply_ocr_to_pages(
            pages=[self.index],
            engine=engine,
            options=options,
            languages=languages,
            min_confidence=min_confidence,
            device=device,
        )
    except Exception as e:
        logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
        return []

    # Collect the OCR-sourced words now present on the page.
    from_ocr = []
    for word in self.words:
        if getattr(word, 'source', None) == 'ocr':
            from_ocr.append(word)
    logger.debug(f"Page {self.number}: apply_ocr completed. Found {len(from_ocr)} OCR elements.")
    return from_ocr
1059
+
1060
def extract_ocr_elements(
    self,
    engine: Optional[str] = None,
    options: Optional[OCROptions] = None,
    languages: Optional[List[str]] = None,
    min_confidence: Optional[float] = None,
    device: Optional[str] = None,
) -> List[TextElement]:
    """
    Run OCR on a rendering of this page and return TextElements for the results.

    The elements are built locally and returned; this method does not hand
    them to the ElementManager.

    Args:
        engine: OCR engine name, forwarded to the OCR manager.
        options: OCR options object, forwarded to the OCR manager.
        languages: Forwarded only when not None.
        min_confidence: Forwarded only when not None.
        device: Forwarded only when not None.

    Returns:
        List of TextElements in page coordinates. An empty list is returned
        if there is no OCR manager, rendering fails, or OCR raises.
    """
    if not self._ocr_manager:
        logger.error(f"Page {self.number}: OCRManager not available. Cannot extract OCR elements.")
        return []

    logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
    try:
        parent_config = getattr(self._parent, '_config', {})
        render_scale = parent_config.get('ocr_image_scale', 2.0)
        # Base image only: highlights must not end up in the OCR input.
        image = self.to_image(scale=render_scale, include_highlights=False)
        if not image:
            logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
            return []
        logger.debug(f" Rendered image size: {image.width}x{image.height}")
    except Exception as e:
        logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
        return []

    call_args = {'images': image, 'options': options, 'engine': engine}
    optional_args = (
        ('languages', languages),
        ('min_confidence', min_confidence),
        ('device', device),
    )
    for arg_name, arg_value in optional_args:
        if arg_value is not None:
            call_args[arg_name] = arg_value

    loggable_args = {k: v for k, v in call_args.items() if k != 'images'}
    logger.debug(f" Calling OCR Manager (extract only) with args: {loggable_args}")
    try:
        raw = self._ocr_manager.apply_ocr(**call_args)
        # A non-empty list whose first item is a list is treated as batch
        # output; only the first page's results are used.
        if isinstance(raw, list) and raw and isinstance(raw[0], list):
            results = raw[0]
        else:
            results = raw

        if not isinstance(results, list):
            logger.error(f" OCR Manager returned unexpected type: {type(results)}")
            results = []
        logger.info(f" OCR Manager returned {len(results)} results for extraction.")
    except Exception as e:
        logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
        return []

    logger.debug(f" Converting OCR results to TextElements (extract only)...")
    # Factors converting image pixel coordinates to page coordinates.
    scale_x = self.width / image.width if image.width else 1
    scale_y = self.height / image.height if image.height else 1

    extracted = []
    for item in results:
        left, upper, right, lower = map(float, item['bbox'])
        extracted.append(TextElement({
            'text': item['text'], 'confidence': item['confidence'],
            'x0': left * scale_x, 'top': upper * scale_y,
            'x1': right * scale_x, 'bottom': lower * scale_y,
            'width': (right - left) * scale_x, 'height': (lower - upper) * scale_y,
            'object_type': 'text', 'source': 'ocr',
            'fontname': 'OCR-temp', 'size': 10.0, 'page_number': self.number
        }, self))

    logger.info(f" Created {len(extracted)} TextElements from OCR (extract only).")
    return extracted
1129
+
1130
@property
def layout_analyzer(self) -> LayoutAnalyzer:
    """
    Lazily created LayoutAnalyzer for this page.

    Returns the cached analyzer when one exists. Otherwise a new one is built
    and cached, unless no layout manager is set, in which case a warning is
    logged and None is returned.
    """
    if self._layout_analyzer is not None:
        return self._layout_analyzer

    if not self._layout_manager:
        logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
        return None

    self._layout_analyzer = LayoutAnalyzer(self)
    return self._layout_analyzer
1139
+
1140
def analyze_layout(
    self,
    engine: Optional[str] = None,
    options: Optional[LayoutOptions] = None,
    confidence: Optional[float] = None,
    classes: Optional[List[str]] = None,
    exclude_classes: Optional[List[str]] = None,
    device: Optional[str] = None,
    existing: str = "replace"
) -> ElementCollection[Region]:
    """
    Run layout analysis through this page's LayoutAnalyzer and collect the detected regions.

    All arguments are forwarded unchanged to the analyzer's ``analyze_layout``.
    The analyzer's return value is ignored; results are read back from the
    element manager's ``regions`` afterwards.

    Returns:
        ElementCollection of regions whose ``source`` is 'detected' and, when
        ``engine`` is truthy, whose ``model`` attribute equals ``engine``.
        An empty collection is returned if no analyzer is available.
    """
    analyzer = self.layout_analyzer
    if not analyzer:
        logger.error("Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?).")
        return ElementCollection([])

    analyzer.analyze_layout(
        engine=engine,
        options=options,
        confidence=confidence,
        classes=classes,
        exclude_classes=exclude_classes,
        device=device,
        existing=existing,
    )

    matches = []
    for region in self._element_mgr.regions:
        if region.source != 'detected':
            continue
        # With an engine given, keep only regions tagged with that model.
        if engine and getattr(region, 'model', None) != engine:
            continue
        matches.append(region)

    return ElementCollection(matches)
1180
+
1181
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]:  # Return Optional
    """
    Get the section between two elements on this page.

    A region spanning (0, 0, width, height) is created first and the work is
    delegated to that region's ``get_section_between``.

    Returns:
        The delegated result, or None if the delegated call raises (the
        error is logged). Region creation itself is not guarded.
    """
    whole_page = self.create_region(0, 0, self.width, self.height)

    try:
        section = whole_page.get_section_between(
            start_element=start_element,
            end_element=end_element,
            boundary_inclusion=boundary_inclusion,
        )
    except Exception as e:
        logger.error(f"Error getting section between elements on page {self.index}: {e}", exc_info=True)
        return None
    return section
1198
+
1199
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 boundary_inclusion='both',
                 y_threshold=5.0,
                 bounding_box=None) -> 'ElementCollection[Region]':  # Updated type hint
    """
    Split the page into vertical sections delimited by start/end elements.

    Boundaries are sorted by (top, x0) and scanned once. A section opens at a
    start element and closes at the next end element, at the next start
    element (which also opens the following section), or at the bottom bound
    if nothing closes it.

    Args:
        start_elements: Selector string, object with ``.elements``, or list.
        end_elements: Same accepted forms as ``start_elements``.
        boundary_inclusion: One of 'start', 'end', 'both', 'none'; decides
            whether each boundary element's own extent is inside the section.
        y_threshold: Accepted but not used by this implementation.
        bounding_box: Optional (x0, top, x1, bottom), clamped to the page.
            It supplies the x-range of every section and the bottom of an
            unclosed last section.

    Returns:
        An ElementCollection of the created regions. It is empty if there are
        no start elements or a boundary lacks ``top``/``x0``.

    Raises:
        ValueError: If ``boundary_inclusion`` is not a recognised value.
    """
    def clamped_bounds():
        # Evaluated lazily, only when a region is about to be created.
        if not bounding_box:
            return 0, 0, self.width, self.height
        bx0, btop, bx1, bbottom = bounding_box
        return max(0, bx0), max(0, btop), min(self.width, bx1), min(self.height, bbottom)

    def as_element_list(spec):
        # Selector string -> query the page; collection -> unwrap; None -> [].
        if isinstance(spec, str):
            spec = self.find_all(spec).elements
        elif hasattr(spec, 'elements'):
            spec = spec.elements
        return [] if spec is None else spec

    start_elements = as_element_list(start_elements)
    end_elements = as_element_list(end_elements)

    valid_inclusions = ['start', 'end', 'both', 'none']
    if boundary_inclusion not in valid_inclusions:
        raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")

    if not start_elements:
        return ElementCollection([])

    tagged = [(el, 'start') for el in start_elements]
    tagged.extend((el, 'end') for el in end_elements)

    try:
        ordered = sorted(tagged, key=lambda pair: (pair[0].top, pair[0].x0))
    except AttributeError as e:
        logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
        return ElementCollection([])  # Cannot proceed if elements lack position

    include_start = boundary_inclusion in ['start', 'both']
    include_end = boundary_inclusion in ['end', 'both']

    sections = []

    def close_section(opened_by, closed_by, closed_by_next_start):
        # Both implicit (next start) and explicit (end) closings use the same bounds.
        upper = opened_by.top if include_start else opened_by.bottom
        lower = closed_by.bottom if include_end else closed_by.top
        if upper < lower:  # Skip empty or inverted spans
            left, _, right, _ = clamped_bounds()
            section = self.create_region(left, upper, right, lower)
            section.start_element = opened_by
            section.end_element = closed_by
            section.is_end_next_start = closed_by_next_start
            sections.append(section)

    open_start = None  # Start element of the section in progress, if any
    for boundary, role in ordered:
        if role == 'start':
            if open_start is not None:
                close_section(open_start, boundary, True)
            open_start = boundary
        elif role == 'end' and open_start is not None:
            close_section(open_start, boundary, False)
            open_start = None

    # A section still open here runs down to the bottom bound.
    if open_start is not None:
        upper = open_start.top if include_start else open_start.bottom
        left, _, right, floor = clamped_bounds()
        if upper < floor:
            section = self.create_region(left, upper, right, floor)
            section.start_element = open_start
            section.end_element = None  # Ended by page end
            section.is_end_next_start = False
            sections.append(section)

    return ElementCollection(sections)
1315
+
1316
def __repr__(self) -> str:
    """Return a short debug string showing the page's number and index."""
    number, index = self.number, self.index
    return f"<Page number={number} index={index}>"
1319
+
1320
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Ask a question about this page using the document QA engine.

    Args:
        question: The question text.
        min_confidence: Forwarded to the QA engine.
        model: Optional model name; when truthy it selects the QA engine,
            otherwise the default engine is requested.
        debug: Forwarded to the QA engine.
        **kwargs: Extra keyword arguments forwarded to the QA engine.

    Returns:
        The QA engine's result. If anything raises (including a missing QA
        module), the error is logged and a "not found" dict is returned.
    """
    def not_found():
        # Result shape used for every failure path.
        return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}

    try:
        from natural_pdf.qa.document_qa import get_qa_engine
        if model:
            engine = get_qa_engine(model_name=model)
        else:
            engine = get_qa_engine()
        return engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
    except ImportError:
        logger.error("Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies.")
        return not_found()
    except Exception as e:
        logger.error(f"Error during page.ask: {e}", exc_info=True)
        return not_found()
1336
+
1337
def show_preview(self,
                 temporary_highlights: List[Dict],
                 scale: float = 2.0,
                 width: Optional[int] = None,
                 labels: bool = True,
                 legend_position: str = 'right',
                 render_ocr: bool = False) -> Optional[Image.Image]:
    """
    Render a preview of this page showing the supplied temporary highlights.

    The work is delegated to the highlighting service's ``render_preview``.

    Args:
        temporary_highlights: List of highlight data dictionaries.
        scale: Scale factor for rendering.
        width: Accepted, but not forwarded to the highlighter here.
        labels: Whether to include a legend.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text.

    Returns:
        The value returned by ``render_preview``, or None if the call raises
        (the error is logged).
    """
    try:
        return self._highlighter.render_preview(
            page_index=self.index,
            temporary_highlights=temporary_highlights,
            scale=scale,
            labels=labels,
            legend_position=legend_position,
            render_ocr=render_ocr,
        )
    except AttributeError:
        logger.error(f"HighlightingService does not have the required 'render_preview' method.")
        return None
    except Exception as e:
        logger.error(f"Error calling highlighter.render_preview for page {self.index}: {e}", exc_info=True)
        return None
1379
+
1380
@property
def text_style_labels(self) -> List[str]:
    """
    Sorted, de-duplicated style labels taken from ``_text_styles_summary``.

    If the summary attribute is missing or empty, ``analyze_text_styles()``
    is called once with default options and the summary is checked again.
    To use custom options, call ``analyze_text_styles(options=...)`` first.

    Returns:
        Sorted list of unique labels, or an empty list (with a warning) if
        no summary is available even after analysis.
    """
    summary = getattr(self, '_text_styles_summary', None)
    if not summary:
        logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
        self.analyze_text_styles()  # Use default options
        summary = getattr(self, '_text_styles_summary', None)

    if not summary:
        logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
        return []

    # Each summary value is a dict carrying the style's 'label'.
    unique_labels = set()
    for info in summary.values():
        unique_labels.add(info['label'])
    return sorted(unique_labels)
1406
+
1407
def viewer(self) -> 'SimpleInteractiveViewerWidget':
    """
    Creates and returns an interactive ipywidget for exploring elements on this page.

    Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.

    Returns:
        A SimpleInteractiveViewerWidget instance ready for display in Jupyter.

    Raises:
        RuntimeError: If the widget module cannot be imported, if ``from_page``
            returns None, or if widget creation raises any other exception.
            The original exception is chained as ``__cause__``.
    """
    logger.info(f"Generating interactive viewer for Page {self.number} using SimpleInteractiveViewerWidget.from_page...")

    try:
        # Import inside the try so a missing optional dependency reaches the
        # ImportError handler below and surfaces as the documented
        # RuntimeError rather than a raw ImportError.
        from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget

        # Delegate creation entirely to the from_page class method
        viewer_widget = SimpleInteractiveViewerWidget.from_page(self)
        if viewer_widget is None:
            raise RuntimeError("SimpleInteractiveViewerWidget.from_page returned None, indicating an issue during widget creation.")

        logger.info("Interactive viewer widget created successfully.")
        return viewer_widget
    except ImportError as e:
        logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
        raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
    except Exception as e:
        logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
        # Re-raise so the failure is visible to the user
        raise RuntimeError(f"Failed to create interactive viewer: {e}") from e