natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,2376 @@
1
+ import pdfplumber
2
+ import os
3
+ import tempfile
4
+ from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
5
+ from PIL import Image
6
+
7
+ if TYPE_CHECKING:
8
+ import pdfplumber
9
+ from natural_pdf.core.pdf import PDF
10
+ from natural_pdf.elements.collections import ElementCollection
11
+ from natural_pdf.utils.highlighting import HighlightManager
12
+ from natural_pdf.elements.base import Element
13
+
14
+ from natural_pdf.elements.region import Region
15
+ from natural_pdf.elements.text import TextElement
16
+ from natural_pdf.analyzers.document_layout import (
17
+ YOLODocLayoutDetector,
18
+ TableTransformerDetector,
19
+ PaddleLayoutDetector,
20
+ convert_to_regions
21
+ )
22
+ from natural_pdf.utils.ocr import OCRManager
23
+
24
+ # Import OCR engines
25
+ try:
26
+ from natural_pdf.ocr import OCREngine, EasyOCREngine, PaddleOCREngine
27
+ HAS_OCR_ENGINES = True
28
+ except ImportError:
29
+ # Fallback if the OCR engines are not available
30
+ HAS_OCR_ENGINES = False
31
+
32
+
33
+ class Page:
34
+ """
35
+ Enhanced Page wrapper built on top of pdfplumber.Page.
36
+
37
+ This class provides a fluent interface for working with PDF pages,
38
+ with improved selection, navigation, extraction, and question-answering capabilities.
39
+ """
40
+
41
+ def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
42
+ """
43
+ Initialize a page wrapper.
44
+
45
+ Args:
46
+ page: pdfplumber page object
47
+ parent: Parent PDF object
48
+ index: Index of this page in the PDF (0-based)
49
+ font_attrs: Font attributes to consider when grouping characters into words.
50
+ Default: ['fontname', 'size'] (Group by font name and size)
51
+ None: Only consider spatial relationships
52
+ List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
53
+ """
54
+ self._page = page
55
+ self._parent = parent
56
+ self._index = index
57
+ self._elements = None # Lazy-loaded
58
+ self._highlight_manager = None # Lazy-loaded
59
+ self._text_styles = None # Lazy-loaded text style analyzer results
60
+ self._exclusions = [] # List to store exclusion functions/regions
61
+
62
+ # Region management
63
+ self._regions = {
64
+ 'detected': [], # Layout detection results
65
+ 'named': {}, # Named regions (name -> region)
66
+ }
67
+
68
+ # Default to grouping by fontname and size if not specified
69
+ self._font_attrs = ['fontname', 'size'] if font_attrs is None else font_attrs
70
+
71
@property
def number(self) -> int:
    """Page number (1-based), taken from the wrapped page's ``page_number``."""
    wrapped_page = self._page
    return wrapped_page.page_number
75
+
76
@property
def index(self) -> int:
    """Page index (0-based), as supplied when this wrapper was created."""
    position = self._index
    return position
80
+
81
@property
def width(self) -> float:
    """Page width, read from the wrapped page."""
    wrapped_page = self._page
    return wrapped_page.width
85
+
86
@property
def height(self) -> float:
    """Page height, read from the wrapped page."""
    wrapped_page = self._page
    return wrapped_page.height
90
+
91
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
    """
    Register an exclusion on this page.

    The value is stored as given; callables are only evaluated later,
    when exclusion regions are resolved.

    Args:
        exclusion_func_or_region: A Region to exclude, or a function that
                                 receives this Page and returns the Region
                                 to exclude

    Returns:
        This page, so calls can be chained
    """
    registered = self._exclusions
    registered.append(exclusion_func_or_region)
    return self
104
+
105
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
    """
    Register a region on this page.

    Args:
        region: The Region object to register
        name: Optional name; a named region replaces any existing region
              registered under the same name

    Returns:
        This page, so calls can be chained

    Raises:
        TypeError: If ``region`` is not a Region instance
    """
    if not isinstance(region, Region):
        raise TypeError("region must be a Region object")

    # Every region registered through this method is tagged with source 'named'
    region.source = 'named'

    if not name:
        # Unnamed regions are kept alongside the detected ones
        self._regions['detected'].append(region)
    else:
        region.name = name
        self._regions['named'][name] = region

    # Nothing more to do until the element cache has been built
    loaded = self._elements
    if loaded is None:
        return self

    # Expose the region to selector queries, without duplicating it
    if 'regions' not in loaded:
        loaded['regions'] = []
    selectable = loaded['regions']
    if region not in selectable:
        selectable.append(region)

    return self
141
+
142
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
    """
    Register several regions on this page via add_region().

    Args:
        regions: Region objects to register
        prefix: When given, regions are named prefix_1, prefix_2, ... in
                list order; otherwise they are registered without names

    Returns:
        This page, so calls can be chained
    """
    if not prefix:
        for unnamed in regions:
            self.add_region(unnamed)
        return self

    # Sequential names start at 1
    for position, item in enumerate(regions, start=1):
        self.add_region(item, name=f"{prefix}_{position}")
    return self
163
+
164
+ def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
165
+ """
166
+ Get all exclusion regions for this page.
167
+
168
+ Args:
169
+ include_callable: Whether to evaluate callable exclusion functions
170
+ debug: Enable verbose debug logging for exclusion evaluation
171
+
172
+ Returns:
173
+ List of Region objects to exclude
174
+ """
175
+ regions = []
176
+
177
+ # Track exclusion results for debugging
178
+ if debug:
179
+ print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
180
+
181
+ for i, exclusion in enumerate(self._exclusions):
182
+ # Get exclusion label if it's a tuple from PDF level
183
+ exclusion_label = f"exclusion {i}"
184
+ original_exclusion = exclusion
185
+
186
+ # Check if it's a tuple from PDF.add_exclusion
187
+ if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
188
+ # This is likely from PDF.add_exclusion with (func, label)
189
+ exclusion_func, label = exclusion
190
+ if label:
191
+ exclusion_label = label
192
+ exclusion = exclusion_func
193
+
194
+ # Process callable exclusion functions
195
+ if callable(exclusion) and include_callable:
196
+ # It's a function, call it with this page
197
+ try:
198
+ if debug:
199
+ print(f" - Evaluating callable {exclusion_label}...")
200
+
201
+ # Create a temporary copy of exclusions to avoid recursion
202
+ original_exclusions = self._exclusions
203
+ self._exclusions = [] # Temporarily clear exclusions
204
+
205
+ # Call the function
206
+ region = exclusion(self)
207
+
208
+ # Restore exclusions
209
+ self._exclusions = original_exclusions
210
+
211
+ if region:
212
+ regions.append(region)
213
+ if debug:
214
+ print(f" ✓ Added region: {region}")
215
+ else:
216
+ if debug:
217
+ print(f" ✗ Function returned None, no region added")
218
+
219
+ except Exception as e:
220
+ error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
221
+ print(error_msg)
222
+ # Print more detailed traceback for debugging
223
+ import traceback
224
+ print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
225
+
226
+ # Process direct Region objects
227
+ elif not callable(exclusion):
228
+ # It's already a Region object
229
+ regions.append(exclusion)
230
+ if debug:
231
+ print(f" - Added direct region: {exclusion}")
232
+
233
+ if debug:
234
+ print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
235
+
236
+ return regions
237
+
238
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element on this page that matches a selector.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to drop elements lying in exclusion regions (default: True)
        regex: Whether :contains text search uses regex (default: False)
        case: Whether text search is case-sensitive (default: True)
        **kwargs: Additional filter parameters

    Returns:
        The first matching element, or None when nothing matches
    """
    from natural_pdf.selectors.parser import parse_selector

    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector filter via kwargs
    kwargs.update(regex=regex, case=case)

    # Match first; exclusions are applied afterwards
    matches = self._apply_selector(parsed, **kwargs)

    if apply_exclusions and self._exclusions and matches:
        # Resolve exclusions, including callable ones
        excluded = self._get_exclusion_regions(include_callable=True)
        if excluded:
            matches = matches.exclude_regions(excluded)

    if not matches:
        return None
    return matches.first
272
+
273
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
    """
    Return every element on this page that matches a selector.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Whether to drop elements lying in exclusion regions (default: True)
        regex: Whether :contains text search uses regex (default: False)
        case: Whether text search is case-sensitive (default: True)
        **kwargs: Additional filter parameters

    Returns:
        ElementCollection holding the matching elements
    """
    from natural_pdf.selectors.parser import parse_selector

    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector filter via kwargs
    kwargs.update(regex=regex, case=case)

    # Match first; exclusions are applied afterwards
    matches = self._apply_selector(parsed, **kwargs)

    wants_filtering = apply_exclusions and self._exclusions and matches
    if not wants_filtering:
        return matches

    # Resolve exclusions, including callable ones
    excluded = self._get_exclusion_regions(include_callable=True)
    if not excluded:
        return matches
    return matches.exclude_regions(excluded)
307
+
308
def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> 'ElementCollection':
    """
    Apply a parsed selector to the elements of this page.

    Args:
        selector_obj: Parsed selector dictionary
        apply_exclusions: Accepted for signature compatibility but not used
                          here; exclusions are applied by the callers
                          (find / find_all) via exclude_regions, because
                          applying them here would cause recursion
        **kwargs: Additional filter parameters. They are forwarded to the
                  selector filter; ``near_threshold`` (default 50) and
                  ``reading_order`` (default True) are also read here.

    Returns:
        ElementCollection of matching elements
    """
    from natural_pdf.elements.collections import ElementCollection
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    # Load all elements if not already loaded
    self._load_elements()
    elements = self._elements

    element_type = selector_obj.get('type', 'any').lower()

    # Selector types that map one-to-one onto a key of the element cache
    direct_keys = {
        'char': 'chars',
        'word': 'words',
        'rect': 'rects',
        'rectangle': 'rects',
        'line': 'lines',
    }

    if element_type == 'text':
        # Prefer word elements over character elements for text
        text_key = 'words' if 'words' in elements else 'chars'
        elements_to_search = elements.get(text_key, [])
    elif element_type in direct_keys:
        elements_to_search = elements.get(direct_keys[element_type], [])
    elif element_type == 'region':
        # Regions registered in the element cache take precedence
        elements_to_search = list(elements.get('regions') or [])
        if not elements_to_search:
            # Otherwise fall back to the page's own region registry
            elements_to_search.extend(self._regions['detected'])
            elements_to_search.extend(self._regions['named'].values())
    else:
        # 'any', or a type with no dedicated category: search everything
        elements_to_search = []
        for key, elements_list in elements.items():
            # Skip chars if we have words (avoids matching the same text twice)
            if key == 'chars' and 'words' in elements:
                continue
            elements_to_search.extend(elements_list)

    # Create filter function from selector, passing any additional parameters
    filter_func = selector_to_filter_func(selector_obj, **kwargs)
    matching_elements = [element for element in elements_to_search if filter_func(element)]

    def distance(el1, el2):
        """Distance between the centers of two elements."""
        el1_center_x = (el1.x0 + el1.x1) / 2
        el1_center_y = (el1.top + el1.bottom) / 2
        el2_center_x = (el2.x0 + el2.x1) / 2
        el2_center_y = (el2.top + el2.bottom) / 2
        return ((el1_center_x - el2_center_x) ** 2 + (el1_center_y - el2_center_y) ** 2) ** 0.5

    # Handle spatial pseudo-classes that require relationship checking
    for pseudo in selector_obj.get('pseudo_classes', []):
        name = pseudo.get('name')
        args = pseudo.get('args', '')

        if name not in ('above', 'below', 'near', 'left-of', 'right-of'):
            continue

        # Find the reference element first
        ref_selector = parse_selector(args) if isinstance(args, str) else args
        ref_elements = self._apply_selector(ref_selector)

        if not ref_elements:
            # No reference elements found, so no matches
            return ElementCollection([])

        # Use the first reference element for now.
        # `first` is read as an attribute, matching its use in find().
        # TODO: Improve this to consider all reference elements
        ref_element = ref_elements.first

        # Filter elements based on spatial relationship
        if name == 'above':
            matching_elements = [el for el in matching_elements if el.bottom <= ref_element.top]
        elif name == 'below':
            matching_elements = [el for el in matching_elements if el.top >= ref_element.bottom]
        elif name == 'left-of':
            matching_elements = [el for el in matching_elements if el.x1 <= ref_element.x0]
        elif name == 'right-of':
            matching_elements = [el for el in matching_elements if el.x0 >= ref_element.x1]
        elif name == 'near':
            # Get distance threshold from kwargs or use default
            threshold = kwargs.get('near_threshold', 50)  # Default 50 points
            matching_elements = [el for el in matching_elements if distance(el, ref_element) <= threshold]

    # Sort elements in reading order if requested
    if kwargs.get('reading_order', True):
        # TODO: Implement proper reading order sorting
        # For now, simple top-to-bottom, left-to-right ordering
        matching_elements.sort(key=lambda el: (el.top, el.x0))

    return ElementCollection(matching_elements)
436
+
437
def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
    """
    Build a Region on this page from explicit coordinates.

    Args:
        x0: Left x-coordinate
        top: Top y-coordinate
        x1: Right x-coordinate
        bottom: Bottom y-coordinate

    Returns:
        Region object bound to this page with the given bounding box
    """
    from natural_pdf.elements.region import Region

    bbox = (x0, top, x1, bottom)
    return Region(self, bbox)
452
+
453
def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
           width: str = "full") -> Any:
    """
    Create a region on this page with more intuitive named parameters.

    Args:
        left: Left x-coordinate (default: 0)
        top: Top y-coordinate (default: 0)
        right: Right x-coordinate (default: page width)
        bottom: Bottom y-coordinate (default: page height)
        width: Width mode - "full" or "element". In either mode an
               explicitly supplied ``left``/``right`` is honored; a side
               that is not supplied spans to the page edge.

    Returns:
        Region object for the specified coordinates

    Raises:
        ValueError: If ``width`` is neither "full" nor "element"

    Examples:
        >>> page.region(top=100, bottom=200)  # Full width from y=100 to y=200
        >>> page.region(left=50, right=150, top=100, bottom=200)  # Specific rectangle
    """
    from natural_pdf.elements.region import Region

    if width not in ("full", "element"):
        raise ValueError("Width must be 'full' or 'element'")

    # Missing coordinates extend to the corresponding page edge. Explicit
    # left/right are kept even in the default "full" mode, so the
    # "specific rectangle" call shown above does what it says rather than
    # being silently widened to the full page.
    left = 0 if left is None else left
    top = 0 if top is None else top
    right = self.width if right is None else right
    bottom = self.height if bottom is None else bottom

    return Region(self, (left, top, right, bottom))
488
+
489
def get_elements(self, apply_exclusions=True) -> List['Element']:
    """
    Collect the word, rect and line elements of this page.

    Args:
        apply_exclusions: Whether elements inside exclusion regions are dropped

    Returns:
        List of elements: words first, then rects, then lines
    """
    # Make sure the element cache is populated
    self._load_elements()

    combined = [*self.words, *self.rects, *self.lines]

    # No filtering requested, or nothing registered to filter by
    if not (apply_exclusions and self._exclusions):
        return combined

    excluded_areas = self._get_exclusion_regions(include_callable=True)
    if not excluded_areas:
        return combined

    # Keep only the elements that fall in none of the exclusion regions
    return [
        item
        for item in combined
        if not any(area._is_element_in_region(item) for area in excluded_areas)
    ]
526
+
527
def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
    """
    Filter a list of elements based on a selector.

    Args:
        elements: List of elements to filter
        selector: CSS-like selector string
        **kwargs: Additional filter parameters. They are forwarded to the
                  selector filter, as _apply_selector does;
                  ``reading_order`` (default True) also controls whether
                  the result is sorted top-to-bottom, left-to-right.

    Returns:
        List of elements that match the selector
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    selector_obj = parse_selector(selector)

    # Forward kwargs so the documented "additional filter parameters"
    # actually reach the filter instead of being silently dropped.
    filter_func = selector_to_filter_func(selector_obj, **kwargs)

    matching_elements = [element for element in elements if filter_func(element)]

    # Sort elements in reading order if requested
    if kwargs.get('reading_order', True):
        matching_elements.sort(key=lambda el: (el.top, el.x0))

    return matching_elements
555
+
556
def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
    """
    Build a full-width region from the top of the page down to a matching element.

    Args:
        selector: CSS-like selector string
        include_endpoint: Whether the region extends to the bottom of the
                          matched element (True) or stops at its top (False)
        **kwargs: Additional selection parameters, passed on to find()

    Returns:
        Region object for the selected content. When nothing matches the
        selector, a region covering the whole page is returned.

    Examples:
        >>> page.until('text:contains("Conclusion")')  # Select from top to conclusion
        >>> page.until('line[width>=2]', include_endpoint=False)  # Select up to thick line
    """
    from natural_pdf.elements.region import Region

    target = self.find(selector, **kwargs)
    if not target:
        # No endpoint found: fall back to the entire page
        return Region(self, (0, 0, self.width, self.height))

    # The lower edge depends on whether the endpoint itself is included
    lower_edge = target.bottom if include_endpoint else target.top
    selected = Region(self, (0, 0, self.width, lower_edge))
    selected.end_element = target
    return selected
590
+
591
+ # Alias for backward compatibility
592
def select_until(self, selector: str, include_target: bool = True, **kwargs) -> Any:
    """
    DEPRECATED: Use until() instead.

    Emits a DeprecationWarning and delegates to until().

    Args:
        selector: CSS-like selector string
        include_target: Passed to until() as ``include_endpoint``
        **kwargs: Additional selection parameters

    Returns:
        Whatever until() returns for the same arguments
    """
    import warnings

    message = "select_until() is deprecated and will be removed in a future version. Use until() instead."
    # stacklevel=2 attributes the warning to the caller of select_until()
    warnings.warn(message, DeprecationWarning, stacklevel=2)

    forwarded = dict(kwargs, include_endpoint=include_target)
    return self.until(selector, **forwarded)
612
+
613
def crop(self, bbox=None, **kwargs) -> Any:
    """
    Crop the page to a bounding box.

    Thin pass-through to the wrapped page's crop method; the result is
    returned as-is, not wrapped.

    Args:
        bbox: Bounding box (x0, top, x1, bottom) or None
        **kwargs: Additional parameters forwarded unchanged

    Returns:
        Whatever the wrapped page's crop() returns
    """
    # TODO: Create proper wrapper for cropped page
    underlying = self._page
    return underlying.crop(bbox, **kwargs)
628
+
629
def extract_text(self,
                 preserve_whitespace=True,
                 use_exclusions=True,
                 debug_exclusions=False, ocr=None, **kwargs) -> str:
    """
    Extract text from this page, respecting any exclusion regions.

    When no exclusions are registered, exclusions are disabled, or no
    exclusion resolves to a region, extraction is delegated directly to the
    wrapped page. Otherwise text elements are collected, those lying in an
    exclusion region are dropped, and the text is extracted from the rest.

    Args:
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True)
        debug_exclusions: Whether to output detailed exclusion debugging info (default: False)
        ocr: OCR configuration. Only consulted on the exclusion-filtering
             path; treated as a request for OCR when it is True or a dict
             with a truthy 'enabled' entry
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text as string
    """
    if not self._exclusions or not use_exclusions:
        # If no exclusions or exclusions disabled, use regular extraction
        # NOTE(review): `ocr` is not consulted on this early-return path.
        if debug_exclusions:
            print(f"Page {self.index}: No exclusions to apply or use_exclusions=False")
        # Note: pdfplumber still uses keep_blank_chars parameter
        return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)

    # Get all exclusion regions
    if debug_exclusions:
        print(f"Page {self.index}: Getting exclusion regions with debugging enabled")

    # Important: We need to evaluate lambda functions from PDF level
    # These functions are stored directly in _exclusions and not as tuples
    exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)

    if not exclusion_regions:
        if debug_exclusions:
            print(f"Page {self.index}: No valid exclusion regions were found")
        # Note: pdfplumber still uses keep_blank_chars parameter
        return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)

    if debug_exclusions:
        print(f"Page {self.index}: Found {len(exclusion_regions)} exclusion regions to apply")

    # Find all text elements
    # NOTE(review): find_all is called with its default apply_exclusions=True,
    # so the collection may already be exclusion-filtered before the loop
    # below runs (and callable exclusions are evaluated a second time) —
    # confirm whether apply_exclusions=False was intended here.
    all_text = self.find_all('text')

    if debug_exclusions:
        print(f"Page {self.index}: Found {len(all_text)} text elements before exclusion filtering")

    # Filter out elements in excluded regions
    filtered_elements = []
    excluded_count = 0

    for element in all_text:
        exclude = False
        for region in exclusion_regions:
            if region._is_element_in_region(element):
                exclude = True
                excluded_count += 1
                # One matching region is enough; stop checking the others
                break
        if not exclude:
            filtered_elements.append(element)

    if debug_exclusions:
        print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}")

    # Extract text from filtered elements
    from natural_pdf.elements.collections import ElementCollection
    collection = ElementCollection(filtered_elements)
    result = collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

    # Apply OCR if explicitly requested
    use_ocr = ocr is True or (ocr is not None and isinstance(ocr, dict) and ocr.get('enabled', False))
    if use_ocr:
        # Process OCR parameter into normalized config
        ocr_config = self._get_ocr_config(ocr)

        # Apply OCR if explicitly enabled or in auto mode and no text found
        if ocr_config.get('enabled') is True or ocr is True or (
            ocr_config.get('enabled') == 'auto' and not result.strip()
        ):
            print(f"Using OCR for extract_text")
            # Get existing OCR elements or run OCR
            if any(elem.source == 'ocr' for elem in filtered_elements):
                # We already have OCR elements, just re-extract from them
                ocr_elements = [elem for elem in filtered_elements if elem.source == 'ocr']
                ocr_collection = ElementCollection(ocr_elements)
                ocr_text = ocr_collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

                # OCR text replaces the earlier result only when non-blank
                if ocr_text.strip():
                    result = ocr_text
            else:
                # Run OCR and get text from OCR elements
                ocr_elements = self.apply_ocr(**ocr_config)

                if ocr_elements:
                    # Filter OCR elements by exclusions
                    # (use_exclusions is always truthy here: the falsy case
                    # returned at the top of the method)
                    if use_exclusions:
                        filtered_ocr = []
                        for element in ocr_elements:
                            exclude = False
                            for region in exclusion_regions:
                                if region._is_element_in_region(element):
                                    exclude = True
                                    break
                            if not exclude:
                                filtered_ocr.append(element)
                    else:
                        filtered_ocr = ocr_elements

                    ocr_collection = ElementCollection(filtered_ocr)
                    ocr_text = ocr_collection.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

                    # Use OCR text if it's not empty
                    if ocr_text.strip():
                        result = ocr_text

    if debug_exclusions:
        print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied")

    return result
748
+
749
def extract_table(self, table_settings=None) -> List[Any]:
    """
    Extract the largest table from this page.

    Args:
        table_settings: Optional dict of additional extraction parameters
            forwarded to pdfplumber. ``None`` (the default) means
            "no overrides".

    Returns:
        The result of pdfplumber's ``extract_table`` for this page
        (a single table, not a list of tables).
    """
    # A fresh dict is created per call so that no mutable default object
    # is shared between calls (the callee could otherwise mutate it).
    settings = {} if table_settings is None else table_settings

    # For now, directly use pdfplumber's extraction
    return self._page.extract_table(settings)
761
+
762
def extract_tables(self, table_settings=None) -> List[Any]:
    """
    Extract tables from this page.

    Args:
        table_settings: Optional dict of additional extraction parameters
            forwarded to pdfplumber. ``None`` (the default) means
            "no overrides".

    Returns:
        List of extracted tables
    """
    # A fresh dict is created per call so that no mutable default object
    # is shared between calls (the callee could otherwise mutate it).
    settings = {} if table_settings is None else table_settings

    # For now, directly use pdfplumber's extraction
    return self._page.extract_tables(settings)
774
+
775
def _load_elements(self, include_ocr=None):
    """
    Load all elements from the page (lazy loading).

    On the first call this builds character, word, rectangle and line
    elements from the underlying pdfplumber page and stores them in
    ``self._elements``. Later calls return immediately.

    Args:
        include_ocr: Whether to include OCR text elements. If True, OCR is
            always applied. If None, OCR is applied only when the PDF-level
            OCR config has ``enabled == 'auto'`` and fewer than 5 words
            were found on the page.
    """
    if self._elements is not None:
        # Already loaded
        return

    import re
    from itertools import groupby

    from natural_pdf.elements.text import TextElement
    from natural_pdf.elements.rect import RectangleElement
    from natural_pdf.elements.line import LineElement

    # Font attributes that must match for two chars to share a word
    font_attrs = self._font_attrs or ()

    # Get keep_spaces setting from PDF config or default to True (new behavior)
    keep_spaces = self._parent._config.get('keep_spaces', True)

    # Use a wider gap threshold when keep_spaces is enabled to allow for natural spaces
    gap_factor = 1.5 if keep_spaces else 0.5

    def same_font(a, b):
        """True when every tracked font attribute is present and equal in both chars."""
        return all(
            attr in a and attr in b and a[attr] == b[attr]
            for attr in font_attrs
        )

    def build_word(word_chars):
        """Merge a run of char dicts into one word-level TextElement."""
        first = word_chars[0]

        # Combine text from characters and collapse runs of whitespace
        text = re.sub(r'\s+', ' ', ''.join(c['text'] for c in word_chars))

        word_obj = {
            'text': text,
            'x0': min(c['x0'] for c in word_chars),
            'x1': max(c['x1'] for c in word_chars),
            'top': min(c['top'] for c in word_chars),
            'bottom': max(c['bottom'] for c in word_chars),
            'fontname': first.get('fontname', ''),
            'size': first.get('size', 0),
            'object_type': 'word',
            'page_number': first['page_number'],
        }

        # Real font name and color are taken from the first char of the word
        for key in ('real_fontname', 'non_stroking_color'):
            if key in first:
                word_obj[key] = first[key]

        # Copy any additional font attributes
        for attr in font_attrs:
            if attr in first:
                word_obj[attr] = first[attr]

        return TextElement(word_obj, self)

    # Process characters, annotating with font information
    chars = []
    for c in self._page.chars:
        fontname = c.get('fontname', '')
        # Check for font references (F0, F1, etc.) and map to actual fonts
        if fontname.startswith('F') and len(fontname) <= 3:
            try:
                # Try to get font info from resources
                fonts = self._page.page_obj.get('Resources', {}).get('Font', {})
                if fonts and fontname in fonts:
                    base_font = fonts[fontname].get('BaseFont')
                    if base_font:
                        c['real_fontname'] = base_font
            except (KeyError, AttributeError, TypeError):
                # Best effort only: the char simply keeps its reference name
                pass

        chars.append(TextElement(c, self))

    # Sort chars by y-position (line) and then x-position
    sorted_chars = sorted(self._page.chars, key=lambda c: (round(c['top']), c['x0']))

    # Group chars by line (similar y-position), then into words based on
    # x-distance and font attributes
    words = []
    for _, line_chars in groupby(sorted_chars, key=lambda c: round(c['top'])):
        current_word = []

        for char in line_chars:
            if char['text'].isspace():
                if keep_spaces:
                    # Keep the space inside the running word (appended
                    # exactly once); leading spaces of a line are dropped.
                    if current_word:
                        current_word.append(char)
                elif current_word:
                    # Original behavior: whitespace closes the current word
                    words.append(build_word(current_word))
                    current_word = []
                continue

            if current_word:
                prev_char = current_word[-1]
                # A font change or a large horizontal gap starts a new word
                if (not same_font(char, prev_char)
                        or char['x0'] - prev_char['x1'] > prev_char['width'] * gap_factor):
                    words.append(build_word(current_word))
                    current_word = []

            current_word.append(char)

        # Handle the last word of the line if there is one
        if current_word:
            words.append(build_word(current_word))

    self._elements = {
        'chars': chars,
        'words': words,
        'rects': [RectangleElement(r, self) for r in self._page.rects],
        'lines': [LineElement(l, self) for l in self._page.lines],
        # Add other element types as needed
    }

    # OCR is applied when explicitly requested, or in PDF-level auto mode
    # when few or no text elements were found (5 is an arbitrary threshold)
    should_ocr = include_ocr is True or (
        include_ocr is None
        and self._parent._ocr_config.get('enabled') == 'auto'
        and len(words) < 5
    )

    if should_ocr:
        # OCR elements are already added to self._elements in apply_ocr()
        self.apply_ocr()
1036
+
1037
@property
def chars(self) -> List[Any]:
    """Character elements of this page (elements are loaded on first access)."""
    self._load_elements()
    loaded = self._elements
    return loaded['chars']
1042
+
1043
@property
def words(self) -> List[Any]:
    """Word elements of this page (elements are loaded on first access)."""
    self._load_elements()
    loaded = self._elements
    return loaded['words']
1048
+
1049
@property
def rects(self) -> List[Any]:
    """Rectangle elements of this page (elements are loaded on first access)."""
    self._load_elements()
    loaded = self._elements
    return loaded['rects']
1054
+
1055
@property
def lines(self) -> List[Any]:
    """Line elements of this page (elements are loaded on first access)."""
    self._load_elements()
    loaded = self._elements
    return loaded['lines']
1060
+
1061
@property
def _highlight_mgr(self) -> 'HighlightManager':
    """Return this page's highlight manager, creating it on first use."""
    manager = self._highlight_manager
    if manager is None:
        # Imported here rather than at module level; created only once
        from natural_pdf.utils.highlighting import HighlightManager
        manager = HighlightManager(self)
        self._highlight_manager = manager
    return manager
1068
+
1069
def highlight(self,
              color: Optional[Tuple[int, int, int, int]] = None,
              label: Optional[str] = None) -> 'Page':
    """
    Add a highlight that covers the whole page.

    Args:
        color: RGBA color tuple for the highlight, or None to use the next color
        label: Optional label for the highlight

    Returns:
        Self for method chaining
    """
    # The highlighted box spans from the origin to the page's full extent
    full_page_bbox = (0, 0, self.width, self.height)
    self._highlight_mgr.add_highlight(full_page_bbox, color, label)
    return self
1087
+
1088
def show(self,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> Image.Image:
    """
    Render the page, including any highlights, as an image.

    Args:
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        PIL Image of the page with highlights
    """
    # Thin wrapper: all rendering is delegated to to_image()
    render_options = {
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    return self.to_image(**render_options)
1115
+
1116
+
1117
+
1118
def save_image(self,
               filename: str,
               scale: float = 2.0,
               width: Optional[int] = None,
               labels: bool = True,
               legend_position: str = 'right',
               render_ocr: bool = False) -> 'Page':
    """
    Write the page, including any highlights, to an image file.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Self for method chaining
    """
    # to_image() both renders and saves when given a path; its return
    # value (the image) is not needed here.
    render_options = {
        'path': filename,
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    self.to_image(**render_options)
    return self
1149
+
1150
def debug_ocr(self, output_path):
    """
    Generate an interactive HTML debug report for OCR results.

    The report is a single HTML file offering:
    - Side-by-side view of image regions and OCR text
    - Confidence scores with color coding
    - Editable correction fields
    - Filtering and sorting options
    - Export functionality for corrected text

    Args:
        output_path: Path to save the HTML report

    Returns:
        Path to the generated HTML file
    """
    from natural_pdf.utils.ocr import debug_ocr_to_html as build_report

    # The report builder takes a list of pages; here it is just this page
    pages = [self]
    return build_report(pages, output_path)
1169
+
1170
def save(self,
         filename: str,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = False,
         legend_position: str = 'right') -> 'Page':
    """
    DEPRECATED: Use to_image() instead.
    Save the page with any highlights to an image file.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels (default: False)
        legend_position: Position of the legend

    Returns:
        Self for method chaining
    """
    import warnings
    warnings.warn(
        "save() is deprecated and will be removed in a future version. Use to_image() instead.",
        DeprecationWarning,
        stacklevel=2
    )
    # to_image() names this parameter `labels`. Passing it under any other
    # keyword would land in to_image's **kwargs and the caller's choice
    # would not reach the legend setting.
    self.to_image(
        path=filename,
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position
    )
    return self
1194
+
1195
def clear_highlights(self) -> 'Page':
    """
    Remove every highlight from the page.

    Returns:
        Self for method chaining
    """
    manager = self._highlight_mgr
    manager.clear_highlights()
    return self
1204
+
1205
def analyze_text_styles(self) -> Dict[str, 'ElementCollection']:
    """
    Group the page's text elements by their style properties.

    The result is also stored on the page as ``self._text_styles``.

    Returns:
        Dictionary mapping style labels to element collections
    """
    from natural_pdf.analyzers.text_structure import TextStyleAnalyzer

    # A fresh analyzer is created on every call; its result replaces any
    # previously stored styles.
    self._text_styles = TextStyleAnalyzer().analyze(self)
    return self._text_styles
1223
+
1224
def highlight_text_styles(self) -> 'Page':
    """
    Highlight text elements, one highlight group per text style.

    Styles are analyzed first if that has not been done yet.

    Returns:
        Self for method chaining
    """
    styles = self._text_styles
    if styles is None:
        # analyze_text_styles() stores its result on self._text_styles
        self.analyze_text_styles()
        styles = self._text_styles

    # Each style group is highlighted under its own label
    for style_label, group in styles.items():
        group.highlight(label=style_label)

    return self
1242
+
1243
def highlight_all(self,
                  include_types: Optional[List[str]] = None,
                  include_text_styles: bool = False,
                  include_layout_regions: bool = False,
                  apply_exclusions: bool = True,
                  use_color_cycling: bool = True,
                  layout_confidence: float = 0.2) -> 'Page':
    """
    Highlight all elements on the page, grouped by type or style.

    Each element type or style gets its own color and label in the legend.

    Args:
        include_types: Optional list of element types to include
                      (e.g., ['text', 'line', 'rect'])
                      If None, all available types will be included
        include_text_styles: Whether to highlight text by style groups
                            (font, size, etc.) instead of as a single group
        include_layout_regions: Whether to include detected layout regions
                               (will run layout detection if not already done)
                               Layout regions will be grouped by model and type
        apply_exclusions: Whether to respect exclusion zones (default: True)
        use_color_cycling: Whether to use different colors for each type (default: True)
        layout_confidence: Confidence threshold for layout regions (default: 0.2)
                          If True is passed, all regions will be included regardless of confidence

    Returns:
        Self for method chaining
    """
    # Load all elements if not already loaded
    self._load_elements()

    # Exclusion regions stay empty unless exclusions are requested and
    # configured, so a non-empty list always implies apply_exclusions.
    exclusion_regions = []
    if apply_exclusions and self._exclusions:
        # Get exclusion regions using callable functions when appropriate
        exclusion_regions = self._get_exclusion_regions(include_callable=True)

    # Define all available element types
    all_types = {
        'text': self.words,
        'char': self.chars,
        'rect': self.rects,
        'line': self.lines,
        # Add other types as they become available
    }

    def highlight_type(element_type):
        """Highlight all elements of one type under a '<Type> Elements' label."""
        elements = all_types.get(element_type)
        # Skip unknown types and empty collections
        if not elements:
            return

        # Create an ElementCollection if needed
        from natural_pdf.elements.collections import ElementCollection
        if not isinstance(elements, ElementCollection):
            elements = ElementCollection(elements)

        if exclusion_regions:
            elements = elements.exclude_regions(exclusion_regions)

        # Format label (e.g., "text" -> "Text Elements")
        elements.highlight(
            label=f"{element_type.capitalize()} Elements",
            use_color_cycling=use_color_cycling,
        )

    # Highlight by text styles if requested.
    # This takes precedence over normal text highlighting.
    if include_text_styles:
        for label, elements in self.analyze_text_styles().items():
            if exclusion_regions:
                elements = elements.exclude_regions(exclusion_regions)
            elements.highlight(label=label, use_color_cycling=use_color_cycling)

        # Non-text elements are highlighted normally. With an explicit
        # include_types only 'text' is dropped; otherwise chars are
        # dropped as well.
        if include_types:
            other_types = [t for t in include_types if t != 'text']
        else:
            other_types = [t for t in all_types if t not in ('text', 'char')]

        for element_type in other_types:
            highlight_type(element_type)

        # NOTE(review): this path returns before layout regions are
        # handled, so include_layout_regions has no effect together with
        # include_text_styles - confirm whether that is intended.
        return self

    # Normal highlight_all behavior (by element type)
    for element_type in (include_types if include_types else all_types.keys()):
        highlight_type(element_type)

    # Include layout regions if requested
    if include_layout_regions:
        def detected_regions():
            """Regions from detected_layout_regions, else _regions['detected'], else []."""
            regions = getattr(self, 'detected_layout_regions', None)
            if regions:
                return regions
            return self._regions.get('detected') or []

        # Run layout detection if not already done
        if not detected_regions():
            self.analyze_layout(confidence=layout_confidence)

        layout_regions = detected_regions()

        # A bool layout_confidence (e.g. True) disables confidence filtering
        if not isinstance(layout_confidence, bool):
            layout_regions = [
                r for r in layout_regions
                if hasattr(r, 'confidence') and r.confidence >= layout_confidence
            ]

        # Group regions by (model, type); regions lacking either attribute
        # are skipped. A dict keeps first-seen order, so the highlight
        # order is deterministic.
        groups = {}
        for region in layout_regions:
            if hasattr(region, 'model') and hasattr(region, 'region_type'):
                groups.setdefault((region.model, region.region_type), []).append(region)

        # Fixed colors for table-structure regions of the 'tatr' model
        tatr_colors = {
            'table': (1, 0, 0, 0.3),                # Red for tables
            'table row': (0, 1, 0, 0.3),            # Green for rows
            'table column': (0, 0, 1, 0.3),         # Blue for columns
            'table column header': (0, 1, 1, 0.3),  # Cyan for column headers
        }

        for (model, region_type), type_regions in groups.items():
            color = tatr_colors.get(region_type) if model == 'tatr' else None
            label = f"Layout ({model}): {region_type}"

            # Highlight each region individually (not as a collection) so
            # each keeps its own confidence score.
            for region in type_regions:
                region.highlight(
                    label=label,
                    color=color,
                    use_color_cycling=use_color_cycling
                    # No include_attrs by default - user must explicitly request it
                )

    return self
1458
+
1459
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False,
             resolution: float = None,
             include_highlights: bool = True,
             **kwargs) -> Image.Image:
    """
    Generate a PIL image of the page, optionally with highlights, and optionally save it to a file.

    Args:
        path: Optional path to save the image to
        scale: Scale factor for rendering highlights (default: 2.0)
        width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
        labels: Whether to include a legend for labels (default: True)
        legend_position: Position of the legend (default: 'right')
        render_ocr: Whether to render OCR text with white background boxes (default: False)
        resolution: Resolution in DPI for base page image (default: scale * 72)
        include_highlights: Whether to include highlights (default: True)
        **kwargs: Additional parameters for pdfplumber.to_image
            (only used when the base page image is rendered, i.e. when
            highlights are not included)

    Returns:
        PIL Image of the page

    Examples:
        >>> # Get base page image without highlights
        >>> img = page.to_image(include_highlights=False)
        >>>
        >>> # Get image with highlights and no labels
        >>> img = page.to_image(labels=False)
        >>>
        >>> # Save image with specific width
        >>> page.to_image(path="output.png", width=800)
    """
    # Derive the DPI from the scale unless given explicitly (72 is base DPI)
    dpi = scale * 72 if resolution is None else resolution

    use_highlights = include_highlights and hasattr(self, '_highlight_mgr')
    if not use_highlights:
        # Base page image straight from pdfplumber
        image = self._page.to_image(resolution=dpi, **kwargs).annotated
    else:
        # Highlighted rendering is driven by scale, not by the DPI above
        image = self._highlight_mgr.get_highlighted_image(scale, labels, legend_position, render_ocr)

    # Optional resize to the requested width, preserving the aspect ratio
    if width is not None and width > 0:
        target_height = int(width * (image.height / image.width))
        image = image.resize((width, target_height), Image.LANCZOS)

    # Optional save
    if path:
        image.save(path)

    return image
1520
+
1521
def _get_ocr_config(self, ocr_params: Optional[Union[bool, str, List, Dict]] = None) -> Dict[str, Any]:
    """
    Build the effective OCR configuration from the PDF-level settings and the provided params.

    The parent PDF's OCR engine is used for normalizing and merging when
    one is available; otherwise the legacy OCRManager singleton is used.
    Status messages are printed along the way: the engine path honors a
    ``verbose`` key in the provided config, the legacy path always prints.

    Args:
        ocr_params: OCR parameters to override defaults

    Returns:
        Merged OCR configuration (the PDF-level config itself when no
        params are given)
    """
    use_engine = HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine

    # Both backends expose normalize_config() and merge_configs()
    backend = self._parent._ocr_engine if use_engine else OCRManager.get_instance()

    # Get normalized PDF-level config
    pdf_config = self._parent._ocr_config

    if ocr_params is None:
        # Nothing to merge: use PDF-level config
        print(f"Using PDF-level OCR config: {pdf_config}")
        return pdf_config

    # Special case: boolean True becomes a config with enabled=True
    if ocr_params is True:
        ocr_params = {"enabled": True}

    provided_config = backend.normalize_config(ocr_params)

    # Always merge configs to get language settings etc. from PDF-level config
    result_config = backend.merge_configs(pdf_config, provided_config)

    # Report which source decided the enabled status
    if "enabled" in provided_config:
        status = f"OCR enabled status from provided params: {provided_config.get('enabled')}"
    else:
        status = f"OCR enabled status from PDF config: {pdf_config.get('enabled')}"

    # Only the engine path can be silenced via verbose=False
    if not use_engine or provided_config.get('verbose', True):
        print(status)

    return result_config
1597
+
1598
def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
    """
    Build TextElement objects from raw OCR detections.

    Every bounding box is multiplied by ``page size / image size`` so the
    element ends up in page coordinates. If either image dimension is
    missing (or zero) the boxes are used unscaled.

    Args:
        ocr_results: Detections; each one is indexed with 'text', 'bbox'
            (x0, top, x1, bottom) and 'confidence'
        image_width: Width of the image the OCR ran on
        image_height: Height of the image the OCR ran on

    Returns:
        The created TextElement objects, in the order of ``ocr_results``.
        Each one is also appended to ``self._elements['words']`` whenever
        ``self._elements`` exists and is not None.
    """
    has_image_size = bool(image_width and image_height)
    x_factor = self.width / image_width if has_image_size else 1.0
    y_factor = self.height / image_height if has_image_size else 1.0

    created = []
    for detection in ocr_results:
        # float() also normalises numpy integer coordinates
        left = float(detection['bbox'][0]) * x_factor
        upper = float(detection['bbox'][1]) * y_factor
        right = float(detection['bbox'][2]) * x_factor
        lower = float(detection['bbox'][3]) * y_factor

        word = TextElement(
            {
                'text': detection['text'],
                'x0': left,
                'top': upper,
                'x1': right,
                'bottom': lower,
                'width': right - left,
                'height': lower - upper,
                'object_type': 'text',
                'source': 'ocr',
                'confidence': detection['confidence'],
                # Placeholder font fields so OCR words look like native ones
                'fontname': 'OCR-detected',
                'size': 10.0,
                'page_number': self.number
            },
            self,
        )
        created.append(word)

        # Register the word with the page unless the element store is unset
        page_elements = getattr(self, '_elements', None)
        if page_elements is not None:
            page_elements.setdefault('words', []).append(word)

    return created
1658
+
1659
def apply_ocr(self, **ocr_params) -> List[TextElement]:
    """
    Apply OCR to this page and register results as text elements.

    The page is rendered with ``self.to_image()``, passed to the parent's
    OCR engine (or to the legacy ``OCRManager`` when no engine is set), and
    the detections are converted by ``_create_text_elements_from_ocr``,
    which also adds them to the page's word list.

    Args:
        **ocr_params: OCR parameters to override defaults. ``enabled`` is
            always forced to True. ``verbose`` (default True) controls the
            status messages and the debug image dump, the same key that
            ``_get_ocr_config`` consults before printing.

    Returns:
        List of created text elements
    """
    # **ocr_params is always a fresh dict owned by this call, so it can be
    # updated in place; this is an explicit OCR request, so force it on.
    ocr_params["enabled"] = True
    verbose = ocr_params.get('verbose', True)

    def log(message):
        if verbose:
            print(message)

    config = self._get_ocr_config(ocr_params)

    # Should not happen after the override above, but keep the safety net
    if not config.get('enabled'):
        log("OCR is disabled in config despite override - forcing enabled=True")
        config["enabled"] = True

    log(f"Rendering page {self.number} to image for OCR...")
    image = self.to_image()
    log(f"Image size: {image.width}x{image.height}")

    # Best-effort debug dump of the rendered page; never fatal, and skipped
    # entirely when the caller asked for quiet operation.
    if verbose:
        try:
            import os
            debug_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "output")
            os.makedirs(debug_dir, exist_ok=True)
            debug_path = os.path.join(debug_dir, f"page_{self.number}_for_ocr.png")
            image.save(debug_path)
            print(f"Saved page image for debugging to {debug_path}")
        except Exception as e:
            print(f"Could not save debug image: {e}")

    log("Processing image with OCR engine...")
    if HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine:
        # Use new OCR engine system
        engine = self._parent._ocr_engine
        log(f"Using OCR engine: {engine.__class__.__name__}")
        results = engine.process_image(image, config)
    else:
        # Fallback to legacy OCR manager
        log("Using legacy OCR manager")
        results = OCRManager.get_instance().detect_and_recognize(image, config)

    log(f"OCR returned {len(results)} results")

    # Image dimensions are passed so boxes get scaled into page coordinates
    return self._create_text_elements_from_ocr(results, image.width, image.height)
1718
+
1719
def extract_ocr_elements(self, **ocr_params) -> List[TextElement]:
    """
    Extract text elements using OCR.

    This method applies OCR to the page and returns the resulting text elements
    without modifying the page's elements list. To achieve that, the page's
    ``_elements`` store is temporarily set to None while the elements are
    built and is restored afterwards, even if element creation fails.

    Args:
        **ocr_params: OCR parameters to override defaults. ``enabled`` is
            always forced to True. ``verbose`` (default True) controls the
            progress messages and the direct-engine diagnostics; pass
            ``verbose=False`` to skip them.

    Returns:
        List of text elements created from OCR, or an empty list when the
        OCR engine raised an exception
    """
    verbose = ocr_params.get('verbose', True)

    def log(message):
        if verbose:
            print(message)

    log("=" * 40)
    log(f"Page.extract_ocr_elements called with params: {ocr_params}")

    # **ocr_params is always a dict; this is an explicit OCR request
    ocr_params["enabled"] = True

    config = self._get_ocr_config(ocr_params)
    log(f"OCR config after normalization: {config}")

    # Should not happen after the override above, but keep the safety net
    if not config.get('enabled'):
        log("OCR is disabled in config despite override - forcing enabled=True")
        config["enabled"] = True

    # Render once and share the image between diagnostics and the real run
    log(f"Rendering page {self.number} to image for OCR...")
    image = self.to_image()
    log(f"Image size: {image.width}x{image.height}")

    if verbose:
        self._run_direct_ocr_diagnostics(image)
        print("Proceeding with normal OCR process...")

    log("Processing image with OCR engine...")
    results = []

    try:
        if HAS_OCR_ENGINES and hasattr(self._parent, '_ocr_engine') and self._parent._ocr_engine:
            # Use new OCR engine system
            engine = self._parent._ocr_engine
            log(f"Using OCR engine: {engine.__class__.__name__}")
            log(f"Direct test of {engine.__class__.__name__}.process_image")
            results = engine.process_image(image, config)
            log(f"Engine returned {len(results)} results")
        else:
            # Fallback to legacy OCR manager
            log("Using legacy OCR manager")
            ocr_mgr = OCRManager.get_instance()
            results = ocr_mgr.detect_and_recognize(image, config)
            log(f"OCR manager returned {len(results)} results")
    except Exception as e:
        # Failures are always reported, regardless of verbosity
        print(f"Error during OCR processing: {e}")
        import traceback
        traceback.print_exc()
        return []

    log(f"OCR returned {len(results)} results")
    if len(results) > 0:
        log(f"First result: {results[0]}")

    # Hide the page's element store so the new elements are not registered
    had_elements = hasattr(self, '_elements')
    saved_elements = self._elements if had_elements else None
    if had_elements:
        self._elements = None

    try:
        log(f"Creating text elements from {len(results)} OCR results...")
        elements = self._create_text_elements_from_ocr(results, image.width, image.height)
        log(f"Created {len(elements)} text elements")
    finally:
        # Restore even on failure so the page is never left without elements
        if saved_elements is not None:
            self._elements = saved_elements

    log(f"Returning {len(elements)} OCR elements")
    log("=" * 40)
    return elements

def _run_direct_ocr_diagnostics(self, image) -> None:
    """
    Best-effort debugging aid: dump the rendered page and probe OCR libraries.

    Saves ``image`` to an ``output`` directory three levels above this file
    and, when the libraries can be imported, runs EasyOCR and PaddleOCR
    directly on it, printing how many results each returned. Results are
    discarded and no exception escapes this method.

    Args:
        image: Rendered page image (must support ``save`` and numpy conversion)
    """
    import os
    try:
        print("Trying direct OCR test for debugging...")

        output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "output")
        os.makedirs(output_dir, exist_ok=True)
        temp_image_path = os.path.join(output_dir, "direct_ocr_debug.png")
        image.save(temp_image_path)
        print(f"Saved image to {temp_image_path}")

        try:
            import easyocr
            print("Testing direct EasyOCR...")
            reader = easyocr.Reader(['en'])
            import numpy as np
            result = reader.readtext(np.array(image))
            print(f"Direct EasyOCR test got {len(result)} results")
        except ImportError:
            print("EasyOCR not available for direct test")
        except Exception as e:
            print(f"Error in direct EasyOCR test: {e}")

        try:
            import paddleocr
            print("Testing direct PaddleOCR...")
            reader = paddleocr.PaddleOCR(lang='en')
            import numpy as np
            result = reader.ocr(np.array(image), cls=False)
            if result is not None and len(result) > 0:
                # The result may or may not be wrapped in a per-page list
                page_result = result[0] if isinstance(result[0], list) else result
                print(f"Direct PaddleOCR test got {len(page_result)} results")
            else:
                print(f"Direct PaddleOCR test got no results: {result}")
        except ImportError:
            print("PaddleOCR not available for direct test")
        except Exception as e:
            print(f"Error in direct PaddleOCR test: {e}")
    except Exception as e:
        print(f"Error in direct OCR test: {e}")
1854
+
1855
def analyze_layout(self,
                   model: str = "yolo",
                   confidence: float = 0.2,
                   classes: Optional[List[str]] = None,
                   exclude_classes: Optional[List[str]] = None,
                   device: str = "cpu",
                   existing: str = "replace",
                   model_params: Optional[Dict[str, Any]] = None,
                   # Legacy parameters for backward compatibility
                   model_path: Optional[str] = None,
                   image_size: int = 1024,
                   create_cells: bool = False) -> 'Page':
    """
    Analyze the page layout using a machine learning model.

    The page is rendered at 150 DPI without highlights, written to a
    temporary PNG, and handed to the selected detector. Detections are
    scaled back into page coordinates and stored as regions in
    ``self._regions['detected']``, ``self._elements['regions']`` and
    ``self.detected_layout_regions``.

    Args:
        model: Model type to use ('yolo', 'tatr', or 'paddle')
        confidence: Minimum confidence threshold for detections
        classes: Specific classes to detect (None for all supported classes)
        exclude_classes: Classes to exclude from detection
        device: Device to use for inference ('cpu' or 'cuda:0'/'gpu')
        existing: How to handle existing regions: 'replace' (default) or 'append'
        model_params: Dictionary of model-specific parameters (never mutated):
            - YOLO: {"model_path": "...", "image_size": 1024}
            - TATR: {"model_path": "...", "create_cells": False}
            - Paddle: {"lang": "en", "use_angle_cls": False, "enable_table": True}
        model_path: (Legacy) Optional path to custom model file
        image_size: (Legacy) Size to resize the image to before detection (YOLO only)
        create_cells: (Legacy) Whether to create cell regions for TATR table regions

    Returns:
        Self for method chaining

    Raises:
        ValueError: If ``model`` is not one of the supported model types
    """
    # Work on a private copy so the caller's dict is never modified
    model_params = {} if model_params is None else dict(model_params)
    model_key = model.lower()
    append_mode = existing.lower() == 'append'

    # Fold legacy keyword arguments into model_params
    if model_path is not None:
        model_params['model_path'] = model_path
    if model_key == "yolo" and image_size != 1024:
        model_params['image_size'] = image_size
    if model_key == "tatr" and create_cells:
        model_params['create_cells'] = create_cells

    # The detectors read the page from disk; the directory and its contents
    # are removed when the block exits, including on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_image_path = os.path.join(temp_dir, f"page_{self.index}.png")

        # include_highlights=False gives the detector the unannotated page
        page_image = self.to_image(resolution=150.0, include_highlights=False)
        page_image.save(temp_image_path)

        if model_key == "yolo":
            detector = YOLODocLayoutDetector(
                model_file=model_params.get('model_path', "doclayout_yolo_docstructbench_imgsz1024.pt"),
                device=device
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes,
                image_size=model_params.get('image_size', 1024)
            )

        elif model_key == "tatr" or model_key == "table-transformer":
            tatr_model_path = model_params.get('model_path')
            detector = TableTransformerDetector(
                detection_model="microsoft/table-transformer-detection" if tatr_model_path is None else tatr_model_path,
                device=device
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes
            )

        elif model_key == "paddle":
            detector = PaddleLayoutDetector(
                lang=model_params.get('lang', 'en'),
                use_angle_cls=model_params.get('use_angle_cls', False),
                # Any 'cuda*' device string is passed to Paddle as 'gpu'
                device='gpu' if device.startswith('cuda') else device,
                enable_table=model_params.get('enable_table', True),
                show_log=model_params.get('show_log', False)
            )
            detections = detector.detect(
                temp_image_path,
                confidence=confidence,
                classes=classes,
                exclude_classes=exclude_classes
            )

        else:
            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle'")

    # Scale factors from rendered-image pixels back to page coordinates
    scale_x = self.width / page_image.width
    scale_y = self.height / page_image.height

    layout_regions = []
    for detection in detections:
        x_min, y_min, x_max, y_max = detection['bbox']

        region = Region(self, (x_min * scale_x, y_min * scale_y, x_max * scale_x, y_max * scale_y))
        region.region_type = detection['class']
        region.normalized_type = detection['normalized_class']
        region.confidence = detection['confidence']
        region.model = model  # Store which model detected this region
        region.source = 'detected'  # Set the source for selectors

        layout_regions.append(region)

    if append_mode:
        self._regions.setdefault('detected', []).extend(layout_regions)
    else:
        # Store a copy: sharing the list object with layout_regions would
        # make the cell bookkeeping below add every cell twice.
        self._regions['detected'] = list(layout_regions)

    # Make sure elements is initialized
    self._load_elements()

    if 'regions' not in self._elements:
        self._elements['regions'] = []

    if append_mode:
        # Only add new regions that aren't already in the collection
        for region in layout_regions:
            if region not in self._elements['regions']:
                self._elements['regions'].append(region)
    else:
        # Drop previously stored regions except the named ones
        named_regions = [r for r in self._elements['regions'] if r.source == 'named']
        self._elements['regions'] = named_regions + layout_regions

    # Create cells for table regions if requested and using TATR
    # NOTE(review): the 'table-transformer' alias is accepted for detection
    # above but does not trigger cell creation here - confirm if intended.
    create_cells_flag = model_params.get('create_cells', create_cells)
    if model_key == 'tatr' and create_cells_flag:
        # Snapshot the tables first so the loop never iterates a list that
        # is growing underneath it.
        table_regions = [r for r in layout_regions if r.region_type == 'table']
        print(f"Creating cells for {len(table_regions)} table regions")

        cell_count = 0
        for table in table_regions:
            try:
                cells = table.create_cells()
                cell_count += len(cells)

                # Register the cells once in each tracking structure
                layout_regions.extend(cells)
                self._elements['regions'].extend(cells)
                self._regions['detected'].extend(cells)
            except Exception as e:
                print(f"Error creating cells for table: {e}")

        print(f"Created {cell_count} cells in total")

    # Keep the regions reachable after the method returns
    self.detected_layout_regions = layout_regions
    return self
2071
+
2072
def highlight_layout(self,
                     layout_regions: Optional[List[Region]] = None,
                     confidence: float = 0.2,
                     label_format: str = "{type} ({conf:.2f}){model}") -> 'Page':
    """
    Highlight detected layout regions on the page.

    Regions are taken from the first non-empty source among: the
    ``layout_regions`` argument, ``self.detected_layout_regions``,
    ``self._regions['detected']``; if all are empty, ``analyze_layout`` is
    run with the given confidence and its regions are used.

    Args:
        layout_regions: List of regions to highlight (see above when None/empty)
        confidence: Minimum confidence threshold for highlighting regions
        label_format: Format string for region labels; receives ``type``,
            ``conf`` and ``model`` (either " (<model>)" or an empty string)

    Returns:
        Self for method chaining
    """
    # Fixed colors for table-structure regions detected by the 'tatr' model;
    # any other region type maps to None (default color cycling).
    tatr_colors = {
        'table': (1, 0, 0, 0.3),                # Red for tables
        'table row': (0, 1, 0, 0.3),            # Green for rows
        'table column': (0, 0, 1, 0.3),         # Blue for columns
        'table column header': (0, 1, 1, 0.3),  # Cyan for column headers
    }

    if layout_regions:
        targets = layout_regions
    elif getattr(self, 'detected_layout_regions', None):
        targets = self.detected_layout_regions
    elif self._regions.get('detected'):
        targets = self._regions['detected']
    else:
        # Nothing detected yet: run detection and use what it stored
        self.analyze_layout(confidence=confidence)
        targets = self.detected_layout_regions

    for candidate in targets:
        # Skip regions below confidence threshold
        if candidate.confidence < confidence:
            continue

        knows_model = hasattr(candidate, 'model')
        caption = label_format.format(
            type=candidate.region_type,
            conf=candidate.confidence,
            model=f" ({candidate.model})" if knows_model else ""
        )

        if knows_model and candidate.model == 'tatr':
            candidate.highlight(label=caption, color=tatr_colors.get(candidate.region_type))
        else:
            candidate.highlight(label=caption)

    return self
2133
+
2134
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Region:
    """
    Get a section between two elements on this page.

    Builds a region spanning the whole page and delegates to that region's
    own ``get_section_between``.

    Args:
        start_element: Element marking the start of the section
        end_element: Element marking the end of the section
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'

    Returns:
        Region representing the section between elements
    """
    whole_page = self.create_region(0, 0, self.width, self.height)
    section_args = {
        'start_element': start_element,
        'end_element': end_element,
        'boundary_inclusion': boundary_inclusion,
    }
    return whole_page.get_section_between(**section_args)
2155
+
2156
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 boundary_inclusion='both',
                 y_threshold=5.0,
                 bounding_box=None):
    """
    Get sections of a page defined by start/end elements.

    Boundary elements are sorted by ``top`` and grouped into lines (each
    element within ``y_threshold`` of the previous one joins its line). A
    line is represented by its leftmost element and counts as a start if it
    contains any start element. A section runs from a start to the next
    end, to the next start (``is_end_next_start`` is True), or, for a
    trailing start, to the bottom of the page / bounding box.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
        y_threshold: Maximum vertical difference to consider elements on same line
        bounding_box: Optional tuple (x0, top, x1, bottom) to limit the section area;
            x0/x1 bound every section, bottom bounds only a trailing open section

    Returns:
        List of Region objects representing the sections; each carries
        ``start_element``, ``end_element`` and ``is_end_next_start``

    Raises:
        ValueError: If ``boundary_inclusion`` is not one of the valid options
    """
    regions = []

    # Handle cases where elements are provided as strings (selectors)
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements)
    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements)

    valid_inclusions = ['start', 'end', 'both', 'none']
    if boundary_inclusion not in valid_inclusions:
        raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")

    # If no start elements, can't do anything
    if not start_elements:
        return regions

    if bounding_box:
        left, right, lower_limit = bounding_box[0], bounding_box[2], bounding_box[3]
    else:
        left, right, lower_limit = 0, self.width, self.height

    include_start = boundary_inclusion in ('start', 'both')
    include_end = boundary_inclusion in ('end', 'both')

    def add_section(start_element, end_element, is_end_next_start):
        """Create one region between two boundaries and record it."""
        top = start_element.top if include_start else start_element.bottom
        if end_element is None:
            # Open-ended section: run to the bottom of the allowed area
            bottom = lower_limit
        else:
            bottom = end_element.bottom if include_end else end_element.top

        region = self.create_region(left, top, right, bottom)
        region.start_element = start_element
        region.end_element = end_element
        region.is_end_next_start = is_end_next_start
        regions.append(region)

    # Tag every boundary element and order them top-to-bottom
    tagged = [(element, 'start') for element in start_elements]
    if end_elements:
        tagged.extend((element, 'end') for element in end_elements)
    tagged.sort(key=lambda pair: pair[0].top)

    # Collapse elements on the same visual line into a single boundary
    boundaries = []
    line = []
    line_type = None
    line_y = None

    for element, kind in tagged:
        if line and not (abs(element.top - line_y) <= y_threshold):
            # Too far below the previous element: close the current line
            boundaries.append((min(line, key=lambda e: e.x0), line_type))
            line = []

        if not line:
            line_type = kind
        elif kind == 'start':
            # Mixed line: start elements take priority over end elements
            line_type = 'start'

        line.append(element)
        line_y = element.top  # Reference Y follows the latest element

    if line:
        boundaries.append((min(line, key=lambda e: e.x0), line_type))

    # Walk the boundaries and pair each start with whatever closes it
    open_start = None
    for element, kind in boundaries:
        if kind == 'start':
            if open_start is not None:
                # A new start implicitly ends the section still open
                add_section(open_start, element, True)
            open_start = element
        elif kind == 'end' and open_start is not None:
            add_section(open_start, element, False)
            open_start = None

    # A start with no closing boundary extends to the bottom
    if open_start is not None:
        add_section(open_start, None, False)

    return regions
2332
+
2333
def __repr__(self) -> str:
    """Return a compact debug label showing this page's number and index."""
    return "<Page number={} index={}>".format(self.number, self.index)
2336
+
2337
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Ask a question about the page content using document QA.

    The question is forwarded to the engine returned by
    ``natural_pdf.qa.document_qa.get_qa_engine`` via its ``ask_pdf_page``
    method. If an ImportError is raised along the way, a warning is logged
    and a "not found" result is returned instead of raising.

    Args:
        question: The question to ask about the page content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None or empty, uses default model)
        debug: Forwarded unchanged to the QA engine
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Dictionary with answer details: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "source_elements": list of elements that contain the answer (if found)
        }
        The ImportError fallback contains only "answer", "confidence",
        "error" and "found".
    """
    try:
        from natural_pdf.qa.document_qa import get_qa_engine

        # Only name a model when the caller supplied one
        engine_options = {"model_name": model} if model else {}
        engine = get_qa_engine(**engine_options)

        return engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
    except ImportError as import_error:
        import logging
        logging.getLogger("natural_pdf.core.page").warning(f"QA functionality not available: {import_error}")
        fallback = {
            "answer": "",
            "confidence": 0.0,
            "error": "QA functionality not available",
            "found": False
        }
        return fallback