natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
natural_pdf/core/page.py
ADDED
@@ -0,0 +1,1444 @@
|
|
1
|
+
import pdfplumber
|
2
|
+
import os
|
3
|
+
import logging
|
4
|
+
import tempfile
|
5
|
+
from typing import List, Optional, Union, Any, Dict, Callable, TYPE_CHECKING, Tuple
|
6
|
+
from PIL import Image
|
7
|
+
import base64
|
8
|
+
import io
|
9
|
+
import json
|
10
|
+
|
11
|
+
from natural_pdf.elements.collections import ElementCollection
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
import pdfplumber
|
15
|
+
from natural_pdf.core.pdf import PDF
|
16
|
+
from natural_pdf.elements.collections import ElementCollection
|
17
|
+
from natural_pdf.core.highlighting_service import HighlightingService
|
18
|
+
from natural_pdf.elements.base import Element
|
19
|
+
|
20
|
+
from natural_pdf.elements.region import Region
|
21
|
+
from natural_pdf.elements.text import TextElement
|
22
|
+
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
|
23
|
+
from natural_pdf.analyzers.layout.layout_options import LayoutOptions
|
24
|
+
from natural_pdf.ocr import OCROptions
|
25
|
+
from natural_pdf.ocr import OCRManager
|
26
|
+
from natural_pdf.core.element_manager import ElementManager
|
27
|
+
from natural_pdf.analyzers.layout.layout_analyzer import LayoutAnalyzer
|
28
|
+
from natural_pdf.analyzers.text_structure import TextStyleAnalyzer
|
29
|
+
from natural_pdf.analyzers.text_options import TextStyleOptions
|
30
|
+
from natural_pdf.widgets import InteractiveViewerWidget
|
31
|
+
from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
class Page:
|
36
|
+
"""
|
37
|
+
Enhanced Page wrapper built on top of pdfplumber.Page.
|
38
|
+
|
39
|
+
This class provides a fluent interface for working with PDF pages,
|
40
|
+
with improved selection, navigation, extraction, and question-answering capabilities.
|
41
|
+
"""
|
42
|
+
|
43
|
+
def __init__(self, page: 'pdfplumber.page.Page', parent: 'PDF', index: int, font_attrs=None):
    """
    Wrap a pdfplumber page and bind it to its owning PDF.

    Args:
        page: The underlying pdfplumber page object.
        parent: The PDF object this page belongs to.
        index: Zero-based position of this page within the PDF.
        font_attrs: Font attributes to consider when grouping characters
            into words; handed unchanged to ElementManager.
    """
    self._page = page
    self._parent = parent
    self._index = index
    self._text_styles = None  # text style analyzer results, populated lazily
    self._exclusions = []  # exclusion callables / regions registered on this page

    # 'detected' holds unnamed regions, 'named' maps name -> region.
    self._regions = {'detected': [], 'named': {}}

    self._element_mgr = ElementManager(self, font_attrs)

    # Reuse the parent's OCR manager when it exposes a real OCRManager instance.
    parent_ocr = getattr(parent, '_ocr_manager', None) if OCRManager else None
    if OCRManager and isinstance(parent_ocr, OCRManager):
        self._ocr_manager = parent_ocr
        logger.debug(f"Page {self.number}: Using OCRManager instance from parent PDF.")
    else:
        self._ocr_manager = None
        if OCRManager:
            logger.warning(f"Page {self.number}: OCRManager instance not found on parent PDF object.")

    # Same sharing rule for the layout manager.
    parent_layout = getattr(parent, '_layout_manager', None) if LayoutManager else None
    if LayoutManager and isinstance(parent_layout, LayoutManager):
        self._layout_manager = parent_layout
        logger.debug(f"Page {self.number}: Using LayoutManager instance from parent PDF.")
    else:
        self._layout_manager = None
        if LayoutManager:
            logger.warning(f"Page {self.number}: LayoutManager instance not found on parent PDF object. Layout analysis will fail.")

    # Starts unset; nothing in this constructor builds the analyzer.
    self._layout_analyzer = None
|
88
|
+
|
89
|
+
@property
def pdf(self) -> 'PDF':
    """Return the parent PDF object this page was created with."""
    owner = self._parent
    return owner
|
93
|
+
|
94
|
+
@property
def number(self) -> int:
    """Return the 1-based page number reported by the wrapped pdfplumber page."""
    wrapped = self._page
    return wrapped.page_number
|
98
|
+
|
99
|
+
@property
def index(self) -> int:
    """Return the 0-based index this page was constructed with."""
    position = self._index
    return position
|
103
|
+
|
104
|
+
@property
def width(self) -> float:
    """Return the width of the wrapped pdfplumber page."""
    wrapped = self._page
    return wrapped.width
|
108
|
+
|
109
|
+
@property
def height(self) -> float:
    """Return the height of the wrapped pdfplumber page."""
    wrapped = self._page
    return wrapped.height
|
113
|
+
|
114
|
+
# --- Highlighting Service Accessor ---
|
115
|
+
@property
|
116
|
+
def _highlighter(self) -> 'HighlightingService':
|
117
|
+
"""Provides access to the parent PDF's HighlightingService."""
|
118
|
+
if not hasattr(self._parent, 'highlighter'):
|
119
|
+
# This should ideally not happen if PDF.__init__ works correctly
|
120
|
+
raise AttributeError("Parent PDF object does not have a 'highlighter' attribute.")
|
121
|
+
return self._parent.highlighter
|
122
|
+
|
123
|
+
def add_exclusion(self, exclusion_func_or_region: Union[Callable[['Page'], Region], Region]) -> 'Page':
    """
    Register an exclusion on this page.

    The value is stored as-is in the page's exclusion list; it is not
    evaluated here.

    Args:
        exclusion_func_or_region: A Region to exclude, or a function that
            takes a Page and returns the Region to exclude.

    Returns:
        This page, so calls can be chained.
    """
    registered = self._exclusions
    registered.append(exclusion_func_or_region)
    return self
|
136
|
+
|
137
|
+
def add_region(self, region: Region, name: Optional[str] = None) -> 'Page':
    """
    Register a region on this page.

    Args:
        region: The Region object to register.
        name: Optional name. A named region is stored under that name
            (replacing any region already using it); an unnamed one is
            appended to the 'detected' list.

    Returns:
        This page, so calls can be chained.

    Raises:
        TypeError: If ``region`` is not a Region instance.
    """
    if not isinstance(region, Region):
        raise TypeError("region must be a Region object")

    # The source is set to 'named' for both named and unnamed regions.
    region.source = 'named'

    if not name:
        self._regions['detected'].append(region)
    else:
        region.name = name
        self._regions['named'][name] = region

    # Also hand the region to the element manager (for selector queries).
    self._element_mgr.add_region(region)
    return self
|
167
|
+
|
168
|
+
def add_regions(self, regions: List[Region], prefix: Optional[str] = None) -> 'Page':
    """
    Register several regions on this page via ``add_region``.

    Args:
        regions: The Region objects to register, in order.
        prefix: When given, regions are named ``prefix_1``, ``prefix_2``, ...
            in list order; otherwise they are registered without names.

    Returns:
        This page, so calls can be chained.
    """
    for position, region in enumerate(regions, start=1):
        if prefix:
            self.add_region(region, name=f"{prefix}_{position}")
        else:
            self.add_region(region)
    return self
|
189
|
+
|
190
|
+
def _get_exclusion_regions(self, include_callable=True, debug=False) -> List[Region]:
|
191
|
+
"""
|
192
|
+
Get all exclusion regions for this page.
|
193
|
+
|
194
|
+
Args:
|
195
|
+
include_callable: Whether to evaluate callable exclusion functions
|
196
|
+
debug: Enable verbose debug logging for exclusion evaluation
|
197
|
+
|
198
|
+
Returns:
|
199
|
+
List of Region objects to exclude
|
200
|
+
"""
|
201
|
+
regions = []
|
202
|
+
|
203
|
+
# Track exclusion results for debugging
|
204
|
+
if debug:
|
205
|
+
print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
|
206
|
+
|
207
|
+
for i, exclusion in enumerate(self._exclusions):
|
208
|
+
# Get exclusion label if it's a tuple from PDF level
|
209
|
+
exclusion_label = f"exclusion {i}"
|
210
|
+
original_exclusion = exclusion
|
211
|
+
|
212
|
+
# Check if it's a tuple from PDF.add_exclusion
|
213
|
+
if isinstance(exclusion, tuple) and len(exclusion) == 2 and callable(exclusion[0]):
|
214
|
+
# This is likely from PDF.add_exclusion with (func, label)
|
215
|
+
exclusion_func, label = exclusion
|
216
|
+
if label:
|
217
|
+
exclusion_label = label
|
218
|
+
exclusion = exclusion_func
|
219
|
+
|
220
|
+
# Process callable exclusion functions
|
221
|
+
if callable(exclusion) and include_callable:
|
222
|
+
# It's a function, call it with this page
|
223
|
+
try:
|
224
|
+
if debug:
|
225
|
+
print(f" - Evaluating callable {exclusion_label}...")
|
226
|
+
|
227
|
+
# Create a temporary copy of exclusions to avoid recursion
|
228
|
+
original_exclusions = self._exclusions
|
229
|
+
self._exclusions = [] # Temporarily clear exclusions
|
230
|
+
|
231
|
+
# Call the function
|
232
|
+
region = exclusion(self)
|
233
|
+
|
234
|
+
# Restore exclusions
|
235
|
+
self._exclusions = original_exclusions
|
236
|
+
|
237
|
+
if region:
|
238
|
+
regions.append(region)
|
239
|
+
if debug:
|
240
|
+
print(f" ✓ Added region: {region}")
|
241
|
+
else:
|
242
|
+
if debug:
|
243
|
+
print(f" ✗ Function returned None, no region added")
|
244
|
+
|
245
|
+
except Exception as e:
|
246
|
+
error_msg = f"Error in {exclusion_label} for page {self.index}: {e}"
|
247
|
+
print(error_msg)
|
248
|
+
# Print more detailed traceback for debugging
|
249
|
+
import traceback
|
250
|
+
print(f" Traceback: {traceback.format_exc().splitlines()[-3:]}")
|
251
|
+
|
252
|
+
# Process direct Region objects
|
253
|
+
elif not callable(exclusion):
|
254
|
+
# It's already a Region object
|
255
|
+
regions.append(exclusion)
|
256
|
+
if debug:
|
257
|
+
print(f" - Added direct region: {exclusion}")
|
258
|
+
|
259
|
+
if debug:
|
260
|
+
print(f"Page {self.index}: Found {len(regions)} valid exclusion regions")
|
261
|
+
|
262
|
+
return regions
|
263
|
+
|
264
|
+
def _filter_elements_by_exclusions(self, elements: List['Element'], debug_exclusions: bool = False) -> List['Element']:
|
265
|
+
"""
|
266
|
+
Filters a list of elements, removing those within the page's exclusion regions.
|
267
|
+
|
268
|
+
Args:
|
269
|
+
elements: The list of elements to filter.
|
270
|
+
debug_exclusions: Whether to output detailed exclusion debugging info (default: False).
|
271
|
+
|
272
|
+
Returns:
|
273
|
+
A new list containing only the elements not falling within any exclusion region.
|
274
|
+
"""
|
275
|
+
if not self._exclusions:
|
276
|
+
if debug_exclusions:
|
277
|
+
print(f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements.")
|
278
|
+
return elements
|
279
|
+
|
280
|
+
# Get all exclusion regions, including evaluating callable functions
|
281
|
+
exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug_exclusions)
|
282
|
+
|
283
|
+
if not exclusion_regions:
|
284
|
+
if debug_exclusions:
|
285
|
+
print(f"Page {self.index}: No valid exclusion regions found, returning all {len(elements)} elements.")
|
286
|
+
return elements
|
287
|
+
|
288
|
+
if debug_exclusions:
|
289
|
+
print(f"Page {self.index}: Applying {len(exclusion_regions)} exclusion regions to {len(elements)} elements.")
|
290
|
+
|
291
|
+
filtered_elements = []
|
292
|
+
excluded_count = 0
|
293
|
+
for element in elements:
|
294
|
+
exclude = False
|
295
|
+
for region in exclusion_regions:
|
296
|
+
# Use the region's method to check if the element is inside
|
297
|
+
if region._is_element_in_region(element):
|
298
|
+
exclude = True
|
299
|
+
excluded_count += 1
|
300
|
+
break # No need to check other regions for this element
|
301
|
+
if not exclude:
|
302
|
+
filtered_elements.append(element)
|
303
|
+
|
304
|
+
if debug_exclusions:
|
305
|
+
print(f"Page {self.index}: Excluded {excluded_count} elements, keeping {len(filtered_elements)}.")
|
306
|
+
|
307
|
+
return filtered_elements
|
308
|
+
|
309
|
+
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element on this page that matches a selector.

    Args:
        selector: CSS-like selector string.
        apply_exclusions: Skip elements inside exclusion regions (default: True).
        regex: Use regex for text search in :contains (default: False).
        case: Do case-sensitive text search (default: True).
        **kwargs: Additional filter parameters, forwarded to the selector machinery.

    Returns:
        The first matching element, or None when nothing matches.
    """
    from natural_pdf.selectors.parser import parse_selector

    parsed = parse_selector(selector)

    # The regex/case flags travel with the other filter parameters.
    kwargs['regex'] = regex
    kwargs['case'] = case

    # _apply_selector returns matches without any exclusion filtering.
    matches = self._apply_selector(parsed, **kwargs)
    if not matches:
        return None

    if apply_exclusions and self._exclusions:
        survivors = self._filter_elements_by_exclusions(matches.elements)
        return survivors[0] if survivors else None

    return matches.first
|
343
|
+
|
344
|
+
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
    """
    Return every element on this page that matches a selector.

    Args:
        selector: CSS-like selector string.
        apply_exclusions: Skip elements inside exclusion regions (default: True).
        regex: Use regex for text search in :contains (default: False).
        case: Do case-sensitive text search (default: True).
        **kwargs: Additional filter parameters, forwarded to the selector machinery.

    Returns:
        ElementCollection with the matching elements.
    """
    from natural_pdf.selectors.parser import parse_selector

    parsed = parse_selector(selector)

    # The regex/case flags travel with the other filter parameters.
    kwargs['regex'] = regex
    kwargs['case'] = case

    # _apply_selector returns matches without any exclusion filtering.
    matches = self._apply_selector(parsed, **kwargs)

    if apply_exclusions and self._exclusions and matches:
        return ElementCollection(self._filter_elements_by_exclusions(matches.elements))

    # Nothing to filter (or filtering disabled): hand back the collection as-is.
    return matches
|
375
|
+
|
376
|
+
def _apply_selector(self, selector_obj: Dict, **kwargs) -> 'ElementCollection':
    """
    Evaluate a parsed selector against this page's elements.

    Exclusions are NOT applied here; ``find`` / ``find_all`` filter the
    result afterwards when asked to.

    Args:
        selector_obj: Parsed selector dictionary. This method reads its
            'type' and 'pseudo_classes' entries.
        **kwargs: Additional filter parameters, including 'regex' and
            'case'. 'near_threshold' (default 50) and 'reading_order'
            (default True) are read here.

    Returns:
        ElementCollection of matching elements (unfiltered by exclusions).
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    # Selector type -> ElementManager attribute; anything unlisted
    # (including 'any') searches all elements.
    attr_by_type = {
        'text': 'words',
        'char': 'chars',
        'word': 'words',
        'rect': 'rects',
        'rectangle': 'rects',
        'line': 'lines',
        'region': 'regions',
    }
    kind = selector_obj.get('type', 'any').lower()
    source_attr = attr_by_type.get(kind)
    if source_attr is None:
        candidates = self._element_mgr.get_all_elements()
    else:
        candidates = getattr(self._element_mgr, source_attr)

    predicate = selector_to_filter_func(selector_obj, **kwargs)
    matched = [candidate for candidate in candidates if predicate(candidate)]

    def center_distance(first, second):
        """Euclidean distance between box centers; inf when a box is incomplete."""
        needed = ('x0', 'x1', 'top', 'bottom')
        if not (all(hasattr(first, attr) for attr in needed) and
                all(hasattr(second, attr) for attr in needed)):
            return float('inf')  # Cannot calculate distance
        dx = (first.x0 + first.x1) / 2 - (second.x0 + second.x1) / 2
        dy = (first.top + first.bottom) / 2 - (second.top + second.bottom) / 2
        return (dx ** 2 + dy ** 2) ** 0.5

    # Directional relations: element edge vs. reference edge.
    directional = {
        'above': lambda el, ref: hasattr(el, 'bottom') and hasattr(ref, 'top') and el.bottom <= ref.top,
        'below': lambda el, ref: hasattr(el, 'top') and hasattr(ref, 'bottom') and el.top >= ref.bottom,
        'left-of': lambda el, ref: hasattr(el, 'x1') and hasattr(ref, 'x0') and el.x1 <= ref.x0,
        'right-of': lambda el, ref: hasattr(el, 'x0') and hasattr(ref, 'x1') and el.x0 >= ref.x1,
    }

    for pseudo in selector_obj.get('pseudo_classes', []):
        name = pseudo.get('name')
        args = pseudo.get('args', '')
        if name not in ('above', 'below', 'near', 'left-of', 'right-of'):
            continue

        # Resolve the reference element with a recursive, exclusion-free lookup.
        ref_selector = parse_selector(args) if isinstance(args, str) else args
        ref_matches = self._apply_selector(ref_selector, **kwargs)
        if not ref_matches:
            return ElementCollection([])

        anchor = ref_matches.first
        if not anchor:
            continue

        if name == 'near':
            limit = kwargs.get('near_threshold', 50)
            matched = [el for el in matched if center_distance(el, anchor) <= limit]
        else:
            relation = directional[name]
            matched = [el for el in matched if relation(el, anchor)]

    # Top-to-bottom, then left-to-right, unless the caller opted out.
    if kwargs.get('reading_order', True):
        sortable = all(hasattr(el, 'top') and hasattr(el, 'x0') for el in matched)
        if sortable:
            matched.sort(key=lambda el: (el.top, el.x0))
        else:
            logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")

    return ElementCollection(matched)
|
470
|
+
|
471
|
+
def create_region(self, x0: float, top: float, x1: float, bottom: float) -> Any:
    """
    Build a Region on this page from explicit coordinates.

    Args:
        x0: Left x-coordinate.
        top: Top y-coordinate.
        x1: Right x-coordinate.
        bottom: Bottom y-coordinate.

    Returns:
        Region object bound to this page with bbox ``(x0, top, x1, bottom)``.
    """
    from natural_pdf.elements.region import Region

    bbox = (x0, top, x1, bottom)
    return Region(self, bbox)
|
486
|
+
|
487
|
+
def region(self, left: float = None, top: float = None, right: float = None, bottom: float = None,
           width: str = "full") -> Any:
    """
    Create a region on this page with more intuitive named parameters.

    Any bound that is not given defaults to the corresponding page edge.
    Explicitly supplied ``left`` / ``right`` values are always honoured;
    previously the default ``width="full"`` silently replaced them with
    the page edges, so a "specific rectangle" request came back full-width.

    Args:
        left: Left x-coordinate (default: 0)
        top: Top y-coordinate (default: 0)
        right: Right x-coordinate (default: page width)
        bottom: Bottom y-coordinate (default: page height)
        width: Width mode, "full" or "element". Only validated here; with
            "full", horizontal bounds that were not supplied span the page.

    Returns:
        Region object for the specified coordinates

    Raises:
        ValueError: If ``width`` is neither "full" nor "element".

    Examples:
        >>> page.region(top=100, bottom=200)  # Full width from y=100 to y=200
        >>> page.region(left=50, right=150, top=100, bottom=200)  # Specific rectangle
    """
    if width not in ("full", "element"):
        raise ValueError("Width must be 'full' or 'element'")

    # Fall back to the page edges only for bounds the caller left out.
    left = 0 if left is None else left
    top = 0 if top is None else top
    right = self.width if right is None else right
    bottom = self.height if bottom is None else bottom

    from natural_pdf.elements.region import Region
    return Region(self, (left, top, right, bottom))
|
522
|
+
|
523
|
+
def get_elements(self, apply_exclusions=True, debug_exclusions: bool = False) -> List['Element']:
    """
    Return all elements on this page.

    Args:
        apply_exclusions: Filter out elements inside exclusion regions (default: True).
        debug_exclusions: Print exclusion debugging info to stdout (default: False).

    Returns:
        Every element the element manager reports, filtered by the page's
        exclusions when requested and any are registered.
    """
    everything = self._element_mgr.get_all_elements()

    if not (apply_exclusions and self._exclusions):
        if debug_exclusions:
            print(f"Page {self.index}: get_elements returning all {len(everything)} elements (exclusions not applied).")
        return everything

    return self._filter_elements_by_exclusions(everything, debug_exclusions=debug_exclusions)
|
544
|
+
|
545
|
+
def filter_elements(self, elements: List['Element'], selector: str, **kwargs) -> List['Element']:
    """
    Keep only the elements of a list that match a selector.

    Args:
        elements: List of elements to filter.
        selector: CSS-like selector string.
        **kwargs: Additional filter parameters. 'reading_order'
            (default True) controls the final sort.

    Returns:
        New list of the matching elements, sorted by (top, x0) when
        reading order is requested and every match has both attributes.
    """
    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    predicate = selector_to_filter_func(parse_selector(selector), **kwargs)
    kept = [candidate for candidate in elements if predicate(candidate)]

    if kwargs.get('reading_order', True):
        sortable = all(hasattr(el, 'top') and hasattr(el, 'x0') for el in kept)
        if sortable:
            kept.sort(key=lambda el: (el.top, el.x0))
        else:
            logger.warning("Cannot sort elements in reading order: Missing required attributes (top, x0).")

    return kept
|
576
|
+
|
577
|
+
def until(self, selector: str, include_endpoint: bool = True, **kwargs) -> Any:
    """
    Select content from the top of the page down to the first match of a selector.

    Args:
        selector: CSS-like selector string.
        include_endpoint: Whether the matched element itself is inside the region.
        **kwargs: Additional selection parameters, forwarded to ``find``.

    Returns:
        Full-width Region starting at the top of the page. When nothing
        matches, the region covers the whole page; otherwise its
        ``end_element`` is set to the matched element.

    Examples:
        >>> page.until('text:contains("Conclusion")')  # Select from top to conclusion
        >>> page.until('line[width>=2]', include_endpoint=False)  # Select up to thick line
    """
    from natural_pdf.elements.region import Region

    target = self.find(selector, **kwargs)
    if not target:
        # No endpoint found: fall back to the full page.
        return Region(self, (0, 0, self.width, self.height))

    # Tolerate targets lacking positional attributes by defaulting to page edges.
    upper_edge = getattr(target, 'top', 0)
    lower_edge = getattr(target, 'bottom', self.height)

    cutoff = lower_edge if include_endpoint else upper_edge
    region = Region(self, (0, 0, self.width, cutoff))
    region.end_element = target
    return region
|
615
|
+
|
616
|
+
|
617
|
+
def crop(self, bbox=None, **kwargs) -> Any:
    """
    Crop the page to a bounding box by delegating to the wrapped page's ``crop``.

    Args:
        bbox: Bounding box (x0, top, x1, bottom) or None.
        **kwargs: Additional parameters, passed through unchanged.

    Returns:
        Whatever the wrapped pdfplumber page's ``crop`` returns -- a
        pdfplumber object, not a natural-pdf Page.
    """
    cropped = self._page.crop(bbox, **kwargs)
    return cropped
|
632
|
+
|
633
|
+
def extract_text(self, preserve_whitespace=True, use_exclusions=True,
                 debug_exclusions=False, **kwargs) -> str:
    """
    Extract this page's text, leaving out anything inside exclusion regions.

    Without active exclusions the wrapped pdfplumber page does the
    extraction. With them, the page's words are filtered and joined with
    single spaces.

    Args:
        preserve_whitespace: Whether to keep blank characters (default: True).
            Only used on the pdfplumber path, as ``keep_blank_chars``.
        use_exclusions: Whether to apply exclusion regions (default: True).
        debug_exclusions: Print exclusion debugging info to stdout (default: False).
        **kwargs: Additional extraction parameters passed to pdfplumber
            (pdfplumber path only).

    Returns:
        Extracted text as string.
    """
    exclusions_active = use_exclusions and self._exclusions
    if not exclusions_active:
        if debug_exclusions:
            print(f"Page {self.index}: Extracting text via pdfplumber (exclusions not applied).")
        # pdfplumber's name for this option is keep_blank_chars.
        return self._page.extract_text(keep_blank_chars=preserve_whitespace, **kwargs)

    # Start from every word on the page.
    words = self.words
    if debug_exclusions:
        print(f"Page {self.index}: Starting text extraction with {len(words)} words before exclusion.")

    kept = self._filter_elements_by_exclusions(words, debug_exclusions=debug_exclusions)
    collection = ElementCollection(kept)

    # Order top-to-bottom, then left-to-right, when every element is positioned.
    positioned = all(hasattr(el, 'top') and hasattr(el, 'x0') for el in collection.elements)
    if positioned:
        collection.sort(key=lambda el: (el.top, el.x0))

    # Elements without a text attribute contribute an empty string.
    pieces = [getattr(el, 'text', '') for el in collection.elements]
    result = " ".join(pieces)

    if debug_exclusions:
        print(f"Page {self.index}: Extracted {len(result)} characters of text with exclusions applied.")

    return result
|
678
|
+
|
679
|
+
def extract_table(self, table_settings=None) -> Optional[List[Any]]:
    """
    Extract the largest table from this page.

    Thin wrapper around the underlying pdfplumber page's ``extract_table``.

    Args:
        table_settings: Optional dict of extraction settings forwarded to
            pdfplumber. ``None`` (the default) is treated as an empty dict.

    Returns:
        The extracted table, or None if no table was found.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object between every invocation.
    settings = {} if table_settings is None else table_settings
    # pdfplumber returns None if no table found
    return self._page.extract_table(settings)
|
691
|
+
|
692
|
+
def extract_tables(self, table_settings=None) -> List[Any]:
    """
    Extract tables from this page.

    Thin wrapper around the underlying pdfplumber page's ``extract_tables``.

    Args:
        table_settings: Optional dict of extraction settings forwarded to
            pdfplumber. ``None`` (the default) is treated as an empty dict.

    Returns:
        List of extracted tables.
    """
    # Build a fresh dict per call instead of sharing one mutable default
    # object between every invocation.
    settings = {} if table_settings is None else table_settings
    # pdfplumber returns list of tables
    return self._page.extract_tables(settings)
|
704
|
+
|
705
|
+
def _load_elements(self):
    """Ask this page's ElementManager to load the page's elements."""
    manager = self._element_mgr
    manager.load_elements()
|
708
|
+
|
709
|
+
def _create_char_elements(self):
    """DEPRECATED shim: logs a warning and returns ``self._element_mgr.chars``."""
    logger.warning("_create_char_elements is deprecated. Access via self._element_mgr.chars.")
    manager = self._element_mgr
    return manager.chars
|
713
|
+
|
714
|
+
def _process_font_information(self, char_dict):
    """DEPRECATED no-op: only logs a warning; ``char_dict`` is ignored."""
    logger.warning("_process_font_information is deprecated. Handled by ElementManager.")
    # Nothing to do here any more: the ElementManager handles this internally.
    return None
|
719
|
+
|
720
|
+
def _group_chars_into_words(self, keep_spaces=True, font_attrs=None):
    """DEPRECATED shim: logs a warning and returns ``self._element_mgr.words``.

    ``keep_spaces`` and ``font_attrs`` are accepted but not used.
    """
    logger.warning("_group_chars_into_words is deprecated. Access via self._element_mgr.words.")
    manager = self._element_mgr
    return manager.words
|
724
|
+
|
725
|
+
def _process_line_into_words(self, line_chars, keep_spaces, font_attrs):
    """DEPRECATED no-op: only logs a warning; all arguments are ignored."""
    logger.warning("_process_line_into_words is deprecated. Handled by ElementManager.")
    return None
|
729
|
+
|
730
|
+
def _check_font_attributes_match(self, char, prev_char, font_attrs):
    """DEPRECATED no-op: only logs a warning; all arguments are ignored."""
    logger.warning("_check_font_attributes_match is deprecated. Handled by ElementManager.")
    return None
|
734
|
+
|
735
|
+
def _create_word_element(self, chars, font_attrs):
    """DEPRECATED no-op: only logs a warning; all arguments are ignored."""
    logger.warning("_create_word_element is deprecated. Handled by ElementManager.")
    return None
|
739
|
+
|
740
|
+
@property
def chars(self) -> List[Any]:
    """Character elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.chars
|
744
|
+
|
745
|
+
@property
def words(self) -> List[Any]:
    """Word elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.words
|
749
|
+
|
750
|
+
@property
def rects(self) -> List[Any]:
    """Rectangle elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.rects
|
754
|
+
|
755
|
+
@property
def lines(self) -> List[Any]:
    """Line elements of this page, as exposed by the ElementManager."""
    manager = self._element_mgr
    return manager.lines
|
759
|
+
|
760
|
+
def highlight(self,
              bbox: Optional[Tuple[float, float, float, float]] = None,
              color: Optional[Union[Tuple, str]] = None,
              label: Optional[str] = None,
              use_color_cycling: bool = False,
              element: Optional[Any] = None,
              include_attrs: Optional[List[str]] = None,
              existing: str = 'append') -> 'Page':
    """
    Register a rectangular highlight on this page.

    The request is forwarded to ``self._highlighter.add`` together with this
    page's index.

    Args:
        bbox: Bounding box (x0, top, x1, bottom). When None, the whole page
            (0, 0, width, height) is highlighted.
        color: RGBA color tuple/string for the highlight.
        label: Optional label for the highlight.
        use_color_cycling: If True and no label/color, use next cycle color.
        element: Optional original element being highlighted (for attribute extraction).
        include_attrs: List of attribute names from 'element' to display.
        existing: How to handle existing highlights ('append' or 'replace').

    Returns:
        Self for method chaining.
    """
    if bbox is None:
        # No explicit box: cover the full page area.
        box = (0, 0, self.width, self.height)
    else:
        box = bbox

    add_highlight = self._highlighter.add
    add_highlight(
        page_index=self.index,
        bbox=box,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=element,
        include_attrs=include_attrs,
        existing=existing,
    )
    return self
|
796
|
+
|
797
|
+
def highlight_polygon(
        self,
        polygon: List[Tuple[float, float]],
        color: Optional[Union[Tuple, str]] = None,
        label: Optional[str] = None,
        use_color_cycling: bool = False,
        element: Optional[Any] = None,
        include_attrs: Optional[List[str]] = None,
        existing: str = 'append') -> 'Page':
    """
    Register a polygon-shaped highlight on this page.

    The request is forwarded to ``self._highlighter.add_polygon`` together
    with this page's index.

    Args:
        polygon: List of (x, y) points defining the polygon.
        color: RGBA color tuple/string for the highlight.
        label: Optional label for the highlight.
        use_color_cycling: If True and no label/color, use next cycle color.
        element: Optional original element being highlighted (for attribute extraction).
        include_attrs: List of attribute names from 'element' to display.
        existing: How to handle existing highlights ('append' or 'replace').

    Returns:
        Self for method chaining.
    """
    add_polygon = self._highlighter.add_polygon
    add_polygon(
        page_index=self.index,
        polygon=polygon,
        color=color,
        label=label,
        use_color_cycling=use_color_cycling,
        element=element,
        include_attrs=include_attrs,
        existing=existing,
    )
    return self
|
833
|
+
|
834
|
+
def show(self,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> Optional[Image.Image]:
    """
    Render this page with its persistent highlights and return the image.

    Convenience wrapper around ``to_image`` that always requests highlights.

    Args:
        scale: Scale factor for rendering.
        width: Optional width for the output image.
        labels: Whether to include a legend for labels.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text.

    Returns:
        Whatever ``to_image`` returns: the rendered image, or None if
        rendering fails.
    """
    render_options = {
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
        # Highlights are the whole point of show(), so they are always on.
        'include_highlights': True,
    }
    return self.to_image(**render_options)
|
861
|
+
|
862
|
+
def save_image(self,
               filename: str,
               scale: float = 2.0,
               width: Optional[int] = None,
               labels: bool = True,
               legend_position: str = 'right',
               render_ocr: bool = False,
               include_highlights: bool = True,  # Allow saving without highlights
               resolution: Optional[float] = None,
               **kwargs) -> 'Page':
    """
    Render this page and write the image to ``filename``.

    All the work is done by ``to_image``, which is called with
    ``path=filename``; the image it returns is discarded.

    Args:
        filename: Path to save the image to.
        scale: Scale factor for rendering highlights.
        width: Optional width for the output image.
        labels: Whether to include a legend.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text.
        include_highlights: Whether to render highlights.
        resolution: Resolution for base image rendering.
        **kwargs: Additional arguments passed through to ``to_image``.

    Returns:
        Self for method chaining.
    """
    render_and_save = self.to_image
    render_and_save(
        path=filename,
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position,
        render_ocr=render_ocr,
        include_highlights=include_highlights,
        resolution=resolution,
        **kwargs,
    )
    return self
|
902
|
+
|
903
|
+
def clear_highlights(self) -> 'Page':
    """
    Remove the highlights registered for this page only.

    Calls ``self._highlighter.clear_page`` with this page's index.

    Returns:
        Self for method chaining
    """
    page_index = self.index
    self._highlighter.clear_page(page_index)
    return self
|
912
|
+
|
913
|
+
def analyze_text_styles(self, options: Optional[TextStyleOptions] = None) -> ElementCollection:
    """
    Run text style analysis on this page and return the processed elements.

    A new TextStyleAnalyzer is created for every call and asked to analyze
    this page. According to the original documentation of this method, the
    analyzer annotates each processed element (typically words) with:
    - style_label: A descriptive or numeric label for the style group.
    - style_key: A hashable tuple representing the style properties used for grouping.
    - style_properties: A dictionary containing the extracted style properties.

    Args:
        options: Optional TextStyleOptions to configure the analysis.
                 If None, the analyzer's default options are used.

    Returns:
        The collection returned by the analyzer: the processed text elements
        with their added style attributes.
    """
    # The analyzer is constructed without arguments, so it falls back to its
    # own defaults whenever options is None.
    return TextStyleAnalyzer().analyze(self, options=options)
|
940
|
+
|
941
|
+
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False,
             resolution: Optional[float] = None,
             include_highlights: bool = True,
             **kwargs) -> Optional[Image.Image]:
    """
    Generate a PIL image of the page, using HighlightingService if needed.

    With ``include_highlights=True`` rendering is delegated to
    ``self._highlighter.render_page``; otherwise the base image is taken
    from the underlying pdfplumber page. The result is optionally resized to
    ``width`` (keeping the aspect ratio) and optionally saved to ``path``.

    Args:
        path: Optional path to save the image to. Missing parent directories
            are created; a bare filename is saved relative to the current
            working directory.
        scale: Scale factor for rendering highlights.
        width: Optional width for the output image.
        labels: Whether to include a legend for highlights.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text on highlights.
        resolution: Resolution in DPI for base page image (default: scale * 72).
        include_highlights: Whether to render highlights.
        **kwargs: Additional parameters for pdfplumber.to_image.

    Returns:
        PIL Image of the page, or None if rendering fails. Resize and save
        failures are logged and do not prevent the image from being returned.
    """
    image = None
    try:
        if include_highlights:
            # Delegate rendering to the central service
            image = self._highlighter.render_page(
                page_index=self.index,
                scale=scale,
                labels=labels,
                legend_position=legend_position,
                render_ocr=render_ocr,
                resolution=resolution,
                **kwargs
            )
        else:
            # Get the base page image directly from pdfplumber if no highlights needed
            render_resolution = resolution if resolution is not None else scale * 72
            img_object = self._page.to_image(resolution=render_resolution, **kwargs)
            # Prefer the PIL image held by the pdfplumber wrapper; fall back to PNG bytes.
            image = img_object.annotated if hasattr(img_object, 'annotated') else img_object._repr_png_()
            if isinstance(image, bytes):  # Handle cases where it returns bytes
                from io import BytesIO
                image = Image.open(BytesIO(image)).convert('RGB')  # Convert to RGB for consistency
    except Exception as e:
        logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
        return None  # Return None on error

    if image is None:
        return None

    # Resize the final image if width is provided
    if width is not None and width > 0 and image.width > 0:
        aspect_ratio = image.height / image.width
        height = int(width * aspect_ratio)
        try:
            image = image.resize((width, height), Image.Resampling.LANCZOS)  # Use modern resampling
        except Exception as resize_error:
            logger.warning(f"Could not resize image: {resize_error}")

    # Save the image if path is provided
    if path:
        try:
            # os.path.dirname() is '' for a bare filename, and os.makedirs('')
            # raises; only create the directory when there is one to create.
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            image.save(path)
            logger.debug(f"Saved page image to: {path}")
        except Exception as save_error:
            logger.error(f"Failed to save image to {path}: {save_error}")

    return image
|
1018
|
+
|
1019
|
+
def _create_text_elements_from_ocr(self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None) -> List[TextElement]:
    """DEPRECATED shim: logs a warning, then forwards all arguments to
    ``self._element_mgr.create_text_elements_from_ocr`` and returns its result."""
    logger.warning("_create_text_elements_from_ocr is deprecated. Use self._element_mgr version.")
    convert = self._element_mgr.create_text_elements_from_ocr
    return convert(ocr_results, image_width, image_height)
|
1023
|
+
|
1024
|
+
def apply_ocr(
        self,
        engine: Optional[str] = None,
        options: Optional[OCROptions] = None,
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,
        device: Optional[str] = None,
) -> List[TextElement]:
    """
    Run OCR on this page by delegating to the parent's ``apply_ocr_to_pages``.

    The parent is called with ``pages=[self.index]`` and the OCR arguments
    unchanged. Afterwards the page's words are scanned for OCR elements.

    Args:
        engine: Forwarded to the parent unchanged.
        options: Forwarded to the parent unchanged.
        languages: Forwarded to the parent unchanged.
        min_confidence: Forwarded to the parent unchanged.
        device: Forwarded to the parent unchanged.

    Returns:
        Every element of ``self.words`` whose ``source`` attribute equals
        'ocr'. An empty list is returned when the parent has no
        ``apply_ocr_to_pages`` or when the delegated call raises.
    """
    parent = self._parent
    if not hasattr(parent, 'apply_ocr_to_pages'):
        logger.error(f"Page {self.number}: Parent PDF missing 'apply_ocr_to_pages'. Cannot apply OCR.")
        return []

    logger.info(f"Page {self.number}: Delegating apply_ocr to PDF.apply_ocr_to_pages.")
    try:
        # Restrict the document-level OCR run to this single page.
        parent.apply_ocr_to_pages(
            pages=[self.index],
            engine=engine,
            options=options,
            languages=languages,
            min_confidence=min_confidence,
            device=device,
        )
    except Exception as e:
        logger.error(f"Page {self.number}: Error during delegated OCR call: {e}", exc_info=True)
        return []

    # Collect the OCR-sourced words now present on the page.
    ocr_words = [word for word in self.words if getattr(word, 'source', None) == 'ocr']
    logger.debug(f"Page {self.number}: apply_ocr completed. Found {len(ocr_words)} OCR elements.")
    return ocr_words
|
1059
|
+
|
1060
|
+
def extract_ocr_elements(
        self,
        engine: Optional[str] = None,
        options: Optional[OCROptions] = None,
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,
        device: Optional[str] = None,
) -> List[TextElement]:
    """
    Extract text elements using OCR *without* adding them to the page's elements.

    The page is rendered without highlights at the scale given by the
    parent's ``_config['ocr_image_scale']`` (2.0 when absent), passed to
    ``self._ocr_manager.apply_ocr``, and each result is converted into a
    TextElement whose bbox is scaled from image pixels back to page units.

    Args:
        engine: Passed to the OCR manager as ``engine``.
        options: Passed to the OCR manager as ``options``.
        languages: Passed to the OCR manager only when not None.
        min_confidence: Passed to the OCR manager only when not None.
        device: Passed to the OCR manager only when not None.

    Returns:
        List of newly created TextElements. An empty list is returned when
        no OCR manager is available, when rendering fails, or when the OCR
        call raises.
    """
    if not self._ocr_manager:
        logger.error(f"Page {self.number}: OCRManager not available. Cannot extract OCR elements.")
        return []

    logger.info(f"Page {self.number}: Extracting OCR elements (extract only)...")
    try:
        parent_config = getattr(self._parent, '_config', {})
        ocr_scale = parent_config.get('ocr_image_scale', 2.0)
        # OCR must see the plain page, so highlights are left out.
        image = self.to_image(scale=ocr_scale, include_highlights=False)
        if not image:
            logger.error(f" Failed to render page {self.number} to image for OCR extraction.")
            return []
        logger.debug(f" Rendered image size: {image.width}x{image.height}")
    except Exception as e:
        logger.error(f" Failed to render page {self.number} to image: {e}", exc_info=True)
        return []

    # Optional overrides are only forwarded when the caller supplied them.
    ocr_kwargs = {'images': image, 'options': options, 'engine': engine}
    overrides = (('languages', languages), ('min_confidence', min_confidence), ('device', device))
    ocr_kwargs.update((key, value) for key, value in overrides if value is not None)

    loggable = {k: v for k, v in ocr_kwargs.items() if k != 'images'}
    logger.debug(f" Calling OCR Manager (extract only) with args: {loggable}")
    try:
        # apply_ocr returns List[List[Dict]] (batch mode) or List[Dict]
        raw = self._ocr_manager.apply_ocr(**ocr_kwargs)
        batch_mode = isinstance(raw, list) and raw and isinstance(raw[0], list)
        results = raw[0] if batch_mode else raw

        if not isinstance(results, list):
            logger.error(f" OCR Manager returned unexpected type: {type(results)}")
            results = []
        logger.info(f" OCR Manager returned {len(results)} results for extraction.")
    except Exception as e:
        logger.error(f" OCR processing failed during extraction: {e}", exc_info=True)
        return []

    # Convert results but DO NOT add to ElementManager
    logger.debug(f" Converting OCR results to TextElements (extract only)...")
    # Factors that map image pixel coordinates back to page coordinates.
    scale_x = self.width / image.width if image.width else 1
    scale_y = self.height / image.height if image.height else 1

    extracted = []
    for result in results:
        x0, top, x1, bottom = map(float, result['bbox'])
        extracted.append(TextElement({
            'text': result['text'], 'confidence': result['confidence'],
            'x0': x0 * scale_x, 'top': top * scale_y,
            'x1': x1 * scale_x, 'bottom': bottom * scale_y,
            'width': (x1 - x0) * scale_x, 'height': (bottom - top) * scale_y,
            'object_type': 'text', 'source': 'ocr',
            'fontname': 'OCR-temp', 'size': 10.0, 'page_number': self.number
        }, self))

    logger.info(f" Created {len(extracted)} TextElements from OCR (extract only).")
    return extracted
|
1129
|
+
|
1130
|
+
@property
def layout_analyzer(self) -> LayoutAnalyzer:
    """Return this page's LayoutAnalyzer, creating and caching it on first use.

    Returns None (after logging a warning) when no analyzer has been created
    yet and ``self._layout_manager`` is falsy.
    """
    if self._layout_analyzer is not None:
        return self._layout_analyzer

    if not self._layout_manager:
        logger.warning("LayoutManager not available, cannot create LayoutAnalyzer.")
        return None

    self._layout_analyzer = LayoutAnalyzer(self)
    return self._layout_analyzer
|
1139
|
+
|
1140
|
+
def analyze_layout(
        self,
        engine: Optional[str] = None,
        options: Optional[LayoutOptions] = None,
        confidence: Optional[float] = None,
        classes: Optional[List[str]] = None,
        exclude_classes: Optional[List[str]] = None,
        device: Optional[str] = None,
        existing: str = "replace"
) -> ElementCollection[Region]:
    """
    Run layout analysis for this page and return the detected regions.

    The work is done by ``self.layout_analyzer.analyze_layout``, which
    receives every argument unchanged. The return value is then built from
    ``self._element_mgr.regions``.

    Args:
        engine: Forwarded to the analyzer. When truthy, it is also used to
            keep only regions whose ``model`` attribute equals it.
        options: Forwarded to the analyzer.
        confidence: Forwarded to the analyzer.
        classes: Forwarded to the analyzer.
        exclude_classes: Forwarded to the analyzer.
        device: Forwarded to the analyzer.
        existing: Forwarded to the analyzer (default "replace").

    Returns:
        ElementCollection of regions whose ``source`` is 'detected'
        (additionally filtered by ``engine`` when given). An empty
        collection is returned when no layout analyzer is available.
    """
    analyzer = self.layout_analyzer
    if not analyzer:
        logger.error("Layout analysis failed: LayoutAnalyzer not initialized (is LayoutManager available?).")
        return ElementCollection([])

    # The analyzer stores its regions on the page / element manager itself,
    # so its return value is not needed here.
    analyzer.analyze_layout(
        engine=engine,
        options=options,
        confidence=confidence,
        classes=classes,
        exclude_classes=exclude_classes,
        device=device,
        existing=existing,
    )

    def engine_matches(region):
        # Without an explicit engine every detected region qualifies.
        return not engine or getattr(region, 'model', None) == engine

    detected = [
        region
        for region in self._element_mgr.regions
        if region.source == 'detected' and engine_matches(region)
    ]
    return ElementCollection(detected)
|
1180
|
+
|
1181
|
+
def get_section_between(self, start_element=None, end_element=None, boundary_inclusion='both') -> Optional[Region]:
    """
    Get a section between two elements on this page.

    A region covering the whole page is created and its own
    ``get_section_between`` is called with the same arguments.

    Args:
        start_element: Forwarded to the region method.
        end_element: Forwarded to the region method.
        boundary_inclusion: Forwarded to the region method (default 'both').

    Returns:
        The region method's result, or None if that call raises (the error
        is logged).
    """
    full_page = self.create_region(0, 0, self.width, self.height)

    try:
        section = full_page.get_section_between(
            start_element=start_element,
            end_element=end_element,
            boundary_inclusion=boundary_inclusion,
        )
    except Exception as e:
        logger.error(f"Error getting section between elements on page {self.index}: {e}", exc_info=True)
        return None
    return section
|
1198
|
+
|
1199
|
+
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 boundary_inclusion='both',
                 y_threshold=5.0,
                 bounding_box=None) -> 'ElementCollection[Region]':
    """
    Split the page into vertical sections delimited by boundary elements.

    Start and end elements are merged, sorted by (top, x0) and walked in
    order. A section opens at a start element and closes at the next end
    element, at the next start element, or at the bottom bound if nothing
    closes it. Each region gets ``start_element``, ``end_element`` and
    ``is_end_next_start`` attributes.

    Args:
        start_elements: Selector string (resolved with ``self.find_all``),
            an object with an ``elements`` attribute, or an iterable of
            elements. Nothing is returned without start elements.
        end_elements: Same accepted forms as ``start_elements``.
        boundary_inclusion: One of 'start', 'end', 'both', 'none'. Decides
            whether a section begins at the start element's top or bottom,
            and ends at the closing element's bottom or top.
        y_threshold: Accepted but not used by this implementation.
        bounding_box: Optional (x0, top, x1, bottom), clamped to the page.
            It supplies the horizontal extent of every section and the
            bottom of an unclosed final section.

    Returns:
        An ElementCollection containing the found Region objects.

    Raises:
        ValueError: If ``boundary_inclusion`` is not one of the valid values.
    """
    def limits():
        # (x0, top, x1, bottom) of the working area: the clamped
        # bounding_box when given, otherwise the whole page.
        if not bounding_box:
            return 0, 0, self.width, self.height
        left, upper, right, lower = bounding_box
        return max(0, left), max(0, upper), min(self.width, right), min(self.height, lower)

    def as_element_list(spec):
        # Accept a selector string, a collection exposing .elements, or a list.
        if isinstance(spec, str):
            spec = self.find_all(spec).elements
        elif hasattr(spec, 'elements'):
            spec = spec.elements
        return [] if spec is None else spec

    starts = as_element_list(start_elements)
    ends = as_element_list(end_elements)

    valid_inclusions = ['start', 'end', 'both', 'none']
    if boundary_inclusion not in valid_inclusions:
        raise ValueError(f"boundary_inclusion must be one of {valid_inclusions}")

    if not starts:
        return ElementCollection([])

    keep_start = boundary_inclusion in ('start', 'both')
    keep_end = boundary_inclusion in ('end', 'both')

    # Tag each boundary with its role, then order them top-to-bottom, left-to-right.
    markers = [(el, 'start') for el in starts] + [(el, 'end') for el in ends]
    try:
        markers.sort(key=lambda pair: (pair[0].top, pair[0].x0))
    except AttributeError as e:
        logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
        return ElementCollection([])  # Cannot proceed if elements lack position

    sections = []

    def section_top(first):
        return first.top if keep_start else first.bottom

    def close_section(first, last, ended_by_next_start):
        # Emit the region between two boundary elements, if it has positive height.
        upper = section_top(first)
        lower = last.bottom if keep_end else last.top
        if upper < lower:
            left, _, right, _ = limits()
            section = self.create_region(left, upper, right, lower)
            section.start_element = first
            section.end_element = last
            section.is_end_next_start = ended_by_next_start
            sections.append(section)

    open_start = None  # start element of the section currently being built
    for marker, role in markers:
        if role == 'start':
            if open_start is not None:
                # A new start implicitly terminates the section in progress.
                close_section(open_start, marker, True)
            open_start = marker
        elif open_start is not None:
            # Explicit end for the section in progress; stray ends are ignored.
            close_section(open_start, marker, False)
            open_start = None

    # A section that was opened but never closed runs to the bottom bound.
    if open_start is not None:
        upper = section_top(open_start)
        left, _, right, floor = limits()
        if upper < floor:
            section = self.create_region(left, upper, right, floor)
            section.start_element = open_start
            section.end_element = None  # Ended by page end
            section.is_end_next_start = False
            sections.append(section)

    return ElementCollection(sections)
|
1315
|
+
|
1316
|
+
def __repr__(self) -> str:
    """Return a short debug representation showing the page number and index."""
    return "<Page number={} index={}>".format(self.number, self.index)
|
1319
|
+
|
1320
|
+
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Ask a question about the page content using document QA.

    The QA engine is imported lazily from ``natural_pdf.qa.document_qa`` and
    asked via its ``ask_pdf_page`` method.

    Args:
        question: The question to ask.
        min_confidence: Forwarded to the QA engine.
        model: Optional model name; when truthy it is passed to
            ``get_qa_engine`` as ``model_name``.
        debug: Forwarded to the QA engine.
        **kwargs: Forwarded to the QA engine.

    Returns:
        The QA engine's result. If the QA module cannot be imported or any
        other exception occurs, the error is logged and a "not found" dict
        (answer None, confidence 0.0, found False) is returned.
    """
    try:
        from natural_pdf.qa.document_qa import get_qa_engine

        # Only name a model when the caller asked for one.
        if model:
            qa_engine = get_qa_engine(model_name=model)
        else:
            qa_engine = get_qa_engine()
        return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
    except ImportError:
        logger.error("Question answering requires the 'natural_pdf.qa' module. Please install necessary dependencies.")
    except Exception as e:
        logger.error(f"Error during page.ask: {e}", exc_info=True)

    # Both failure paths share the same fallback answer.
    return {"answer": None, "confidence": 0.0, "found": False, "page_num": self.number, "source_elements": []}
|
1336
|
+
|
1337
|
+
def show_preview(self,
                 temporary_highlights: List[Dict],
                 scale: float = 2.0,
                 width: Optional[int] = None,
                 labels: bool = True,
                 legend_position: str = 'right',
                 render_ocr: bool = False) -> Optional[Image.Image]:
    """
    Generates and returns a non-stateful preview image containing only
    the provided temporary highlights.

    Rendering is delegated to ``self._highlighter.render_preview``. When
    ``width`` is given, the rendered image is resized to that width while
    keeping its aspect ratio, in the same way ``to_image`` does.

    Args:
        temporary_highlights: List of highlight data dictionaries (as prepared by
                              ElementCollection._prepare_highlight_data).
        scale: Scale factor for rendering.
        width: Optional width for the output image.
        labels: Whether to include a legend.
        legend_position: Position of the legend.
        render_ocr: Whether to render OCR text.

    Returns:
        PIL Image object of the preview, or None if rendering fails. If
        resizing fails, a warning is logged and the unresized image is
        returned.
    """
    try:
        # Delegate rendering to the highlighter service's preview method
        img = self._highlighter.render_preview(
            page_index=self.index,
            temporary_highlights=temporary_highlights,
            scale=scale,
            labels=labels,
            legend_position=legend_position,
            render_ocr=render_ocr
        )
    except AttributeError:
        logger.error(f"HighlightingService does not have the required 'render_preview' method.")
        return None
    except Exception as e:
        logger.error(f"Error calling highlighter.render_preview for page {self.index}: {e}", exc_info=True)
        return None

    # render_preview is not given the width, so honor it here; previously
    # the documented width argument was silently ignored.
    if img is not None and width is not None and width > 0 and img.width > 0:
        aspect_ratio = img.height / img.width
        height = int(width * aspect_ratio)
        try:
            img = img.resize((width, height), Image.Resampling.LANCZOS)
        except Exception as resize_error:
            logger.warning(f"Could not resize image: {resize_error}")

    return img
|
1379
|
+
|
1380
|
+
@property
def text_style_labels(self) -> List[str]:
    """
    Get a sorted list of unique text style labels found on the page.

    The labels are read from ``self._text_styles_summary``. If that
    attribute is missing or empty, ``analyze_text_styles()`` is first run
    with default options; to use custom options, call
    ``analyze_text_styles(options=...)`` explicitly beforehand.

    Returns:
        A sorted list of unique style label strings, or an empty list when
        no summary is available even after running the analysis.
    """
    summary = getattr(self, '_text_styles_summary', None)
    if not summary:
        # No usable summary from a previous run: analyze with defaults now.
        logger.debug(f"Page {self.number}: Running default text style analysis to get labels.")
        self.analyze_text_styles()
        summary = getattr(self, '_text_styles_summary', None)

    if not summary:
        # Summary still absent (e.g., no text elements on the page).
        logger.warning(f"Page {self.number}: Text style summary not found after analysis.")
        return []

    # The summary maps style_key -> {'label': ..., 'properties': ...}
    unique_labels = {style_info['label'] for style_info in summary.values()}
    return sorted(unique_labels)
|
1406
|
+
|
1407
|
+
def viewer(self) -> 'SimpleInteractiveViewerWidget':
    """
    Creates and returns an interactive ipywidget for exploring elements on this page.

    Uses SimpleInteractiveViewerWidget.from_page() to create the viewer.

    Returns:
        A SimpleInteractiveViewerWidget instance ready for display in Jupyter.

    Raises:
        RuntimeError: If the widget class cannot be imported (e.g. ipywidgets
            or natural_pdf.widgets is missing), if from_page raises for any
            reason (the original exception is chained as ``__cause__``), or
            if from_page returns None.
    """
    logger.info(f"Generating interactive viewer for Page {self.number} using SimpleInteractiveViewerWidget.from_page...")

    try:
        # Imported lazily because the widget stack is an optional dependency.
        # The import must sit inside the try so a missing dependency surfaces
        # as the documented RuntimeError instead of a raw ImportError.
        from natural_pdf.widgets.viewer import SimpleInteractiveViewerWidget

        # Delegate creation entirely to the from_page class method
        viewer_widget = SimpleInteractiveViewerWidget.from_page(self)
    except ImportError as e:
        logger.error("Failed to import SimpleInteractiveViewerWidget. Ensure natural_pdf.widgets and ipywidgets are installed.")
        raise RuntimeError("Widget class not found. ipywidgets or natural_pdf.widgets might be missing or setup incorrect.") from e
    except Exception as e:
        logger.error(f"Failed to create interactive viewer: {e}", exc_info=True)
        # Re-raise the exception to make it visible to the user
        raise RuntimeError(f"Failed to create interactive viewer: {e}") from e
    else:
        # Checked outside the try body so this RuntimeError is not caught and
        # re-wrapped by the generic handler above.
        if viewer_widget is None:
            logger.error("SimpleInteractiveViewerWidget.from_page returned None.")
            raise RuntimeError("SimpleInteractiveViewerWidget.from_page returned None, indicating an issue during widget creation.")

    logger.info("Interactive viewer widget created successfully.")
    return viewer_widget
|