natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1345 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
|
4
|
+
from natural_pdf.ocr import OCROptions
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from natural_pdf.core.page import Page
|
10
|
+
from natural_pdf.elements.region import Region
|
11
|
+
|
12
|
+
T = TypeVar('T')
|
13
|
+
P = TypeVar('P', bound='Page')
|
14
|
+
|
15
|
+
class ElementCollection(Generic[T]):
|
16
|
+
"""
|
17
|
+
Collection of PDF elements with batch operations.
|
18
|
+
"""
|
19
|
+
|
20
|
+
def __init__(self, elements: List[T]):
    """
    Create a collection wrapping the given elements.

    Args:
        elements: List of Element objects. Any falsy value (for example
            ``None`` or an empty list) is replaced by a new empty list.
    """
    if elements:
        self._elements = elements
    else:
        self._elements = []
|
28
|
+
|
29
|
+
def __len__(self) -> int:
    """Return how many elements the collection currently holds."""
    items = self._elements
    return len(items)
|
32
|
+
|
33
|
+
def __getitem__(self, index: int) -> 'Element':
    """Look up and return the element stored at position ``index``."""
    items = self._elements
    return items[index]
|
36
|
+
|
37
|
+
def __iter__(self):
    """Return an iterator that walks the elements in their stored order."""
    items = self._elements
    return iter(items)
|
40
|
+
|
41
|
+
def __repr__(self) -> str:
    """Describe the collection by its element type name and its size."""
    type_names = {type(item).__name__ for item in self._elements}
    # Exactly one distinct type names the collection; an empty or
    # heterogeneous collection is reported as "Mixed".
    if len(type_names) == 1:
        kind = type_names.pop()
    else:
        kind = "Mixed"
    return f"<ElementCollection[{kind}](count={len(self)})>"
|
49
|
+
|
50
|
+
@property
def elements(self) -> List['Element']:
    """Expose the underlying list of elements held by this collection."""
    items = self._elements
    return items
|
54
|
+
|
55
|
+
@property
def first(self) -> Optional['Element']:
    """Return the element at the front of the collection, or None if it is empty."""
    if not self._elements:
        return None
    return self._elements[0]
|
59
|
+
|
60
|
+
@property
def last(self) -> Optional['Element']:
    """Return the element at the end of the collection, or None if it is empty."""
    if not self._elements:
        return None
    return self._elements[-1]
|
64
|
+
|
65
|
+
def highest(self) -> Optional['Element']:
    """
    Find the element whose ``top`` value is smallest (highest on the page).

    Returns:
        The element with the smallest ``top``, or None for an empty collection.
        When several elements tie, the earliest one in the collection wins.

    Raises:
        ValueError: If the elements are spread over multiple pages.
    """
    candidates = self._elements
    if not candidates:
        return None

    # Coordinates from different pages are not comparable.
    spans_pages = self._are_on_multiple_pages()
    if spans_pages:
        raise ValueError("Cannot determine highest element across multiple pages")

    def top_edge(element):
        return element.top

    return min(candidates, key=top_edge)
|
83
|
+
|
84
|
+
def lowest(self) -> Optional['Element']:
    """
    Find the element whose ``bottom`` value is largest (lowest on the page).

    Returns:
        The element with the largest ``bottom``, or None for an empty collection.
        When several elements tie, the earliest one in the collection wins.

    Raises:
        ValueError: If the elements are spread over multiple pages.
    """
    candidates = self._elements
    if not candidates:
        return None

    # Coordinates from different pages are not comparable.
    spans_pages = self._are_on_multiple_pages()
    if spans_pages:
        raise ValueError("Cannot determine lowest element across multiple pages")

    def bottom_edge(element):
        return element.bottom

    return max(candidates, key=bottom_edge)
|
102
|
+
|
103
|
+
def leftmost(self) -> Optional['Element']:
    """
    Find the element whose ``x0`` value is smallest (leftmost on the page).

    Returns:
        The element with the smallest ``x0``, or None for an empty collection.
        When several elements tie, the earliest one in the collection wins.

    Raises:
        ValueError: If the elements are spread over multiple pages.
    """
    candidates = self._elements
    if not candidates:
        return None

    # Coordinates from different pages are not comparable.
    spans_pages = self._are_on_multiple_pages()
    if spans_pages:
        raise ValueError("Cannot determine leftmost element across multiple pages")

    def left_edge(element):
        return element.x0

    return min(candidates, key=left_edge)
|
121
|
+
|
122
|
+
def rightmost(self) -> Optional['Element']:
    """
    Find the element whose ``x1`` value is largest (rightmost on the page).

    Returns:
        The element with the largest ``x1``, or None for an empty collection.
        When several elements tie, the earliest one in the collection wins.

    Raises:
        ValueError: If the elements are spread over multiple pages.
    """
    candidates = self._elements
    if not candidates:
        return None

    # Coordinates from different pages are not comparable.
    spans_pages = self._are_on_multiple_pages()
    if spans_pages:
        raise ValueError("Cannot determine rightmost element across multiple pages")

    def right_edge(element):
        return element.x1

    return max(candidates, key=right_edge)
|
140
|
+
|
141
|
+
def _are_on_multiple_pages(self) -> bool:
    """
    Check if elements in this collection span multiple pages.

    Every element that exposes a non-None ``page`` contributes its
    ``page.index``. Elements without a ``page`` attribute (or whose page is
    None) are ignored, so a page-less element at the front of the collection
    no longer hides a page mismatch among the remaining elements.

    Returns:
        True if at least two distinct page indices are found, False otherwise
        (including for an empty collection).
    """
    seen_indices = set()
    for element in self._elements:
        page = getattr(element, 'page', None)
        if page is None:
            continue
        seen_indices.add(page.index)
        # Stop as soon as a second distinct page shows up.
        if len(seen_indices) > 1:
            return True
    return False
|
159
|
+
|
160
|
+
def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
    """
    Build a new collection without the elements that fall inside any given region.

    Args:
        regions: List of Region objects to exclude

    Returns:
        New ElementCollection holding only the elements for which no region's
        ``_is_element_in_region`` check is truthy. With no regions, the new
        collection wraps the current element list unchanged.
    """
    if not regions:
        return ElementCollection(self._elements)

    def is_excluded(element) -> bool:
        # any() stops at the first region that claims the element.
        return any(region._is_element_in_region(element) for region in regions)

    kept = [element for element in self._elements if not is_excluded(element)]
    return ElementCollection(kept)
|
184
|
+
|
185
|
+
def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
    """
    Collect the text of every element in the collection into one string.

    Only elements that have an ``extract_text`` attribute take part. They are
    visited in reading order (by ``top``, then ``x0``), and empty results are
    dropped before the pieces are joined with single spaces.

    Args:
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True)
        **kwargs: Additional extraction parameters forwarded to each element

    Returns:
        Combined text from all elements
    """
    def reading_position(element):
        return (element.top, element.x0)

    extractable = [item for item in self._elements if hasattr(item, 'extract_text')]
    extractable.sort(key=reading_position)

    pieces = (
        item.extract_text(preserve_whitespace=preserve_whitespace, use_exclusions=use_exclusions, **kwargs)
        for item in extractable
    )
    return " ".join([piece for piece in pieces if piece])
|
213
|
+
|
214
|
+
def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
    """
    Keep only the elements accepted by a predicate.

    Args:
        func: Function that takes an element and returns True to keep it

    Returns:
        New ElementCollection with the accepted elements, in their original order
    """
    kept = []
    for candidate in self._elements:
        if func(candidate):
            kept.append(candidate)
    return ElementCollection(kept)
|
225
|
+
|
226
|
+
def sort(self, key=None, reverse=False) -> 'ElementCollection':
    """
    Reorder the collection's elements in place.

    Args:
        key: Function to generate a key for sorting
        reverse: Whether to sort in descending order

    Returns:
        Self for method chaining
    """
    items = self._elements
    items.sort(key=key, reverse=reverse)
    return self
|
239
|
+
|
240
|
+
def highlight(self,
              label: Optional[str] = None,
              color: Optional[Union[Tuple, str]] = None,
              group_by: Optional[str] = None,
              label_format: Optional[str] = None,
              distinct: bool = False,
              include_attrs: Optional[List[str]] = None,
              replace: bool = False) -> 'ElementCollection':
    """
    Register persistent highlights for every element in the collection.

    The per-element highlight parameters (color, label, geometry) come from
    ``_prepare_highlight_data``; this method then hands each entry to the
    highlighter found on the first element's page. Highlights are appended to
    existing ones unless ``replace=True``, in which case the pages touched by
    this call are cleared first.

    Args:
        label: Optional explicit label for the entire collection. If provided,
            all elements are highlighted as a single group with this label,
            ignoring 'group_by' and the default type-based grouping.
        color: Optional explicit color for the highlight (tuple/string).
        group_by: Optional attribute name present on the elements. If provided
            (and 'label' is None), elements are grouped by the value of this
            attribute and each group gets its own label and color.
        label_format: Optional format string for the group label when
            'group_by' is used (e.g., "Type: {region_type}, Conf: {confidence:.2f}").
            If None, the attribute value itself is used as the label.
        distinct: If True, bypasses all grouping and highlights each element
            individually with cycling colors. (default: False)
        include_attrs: List of attribute names from the element to display
            directly on the highlight itself (distinct from group label).
        replace: If True, existing highlights on the affected page(s) are
            cleared before adding these highlights. If False (default),
            highlights are appended to existing ones.

    Returns:
        Self for method chaining
    """
    # Step 1: work out what to draw. The 'replace' flag is not part of that.
    prepared = self._prepare_highlight_data(
        distinct=distinct,
        label=label,
        color=color,
        group_by=group_by,
        label_format=label_format,
        include_attrs=include_attrs,
    )
    if not prepared:
        return self  # Nothing to add

    # Step 2: locate the highlighter via the first element (assumes all
    # elements share it).
    anchor = self._elements[0]
    if not (hasattr(anchor, 'page') and hasattr(anchor.page, '_highlighter')):
        logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
        return self
    highlighter = anchor.page._highlighter

    # Step 3: when replacing, wipe every page this call touches before adding.
    if replace:
        pages_to_clear = {entry['page_index'] for entry in prepared}
        logger.debug(f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}")
        for page_idx in pages_to_clear:
            highlighter.clear_page(page_idx)

    # Step 4: add each prepared highlight. Polygon geometry wins over bbox.
    for entry in prepared:
        add_args = {
            "page_index": entry['page_index'],
            "color": entry['color'],
            "label": entry['label'],
            "use_color_cycling": entry.get('use_color_cycling', False),
            "element": entry['element'],
            "include_attrs": entry['include_attrs'],
            # Clearing (if any) already happened above, so always append here.
            "existing": 'append',
        }
        polygon = entry.get('polygon')
        if polygon:
            add_args["polygon"] = polygon
            highlighter.add_polygon(**add_args)
            continue
        bbox = entry.get('bbox')
        if bbox:
            add_args["bbox"] = bbox
            highlighter.add(**add_args)
            continue
        logger.warning(f"Skipping highlight data, no bbox or polygon found: {entry}")

    return self
|
347
|
+
|
348
|
+
def _prepare_highlight_data(self,
                            distinct: bool = False,
                            label: Optional[str] = None,
                            color: Optional[Union[Tuple, str]] = None,
                            group_by: Optional[str] = None,
                            label_format: Optional[str] = None,
                            include_attrs: Optional[List[str]] = None) -> List[Dict]:
    """
    Work out the color and label of each element's highlight.

    Nothing is added to the HighlightingService here; it is only asked to
    determine colors. The first applicable strategy is used:

    1. ``distinct``: one cycling color per element, no label.
    2. ``label``: a single label and color shared by all elements.
    3. ``group_by``: one label and color per attribute-value group.
    4. default: a label derived from the element type name ("Mixed Elements"
       plus a warning when the collection holds several types).

    Returns:
        List of dictionaries, one per element for which
        ``_get_element_highlight_params`` returned data, each extended with
        'color' and 'label' (and 'use_color_cycling' in distinct mode).
        Empty when the collection is empty or no highlighter is reachable
        from the first element.
    """
    prepared_data = []
    if not self._elements:
        return prepared_data

    # Colors are determined by the HighlightingService, reached through the
    # first element's page.
    anchor = self._elements[0]
    if not (hasattr(anchor, 'page') and hasattr(anchor.page, '_highlighter')):
        logger.warning("Cannot determine highlight colors: HighlightingService not accessible from elements.")
        return []
    highlighter = anchor.page._highlighter

    def collect(members, shared_color, shared_label):
        # Append one entry per member that yields highlight params.
        for member in members:
            params = self._get_element_highlight_params(member, include_attrs)
            if params:
                params.update({'color': shared_color, 'label': shared_label})
                prepared_data.append(params)

    if distinct:
        logger.debug("_prepare: Distinct highlighting strategy.")
        for member in self._elements:
            # The color is requested before the params lookup, so the cycle
            # advances even for elements that end up skipped.
            cycled_color = highlighter._determine_highlight_color(label=None, color_input=None, use_color_cycling=True)
            params = self._get_element_highlight_params(member, include_attrs)
            if not params:
                continue
            params.update({
                'color': cycled_color,
                'label': None,
                'use_color_cycling': True
            })
            prepared_data.append(params)
        return prepared_data

    if label is not None:
        logger.debug(f"_prepare: Explicit label '{label}' strategy.")
        label_color = highlighter._determine_highlight_color(label=label, color_input=color, use_color_cycling=False)
        collect(self._elements, label_color, label)
        return prepared_data

    if group_by is not None:
        logger.debug("_prepare: Grouping by attribute strategy.")
        groups = self._group_elements_by_attr(group_by)
        for group_key, members in groups.items():
            if not members:
                continue
            group_label = self._format_group_label(group_key, label_format, members[0], group_by)
            group_color = highlighter._determine_highlight_color(label=group_label, color_input=None, use_color_cycling=False)
            logger.debug(f" _prepare group '{group_label}' ({len(members)} elements) -> color {group_color}")
            collect(members, group_color, group_label)
        return prepared_data

    logger.debug("_prepare: Default grouping strategy.")
    element_types = {type(el).__name__ for el in self._elements}

    if len(element_types) == 1:
        type_name = element_types.pop()
        # Strip the "Element"/"Region" suffixes, but keep a bare "Region" as is.
        if type_name != "Region":
            base_name = type_name.replace("Element", "").replace("Region", "")
        else:
            base_name = "Region"
        auto_label = f"{base_name} Elements" if base_name else "Elements"
    else:
        # Mixed types: generic label plus a hint about better options.
        type_names_str = ", ".join(sorted(element_types))
        auto_label = "Mixed Elements"
        logger.warning(
            f"Highlighting collection with mixed element types ({type_names_str}) "
            f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
            f"or 'distinct=True' for more specific highlighting."
        )

    default_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
    logger.debug(f" _prepare default group '{auto_label}' -> color {default_color}")
    collect(self._elements, default_color, auto_label)

    return prepared_data
|
454
|
+
|
455
|
+
def _call_element_highlighter(self, element: T,
                              color: Optional[Union[Tuple, str]],
                              label: Optional[str],
                              use_color_cycling: bool,
                              include_attrs: Optional[List[str]],
                              existing: str):
    """
    Low-level helper that forwards one element to the page's highlighter.

    Elements whose ``has_polygon`` attribute is truthy are sent to
    ``add_polygon`` with their ``polygon``; all others are sent to ``add``
    with their ``bbox``. Elements without a reachable highlighter, or with no
    usable geometry, are skipped with a warning. Exceptions raised by the
    highlighter call itself are logged and not re-raised.
    """
    if not (hasattr(element, 'page') and hasattr(element.page, '_highlighter')):
        logger.warning(f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}")
        return

    page = element.page
    call_kwargs = {
        "page_index": page.index,
        "color": color,
        "label": label,
        "use_color_cycling": use_color_cycling,
        "include_attrs": include_attrs,
        "existing": existing,
        "element": element
    }

    # Choose the geometry attribute and the matching service method by name;
    # the method itself is only looked up once geometry is known to exist.
    if getattr(element, 'has_polygon', False):
        geometry_key, method_name = 'polygon', 'add_polygon'
    else:
        geometry_key, method_name = 'bbox', 'add'

    geometry = getattr(element, geometry_key, None)
    if not geometry:
        logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")
        return

    call_kwargs[geometry_key] = geometry
    add_method = getattr(page._highlighter, method_name)
    if not add_method:
        return

    try:
        add_method(**call_kwargs)
    except Exception as e:
        logger.error(f"Error calling highlighter method for element {element} on page {page.index}: {e}", exc_info=True)
|
499
|
+
|
500
|
+
def _highlight_as_single_group(self, label: str,
                               color: Optional[Union[Tuple, str]],
                               include_attrs: Optional[List[str]],
                               existing: str):
    """Highlight every element under one shared explicit label and color."""
    shared_kwargs = {
        'color': color,                # Explicit color, if one was provided
        'label': label,                # The explicit group label
        'use_color_cycling': False,    # Keep the color consistent for the label
        'include_attrs': include_attrs,
        'existing': existing,
    }
    for member in self._elements:
        self._call_element_highlighter(element=member, **shared_kwargs)
|
514
|
+
|
515
|
+
def _highlight_grouped_by_attribute(self, group_by: str,
                                    label_format: Optional[str],
                                    include_attrs: Optional[List[str]],
                                    existing: str):
    """
    Bucket elements by an attribute value and highlight each bucket under its own label.

    Elements whose attribute is missing or None share a "Missing '<attr>'"
    bucket; list/dict values (and any other unhashable value) are bucketed by
    their string form. Each bucket's label is either the stringified key or,
    when ``label_format`` is given, that format string filled from the first
    element's ``__dict__`` (falling back to the key if formatting fails).
    """
    buckets: Dict[Any, List[T]] = {}

    # Pass 1: group elements by the attribute value.
    for member in self._elements:
        try:
            group_key = getattr(member, group_by, None)
            if group_key is None:
                # Element has no (or a None) value for the attribute
                group_key = f"Missing '{group_by}'"
            elif isinstance(group_key, (list, dict)):
                # Known unhashable containers are keyed by their string form
                group_key = str(group_key)
            buckets.setdefault(group_key, []).append(member)
        except AttributeError:
            logger.warning(f"Attribute '{group_by}' not found on element {member}. Skipping grouping.")
            group_key = f"Error accessing '{group_by}'"
            buckets.setdefault(group_key, []).append(member)
        except TypeError:  # Raised when the key turns out to be unhashable
            logger.warning(f"Attribute value for '{group_by}' on {member} is unhashable ({type(group_key)}). Using string representation.")
            group_key = str(group_key)
            buckets.setdefault(group_key, []).append(member)

    # Pass 2: derive a label per bucket and highlight its members.
    for group_key, members in buckets.items():
        if not members:
            continue

        if not label_format:
            group_label = str(group_key)  # The attribute value is the label
        else:
            try:
                # Format from the first member's attributes, making sure the
                # grouping attribute carries the (possibly stringified) key.
                format_values = members[0].__dict__.copy()
                format_values[group_by] = group_key
                group_label = label_format.format(**format_values)
            except KeyError as e:
                logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
                group_label = str(group_key)
            except Exception as format_e:
                logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
                group_label = str(group_key)

        logger.debug(f" Highlighting group '{group_label}' ({len(members)} elements)")

        for member in members:
            self._call_element_highlighter(
                element=member,
                color=None,                # No explicit color; chosen downstream from the label
                label=group_label,
                use_color_cycling=False,   # Keep the color consistent for the label
                include_attrs=include_attrs,
                existing=existing
            )
|
583
|
+
|
584
|
+
def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
    """DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
    # The main highlight path no longer calls this method; distinct
    # highlighting is handled inside _prepare_highlight_data.
    per_element_kwargs = {
        'color': None,               # No explicit color, so colors can cycle
        'label': None,               # Distinct elements carry no label
        'use_color_cycling': True,   # Force cycling
        'include_attrs': include_attrs,
        'existing': existing,
    }
    for member in self._elements:
        self._call_element_highlighter(element=member, **per_element_kwargs)
|
597
|
+
|
598
|
+
def show(self,
         # --- Visualization Parameters ---
         group_by: Optional[str] = None,
         label: Optional[str] = None,
         color: Optional[Union[Tuple, str]] = None,
         label_format: Optional[str] = None,
         distinct: bool = False,
         include_attrs: Optional[List[str]] = None,
         # --- Rendering Parameters ---
         scale: float = 2.0,
         labels: bool = True,  # Named 'labels' to match the service
         legend_position: str = 'right',
         render_ocr: bool = False) -> Optional['Image.Image']:
    """
    Render a temporary preview of the page with this collection's elements highlighted.

    The highlight data is built with ``_prepare_highlight_data`` (same
    grouping options as ``highlight()``) and passed to the highlighting
    service's ``render_preview`` as temporary highlights; nothing is added to
    the persistent highlights. All elements must be on the same page.

    Args:
        group_by: Attribute name to group elements by for distinct colors/labels.
        label: Explicit label for all elements (overrides group_by).
        color: Explicit color for all elements (if label used) or base color.
        label_format: Format string for group labels if group_by is used.
        distinct: Highlight each element distinctly (overrides group_by/label).
        include_attrs: Attributes to display on individual highlights.
        scale: Scale factor for rendering image.
        labels: Whether to include a legend for the temporary highlights.
        legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
        render_ocr: Whether to render OCR text.

    Returns:
        Whatever ``render_preview`` returns (annotated as a PIL Image), or
        None if the first element has no page, the page has no PDF or
        highlighting service, or rendering raises.

    Raises:
        ValueError: If the collection is empty or elements are on different pages.
    """
    if not self._elements:
        raise ValueError("Cannot show an empty collection.")

    if self._are_on_multiple_pages():
        raise ValueError("show() currently only supports collections where all elements are on the same page.")

    # The page and its highlighting service are taken from the first element.
    page = getattr(self._elements[0], 'page', None)
    if not page:
        logger.warning("Cannot show collection: First element has no associated page.")
        return None
    if not getattr(page, 'pdf', None):
        logger.warning("Cannot show collection: Page has no associated PDF object.")
        return None

    service = page._highlighter
    if not service:
        logger.warning("Cannot show collection: PDF object has no highlighting service.")
        return None

    # 1. Build the temporary highlight entries (a list of dicts).
    temporary = self._prepare_highlight_data(
        distinct=distinct,
        label=label,
        color=color,
        group_by=group_by,
        label_format=label_format,
        include_attrs=include_attrs,
    )
    if not temporary:
        logger.warning("No highlight data generated for show(). Rendering clean page.")
        # Fall through and render the page without temporary highlights.
        temporary = []

    # 2. Ask the service for the preview; any failure is logged, not raised.
    try:
        preview_kwargs = {
            'page_index': page.index,
            'temporary_highlights': temporary,
            'scale': scale,
            'labels': labels,
            'legend_position': legend_position,
            'render_ocr': render_ocr,
        }
        return service.render_preview(**preview_kwargs)
    except Exception as e:
        logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
        return None
|
690
|
+
|
691
|
+
def save(self,
         filename: str,
         scale: float = 2.0,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = 'right',
         render_ocr: bool = False) -> 'ElementCollection':
    """
    Render the collection's page to an image file and return the collection.

    This is a fluent convenience wrapper: every option is forwarded unchanged
    to ``to_image``, with ``filename`` supplied as its ``path`` argument.

    Args:
        filename: Path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Self for method chaining
    """
    render_options = {
        'path': filename,
        'scale': scale,
        'width': width,
        'labels': labels,
        'legend_position': legend_position,
        'render_ocr': render_ocr,
    }
    # The image object returned by to_image is intentionally discarded;
    # callers of save() only get the collection back.
    self.to_image(**render_options)
    return self
|
722
|
+
|
723
|
+
def to_image(self,
             path: Optional[str] = None,
             scale: float = 2.0,
             width: Optional[int] = None,
             labels: bool = True,
             legend_position: str = 'right',
             render_ocr: bool = False) -> Optional['Image.Image']:
    """
    Generate an image of the page this collection's elements live on,
    optionally saving it to a file.

    The page is taken from the first element of the collection and all
    arguments are forwarded to that page's ``to_image`` method.

    NOTE(review): the elements themselves are not passed to the page here, so
    any highlighting presumably relies on highlights already registered on
    the page -- confirm against ``Page.to_image``.

    Args:
        path: Optional path to save the image to
        scale: Scale factor for rendering
        width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
        labels: Whether to include a legend for labels
        legend_position: Position of the legend
        render_ocr: Whether to render OCR text with white background boxes

    Returns:
        Whatever the page's ``to_image`` returns, or None if the collection is
        empty or its first element has no page.
    """
    if not self._elements:
        return None

    # An element may lack the attribute entirely or carry page=None; both
    # mean "no valid page" and must yield None rather than an AttributeError.
    page = getattr(self._elements[0], 'page', None)
    if page is None:
        return None

    return page.to_image(
        path=path,
        scale=scale,
        width=width,
        labels=labels,
        legend_position=legend_position,
        render_ocr=render_ocr,
    )
|
758
|
+
|
759
|
+
def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
|
760
|
+
"""Groups elements by the specified attribute."""
|
761
|
+
grouped_elements: Dict[Any, List[T]] = {}
|
762
|
+
for element in self._elements:
|
763
|
+
try:
|
764
|
+
group_key = getattr(element, group_by, None)
|
765
|
+
if group_key is None: # Handle elements missing the attribute
|
766
|
+
group_key = f"Missing '{group_by}'"
|
767
|
+
# Ensure group_key is hashable (convert list/dict if necessary)
|
768
|
+
if isinstance(group_key, (list, dict)):
|
769
|
+
group_key = str(group_key)
|
770
|
+
|
771
|
+
if group_key not in grouped_elements:
|
772
|
+
grouped_elements[group_key] = []
|
773
|
+
grouped_elements[group_key].append(element)
|
774
|
+
except AttributeError:
|
775
|
+
logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
|
776
|
+
group_key = f"Error accessing '{group_by}'"
|
777
|
+
if group_key not in grouped_elements:
|
778
|
+
grouped_elements[group_key] = []
|
779
|
+
grouped_elements[group_key].append(element)
|
780
|
+
except TypeError: # Handle unhashable types
|
781
|
+
logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
|
782
|
+
group_key = str(group_key)
|
783
|
+
if group_key not in grouped_elements:
|
784
|
+
grouped_elements[group_key] = []
|
785
|
+
grouped_elements[group_key].append(element)
|
786
|
+
|
787
|
+
return grouped_elements
|
788
|
+
|
789
|
+
def _format_group_label(self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str) -> str:
|
790
|
+
"""Formats the label for a group based on the key and format string."""
|
791
|
+
if label_format:
|
792
|
+
try:
|
793
|
+
element_attrs = sample_element.__dict__.copy()
|
794
|
+
element_attrs[group_by_attr] = group_key # Ensure key is present
|
795
|
+
return label_format.format(**element_attrs)
|
796
|
+
except KeyError as e:
|
797
|
+
logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
|
798
|
+
return str(group_key)
|
799
|
+
except Exception as format_e:
|
800
|
+
logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
|
801
|
+
return str(group_key)
|
802
|
+
else:
|
803
|
+
return str(group_key)
|
804
|
+
|
805
|
+
def _get_element_highlight_params(self, element: T, include_attrs: Optional[List[str]]) -> Optional[Dict]:
|
806
|
+
"""Extracts common parameters needed for highlighting a single element."""
|
807
|
+
if not hasattr(element, 'page'): return None
|
808
|
+
page = element.page
|
809
|
+
|
810
|
+
base_data = {
|
811
|
+
'page_index': page.index,
|
812
|
+
'element': element,
|
813
|
+
'include_attrs': include_attrs,
|
814
|
+
'attributes_to_draw': {},
|
815
|
+
'bbox': None,
|
816
|
+
'polygon': None
|
817
|
+
}
|
818
|
+
|
819
|
+
# Extract geometry
|
820
|
+
is_polygon = getattr(element, 'has_polygon', False)
|
821
|
+
geom_data = None
|
822
|
+
if is_polygon:
|
823
|
+
geom_data = getattr(element, 'polygon', None)
|
824
|
+
if geom_data: base_data['polygon'] = geom_data
|
825
|
+
else:
|
826
|
+
geom_data = getattr(element, 'bbox', None)
|
827
|
+
if geom_data: base_data['bbox'] = geom_data
|
828
|
+
|
829
|
+
if not geom_data:
|
830
|
+
logger.warning(f"Cannot prepare highlight, no bbox or polygon found for element: {element}")
|
831
|
+
return None
|
832
|
+
|
833
|
+
# Extract attributes if requested
|
834
|
+
if include_attrs:
|
835
|
+
for attr_name in include_attrs:
|
836
|
+
try:
|
837
|
+
attr_value = getattr(element, attr_name, None)
|
838
|
+
if attr_value is not None:
|
839
|
+
base_data['attributes_to_draw'][attr_name] = attr_value
|
840
|
+
except AttributeError:
|
841
|
+
logger.warning(f"Attribute '{attr_name}' not found on element {element} for include_attrs")
|
842
|
+
|
843
|
+
return base_data
|
844
|
+
|
845
|
+
def viewer(self, title: Optional[str] = None) -> Optional['widgets.DOMWidget']:
    """
    Build an interactive viewer for the elements in this collection.

    The page of the first element is asked to create the viewer, receiving
    this collection's elements as ``elements_to_render``. All elements are
    assumed to be on that same page.

    Args:
        title: Optional title for the viewer window/widget. When omitted, a
            default title mentioning the element count is used.

    Returns:
        Whatever the page's ``viewer`` method returns, or None if the
        collection is empty, the page has no callable ``viewer``, or any
        error occurs (errors are logged, not raised).
    """
    if not self.elements:
        logger.warning("Cannot generate interactive viewer for empty collection.")
        return None

    try:
        page = self.elements[0].page
        page_viewer = getattr(page, 'viewer', None)
        if not callable(page_viewer):
            logger.error("Page object is missing the 'viewer' method.")
            return None

        default_title = f"Interactive Viewer for Collection ({len(self.elements)} elements)"
        # Hand only this collection's elements to the page-level viewer.
        return page_viewer(
            elements_to_render=self.elements,
            title=title or default_title,
        )
    except AttributeError:
        logger.error("Cannot generate interactive viewer: Elements in collection lack 'page' attribute.")
        return None
    except IndexError:
        # Should be caught by the empty check, but just in case
        logger.error("Cannot generate interactive viewer: Collection unexpectedly became empty.")
        return None
    except Exception as e:
        logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
        return None
|
884
|
+
|
885
|
+
class PageCollection(Generic[P]):
    """
    A collection of PDF pages with cross-page operations.

    This class provides methods for working with multiple pages, such as finding
    elements across pages, extracting text from page ranges, and more.
    """

    def __init__(self, pages: List[P]):
        """
        Initialize a page collection.

        Args:
            pages: List of Page objects
        """
        self.pages = pages

    def __len__(self) -> int:
        """Return the number of pages in the collection."""
        return len(self.pages)

    def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
        """Support indexing (returns a page) and slicing (returns a new PageCollection)."""
        if isinstance(idx, slice):
            return PageCollection(self.pages[idx])
        return self.pages[idx]

    def __iter__(self) -> Iterator[P]:
        """Support iteration over the pages."""
        return iter(self.pages)

    def __repr__(self) -> str:
        """Return a string representation showing the page count."""
        return f"<PageCollection(count={len(self)})>"

    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
        """
        Extract text from all pages in the collection.

        Args:
            keep_blank_chars: Whether to keep blank characters (default: True)
            apply_exclusions: Whether to apply exclusion regions (default: True)
            **kwargs: Additional extraction parameters passed to each page

        Returns:
            The per-page texts joined with a single newline, in page order
        """
        texts = [
            page.extract_text(
                keep_blank_chars=keep_blank_chars,
                apply_exclusions=apply_exclusions,
                **kwargs
            )
            for page in self.pages
        ]
        return "\n".join(texts)

    def apply_ocr(
        self,
        engine: Optional[str] = None,
        options: Optional[OCROptions] = None,
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,
        device: Optional[str] = None,
        # Add other simple mode args if needed
    ) -> 'PageCollection[P]':
        """
        Applies OCR to all pages within this collection using batch processing.

        This delegates the work to the parent PDF object's `apply_ocr_to_pages`
        method, passing the indices of the pages in this collection. The parent
        is taken from the first page; all pages are assumed to share it.

        Args:
            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
                    Uses manager's default if None. Ignored if 'options' is provided.
            options: A specific Options object (e.g., EasyOCROptions) for
                     advanced configuration. Overrides simple arguments.
            languages: List of language codes for simple mode.
            min_confidence: Minimum confidence threshold for simple mode.
            device: Device string ('cpu', 'cuda', etc.) for simple mode.

        Returns:
            Self for method chaining.

        Raises:
            RuntimeError: If pages in the collection lack a parent PDF object
                          or if the parent PDF object lacks the required
                          `apply_ocr_to_pages` method.
            (Propagates exceptions from PDF.apply_ocr_to_pages)
        """
        if not self.pages:
            logger.warning("Cannot apply OCR to an empty PageCollection.")
            return self

        # Assume all pages share the same parent PDF object
        parent_pdf = getattr(self.pages[0], '_parent', None)
        if not parent_pdf:
            raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

        if not callable(getattr(parent_pdf, 'apply_ocr_to_pages', None)):
            raise RuntimeError("Parent PDF object does not have the required 'apply_ocr_to_pages' method.")

        # Get the 0-based indices of the pages in this collection
        page_indices = [p.index for p in self.pages]

        logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

        # Delegate the batch call to the parent PDF object
        parent_pdf.apply_ocr_to_pages(
            pages=page_indices,
            engine=engine,
            options=options,
            languages=languages,
            min_confidence=min_confidence,
            device=device
            # Pass any other relevant simple_kwargs here if added
        )

        return self  # Return self for chaining

    def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
        """
        Find the first element matching the selector across all pages.

        Pages are searched in collection order and the search stops at the
        first page that yields a (truthy) match.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters

        Returns:
            First matching element or None
        """
        for page in self.pages:
            element = page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
            if element:
                return element
        return None

    def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
        """
        Find all elements matching the selector across all pages.

        Args:
            selector: CSS-like selector string
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection with matching elements from all pages, in page order
        """
        all_elements = []
        for page in self.pages:
            elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
            if elements:
                all_elements.extend(elements.elements)

        return ElementCollection(all_elements)

    # NOTE: a debug_ocr(output_path) method that delegated to
    # natural_pdf.utils.ocr.debug_ocr_to_html(self.pages, output_path) to build
    # an interactive HTML OCR report is currently disabled.

    @staticmethod
    def _locate_boundaries(elements, boundary_type, all_elements):
        """Describe each boundary element: its document index (-1 for virtual page boundaries), type and page."""
        boundaries = []
        for element in elements:
            try:
                idx = all_elements.index(element)
            except ValueError:
                # Not a real page element: keep it only if it is one of the
                # virtual page-break markers, otherwise ignore it.
                if not getattr(element, 'is_page_boundary', False):
                    continue
                idx = -1  # Special index for page boundaries
            boundaries.append({
                'index': idx,
                'element': element,
                'type': boundary_type,
                'page_idx': element.page.index
            })
        return boundaries

    @staticmethod
    def _section_to_page_end(start_element):
        """Create a region from the start element's top to the bottom of its page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        region.start_element = start_element
        return region

    @staticmethod
    def _multi_page_section(start_element, end_element, end_page, all_elements):
        """Create a region anchored on the start page that records the elements it spans up to end_page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        combined_region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        combined_region._spans_pages = True
        combined_region._page_range = (start_page.index, end_page.index)
        combined_region.start_element = start_element
        combined_region.end_element = end_element

        # First page: everything at or below the start element.
        combined_elements = [e for e in all_elements
                             if e.page == start_page and e.top >= start_element.top]

        # Middle pages (if any): everything.
        for page_idx in range(start_page.index + 1, end_page.index):
            combined_elements.extend(e for e in all_elements if e.page.index == page_idx)

        # Last page: everything ending at or above the end element; with no
        # end element the whole page is included.
        combined_elements.extend(
            e for e in all_elements
            if e.page == end_page and (end_element is None or e.bottom <= end_element.bottom)
        )

        combined_region._multi_page_elements = combined_elements
        return combined_region

    def get_sections(self,
                     start_elements=None,
                     end_elements=None,
                     new_section_on_page_break=False,
                     boundary_inclusion='both') -> List['Region']:
        """
        Extract sections from a page collection based on start/end elements.

        Boundaries are processed in document order (page index, then position).
        A start boundary opens a section; the next end boundary closes it. When
        no end elements are in play, each subsequent start boundary closes the
        previous section instead. A section still open after the last boundary
        is closed at the last element of the last page (when end elements are
        in play) or at the bottom of its own page (otherwise).

        The sequences passed as ``start_elements`` / ``end_elements`` are never
        modified.

        Args:
            start_elements: Elements or selector string that mark the start of sections
            end_elements: Elements or selector string that mark the end of sections
            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')

        Returns:
            List of Region objects representing the extracted sections
        """
        # Resolve selector strings into elements across all pages
        if isinstance(start_elements, str):
            start_elements = self.find_all(start_elements).elements

        if isinstance(end_elements, str):
            end_elements = self.find_all(end_elements).elements

        # If no start elements, return empty list
        if not start_elements:
            return []

        if new_section_on_page_break and len(self.pages) > 1:
            from natural_pdf.elements.region import Region

            # Work on copies so the caller's sequences are left untouched and
            # non-list iterables are accepted too.
            start_elements = list(start_elements)
            end_elements = [] if end_elements is None else list(end_elements)

            # For each page boundary, create virtual "end" and "start" markers
            for page, next_page in zip(self.pages, self.pages[1:]):
                bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
                bottom_region.is_page_boundary = True  # Mark it as a special boundary
                end_elements.append(bottom_region)

                top_region = Region(next_page, (0, 0, next_page.width, 1))
                top_region.is_page_boundary = True  # Mark it as a special boundary
                start_elements.append(top_region)

        # Get all elements from all pages and sort them in document order
        all_elements = []
        for page in self.pages:
            all_elements.extend(page.get_elements())
        all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

        # Mark section boundaries (starts first, so ties keep starts ahead of ends)
        section_boundaries = self._locate_boundaries(start_elements, 'start', all_elements)
        if end_elements:
            section_boundaries.extend(
                self._locate_boundaries(end_elements, 'end', all_elements)
            )

        def boundary_order(boundary):
            # Virtual page boundaries have no document index: a virtual start
            # sorts first on its page, a virtual end sorts last.
            position = boundary['index']
            if position == -1:
                position = 0 if boundary['type'] == 'start' else float('inf')
            return (boundary['page_idx'], position)

        section_boundaries.sort(key=boundary_order)

        # Generate sections
        sections = []
        current_start = None

        for boundary in section_boundaries:
            is_start = boundary['type'] == 'start'

            if current_start is None:
                # Nothing open: only a start boundary is meaningful here.
                if is_start:
                    current_start = boundary
                continue

            start_element = current_start['element']

            if not is_start:
                # End boundary closes the open section.
                end_element = boundary['element']
                if start_element.page == end_element.page:
                    section = start_element.page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    )
                else:
                    section = self._multi_page_section(
                        start_element, end_element, end_element.page, all_elements
                    )
                sections.append(section)
                current_start = None

            elif not end_elements:
                # Splitting by starts only: this start closes the previous section.
                if start_element.page == boundary['element'].page:
                    page_elements = [e for e in all_elements if e.page == start_element.page]
                    page_elements.sort(key=lambda e: (e.top, e.x0))

                    # Find the last element before the boundary
                    if boundary['element'] in page_elements:
                        end_idx = page_elements.index(boundary['element']) - 1
                    else:
                        end_idx = -1
                    end_element = page_elements[end_idx] if end_idx >= 0 else None

                    section = start_element.page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    )
                else:
                    # Cross-page: the section runs to the end of the start page
                    section = self._section_to_page_end(start_element)
                sections.append(section)
                current_start = boundary

        # Handle the last section if we have a current start
        if current_start is not None:
            start_element = current_start['element']
            start_page = start_element.page

            if end_elements:
                # With end_elements, we need an explicit end - use the last element
                # on the last page of the collection
                last_page = self.pages[-1]
                last_page_elements = [e for e in all_elements if e.page == last_page]
                last_page_elements.sort(key=lambda e: (e.top, e.x0))
                end_element = last_page_elements[-1] if last_page_elements else None

                if start_page == last_page:
                    # Simple case - both on same page
                    section = start_page.get_section_between(
                        start_element,
                        end_element,
                        boundary_inclusion
                    )
                else:
                    section = self._multi_page_section(
                        start_element, end_element, last_page, all_elements
                    )
                sections.append(section)
            else:
                # With start_elements only, create a section to the end of the current page
                sections.append(self._section_to_page_end(start_element))

        return sections
|