natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (141)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
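
The most consequential code change in this release is to ElementCollection.extract_text in natural_pdf/elements/collections.py (diffed below): it now pools character dictionaries from the collection's TextElement objects and, when layout=True is passed, routes them through pdfplumber's chars_to_textmap to approximate the original spacing. A minimal usage sketch, assuming the PDF / find_all entry points that appear elsewhere in this diff (the file path and selector are illustrative only):

    from natural_pdf import PDF

    pdf = PDF("example.pdf")        # illustrative path
    page = pdf.pages[0]
    words = page.find_all("text")   # returns an ElementCollection

    plain = words.extract_text()              # simple character join in document order
    spaced = words.extract_text(layout=True)  # layout-aware via pdfplumber's chars_to_textmap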
@@ -1,8 +1,27 @@
  import logging
-
- from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
+ from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Generic,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ TypeVar,
+ Union,
+ )
+
+ from pdfplumber.utils.geometry import objects_to_bbox
+
+ # New Imports
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+
+ from natural_pdf.elements.text import TextElement # Needed for isinstance check
  from natural_pdf.ocr import OCROptions
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import the new utility

  logger = logging.getLogger(__name__)

@@ -10,35 +29,36 @@ if TYPE_CHECKING:
  from natural_pdf.core.page import Page
  from natural_pdf.elements.region import Region

- T = TypeVar('T')
- P = TypeVar('P', bound='Page')
+ T = TypeVar("T")
+ P = TypeVar("P", bound="Page")
+

  class ElementCollection(Generic[T]):
  """
  Collection of PDF elements with batch operations.
  """
-
+
  def __init__(self, elements: List[T]):
  """
  Initialize a collection of elements.
-
+
  Args:
  elements: List of Element objects
  """
  self._elements = elements or []
-
+
  def __len__(self) -> int:
  """Get the number of elements in the collection."""
  return len(self._elements)
-
- def __getitem__(self, index: int) -> 'Element':
+
+ def __getitem__(self, index: int) -> "Element":
  """Get an element by index."""
  return self._elements[index]
-
+
  def __iter__(self):
  """Iterate over elements."""
  return iter(self._elements)
-
+
  def __repr__(self) -> str:
  """Return a string representation showing the element count."""
  element_type = "Mixed"
@@ -47,130 +67,130 @@ class ElementCollection(Generic[T]):
  if len(types) == 1:
  element_type = types.pop()
  return f"<ElementCollection[{element_type}](count={len(self)})>"
-
+
  @property
- def elements(self) -> List['Element']:
+ def elements(self) -> List["Element"]:
  """Get the elements in this collection."""
  return self._elements
-
+
  @property
- def first(self) -> Optional['Element']:
+ def first(self) -> Optional["Element"]:
  """Get the first element in the collection."""
  return self._elements[0] if self._elements else None
-
+
  @property
- def last(self) -> Optional['Element']:
+ def last(self) -> Optional["Element"]:
  """Get the last element in the collection."""
  return self._elements[-1] if self._elements else None
-
- def highest(self) -> Optional['Element']:
+
+ def highest(self) -> Optional["Element"]:
  """
  Get element with the smallest top y-coordinate (highest on page).
-
+
  Raises:
  ValueError: If elements are on multiple pages
-
+
  Returns:
  Element with smallest top value or None if empty
  """
  if not self._elements:
  return None
-
+
  # Check if elements are on multiple pages
  if self._are_on_multiple_pages():
  raise ValueError("Cannot determine highest element across multiple pages")
-
+
  return min(self._elements, key=lambda e: e.top)
-
- def lowest(self) -> Optional['Element']:
+
+ def lowest(self) -> Optional["Element"]:
  """
  Get element with the largest bottom y-coordinate (lowest on page).
-
+
  Raises:
  ValueError: If elements are on multiple pages
-
+
  Returns:
  Element with largest bottom value or None if empty
  """
  if not self._elements:
  return None
-
+
  # Check if elements are on multiple pages
  if self._are_on_multiple_pages():
  raise ValueError("Cannot determine lowest element across multiple pages")
-
+
  return max(self._elements, key=lambda e: e.bottom)
-
- def leftmost(self) -> Optional['Element']:
+
+ def leftmost(self) -> Optional["Element"]:
  """
  Get element with the smallest x0 coordinate (leftmost on page).
-
+
  Raises:
  ValueError: If elements are on multiple pages
-
+
  Returns:
  Element with smallest x0 value or None if empty
  """
  if not self._elements:
  return None
-
+
  # Check if elements are on multiple pages
  if self._are_on_multiple_pages():
  raise ValueError("Cannot determine leftmost element across multiple pages")
-
+
  return min(self._elements, key=lambda e: e.x0)
-
- def rightmost(self) -> Optional['Element']:
+
+ def rightmost(self) -> Optional["Element"]:
  """
  Get element with the largest x1 coordinate (rightmost on page).
-
+
  Raises:
  ValueError: If elements are on multiple pages
-
+
  Returns:
  Element with largest x1 value or None if empty
  """
  if not self._elements:
  return None
-
+
  # Check if elements are on multiple pages
  if self._are_on_multiple_pages():
  raise ValueError("Cannot determine rightmost element across multiple pages")
-
+
  return max(self._elements, key=lambda e: e.x1)
-
+
  def _are_on_multiple_pages(self) -> bool:
  """
  Check if elements in this collection span multiple pages.
-
+
  Returns:
  True if elements are on different pages, False otherwise
  """
  if not self._elements:
  return False
-
+
  # Get the page index of the first element
- if not hasattr(self._elements[0], 'page'):
+ if not hasattr(self._elements[0], "page"):
  return False
-
+
  first_page_idx = self._elements[0].page.index
-
+
  # Check if any element is on a different page
- return any(hasattr(e, 'page') and e.page.index != first_page_idx for e in self._elements)
-
- def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
+ return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
+
+ def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
  """
  Remove elements that are within any of the specified regions.
-
+
  Args:
  regions: List of Region objects to exclude
-
+
  Returns:
  New ElementCollection with filtered elements
  """
  if not regions:
  return ElementCollection(self._elements)
-
+
  filtered = []
  for element in self._elements:
  exclude = False
@@ -180,72 +200,156 @@ class ElementCollection(Generic[T]):
  break
  if not exclude:
  filtered.append(element)
-
+
  return ElementCollection(filtered)
-
+
  def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
  """
- Extract text from all elements in the collection.
-
+ Extract text from all TextElements in the collection, optionally using
+ pdfplumber's layout engine if layout=True is specified.
+
  Args:
- preserve_whitespace: Whether to keep blank characters (default: True)
- use_exclusions: Whether to apply exclusion regions (default: True)
- **kwargs: Additional extraction parameters
-
+ preserve_whitespace: Deprecated. Use layout=False for simple joining.
+ use_exclusions: Deprecated. Exclusions should be applied *before* creating
+ the collection or by filtering the collection itself.
+ **kwargs: Additional layout parameters passed directly to pdfplumber's
+ `chars_to_textmap` function ONLY if `layout=True` is passed.
+ See Page.extract_text docstring for common parameters.
+ If `layout=False` or omitted, performs a simple join.
+
  Returns:
- Combined text from all elements
+ Combined text from elements, potentially with layout-based spacing.
  """
- # Filter to just text-like elements
- text_elements = [e for e in self._elements if hasattr(e, 'extract_text')]
-
- # Sort elements in reading order (top-to-bottom, left-to-right)
- sorted_elements = sorted(text_elements, key=lambda e: (e.top, e.x0))
-
- # Extract text from each element
- texts = []
- for element in sorted_elements:
- # Extract text with new parameter names
- text = element.extract_text(preserve_whitespace=preserve_whitespace, use_exclusions=use_exclusions, **kwargs)
-
- if text:
- texts.append(text)
-
- return " ".join(texts)
-
- def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
+ # Filter to just TextElements that likely have _char_dicts
+ text_elements = [
+ el
+ for el in self._elements
+ if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
+ ]
+
+ if not text_elements:
+ return ""
+
+ # Collect all character dictionaries
+ all_char_dicts = []
+ for el in text_elements:
+ all_char_dicts.extend(getattr(el, "_char_dicts", []))
+
+ if not all_char_dicts:
+ # Handle case where elements exist but have no char dicts
+ logger.warning(
+ "ElementCollection.extract_text: No character dictionaries found in TextElements."
+ )
+ return " ".join(
+ getattr(el, "text", "") for el in text_elements
+ ) # Fallback to simple join of word text
+
+ # Check if layout is requested
+ use_layout = kwargs.get("layout", False)
+
+ if use_layout:
+ logger.debug("ElementCollection.extract_text: Using layout=True path.")
+ # Layout requested: Use chars_to_textmap
+
+ # Prepare layout kwargs
+ layout_kwargs = {}
+ allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
+ for key, value in kwargs.items():
+ if key in allowed_keys:
+ layout_kwargs[key] = value
+ layout_kwargs["layout"] = True # Ensure layout is True
+
+ # Calculate overall bbox for the elements used
+ collection_bbox = objects_to_bbox(all_char_dicts)
+ coll_x0, coll_top, coll_x1, coll_bottom = collection_bbox
+ coll_width = coll_x1 - coll_x0
+ coll_height = coll_bottom - coll_top
+
+ # Set layout parameters based on collection bounds
+ # Warn if collection is sparse? TBD.
+ if "layout_bbox" not in layout_kwargs:
+ layout_kwargs["layout_bbox"] = collection_bbox
+ if "layout_width" not in layout_kwargs:
+ layout_kwargs["layout_width"] = coll_width
+ if "layout_height" not in layout_kwargs:
+ layout_kwargs["layout_height"] = coll_height
+ # Set shifts relative to the collection's top-left
+ if "x_shift" not in layout_kwargs:
+ layout_kwargs["x_shift"] = coll_x0
+ if "y_shift" not in layout_kwargs:
+ layout_kwargs["y_shift"] = coll_top
+
+ try:
+ # Sort chars by document order (page, top, x0)
+ # Need page info on char dicts for multi-page collections
+ # Assuming char dicts have 'page_number' from element creation
+ all_char_dicts.sort(
+ key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+ )
+ textmap = chars_to_textmap(all_char_dicts, **layout_kwargs)
+ result = textmap.as_string
+ except Exception as e:
+ logger.error(
+ f"ElementCollection: Error calling chars_to_textmap: {e}", exc_info=True
+ )
+ logger.warning(
+ "ElementCollection: Falling back to simple text join due to layout error."
+ )
+ # Fallback sorting and joining
+ all_char_dicts.sort(
+ key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+ )
+ result = " ".join(c.get("text", "") for c in all_char_dicts)
+
+ else:
+ # Default: Simple join without layout
+ logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
+ # Sort chars by document order (page, top, x0)
+ all_char_dicts.sort(
+ key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+ )
+ # Simple join of character text
+ result = "".join(c.get("text", "") for c in all_char_dicts)
+ # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+
+ return result
+
+ def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
  """
  Filter elements using a function.
-
+
  Args:
  func: Function that takes an element and returns True to keep it
-
+
  Returns:
  New ElementCollection with filtered elements
  """
  return ElementCollection([e for e in self._elements if func(e)])
-
- def sort(self, key=None, reverse=False) -> 'ElementCollection':
+
+ def sort(self, key=None, reverse=False) -> "ElementCollection":
  """
  Sort elements by the given key function.
-
+
  Args:
  key: Function to generate a key for sorting
  reverse: Whether to sort in descending order
-
+
  Returns:
  Self for method chaining
  """
  self._elements.sort(key=key, reverse=reverse)
  return self
-
- def highlight(self,
- label: Optional[str] = None,
- color: Optional[Union[Tuple, str]] = None,
- group_by: Optional[str] = None,
- label_format: Optional[str] = None,
- distinct: bool = False,
- include_attrs: Optional[List[str]] = None,
- replace: bool = False) -> 'ElementCollection':
+
+ def highlight(
+ self,
+ label: Optional[str] = None,
+ color: Optional[Union[Tuple, str]] = None,
+ group_by: Optional[str] = None,
+ label_format: Optional[str] = None,
+ distinct: bool = False,
+ include_attrs: Optional[List[str]] = None,
+ replace: bool = False,
+ ) -> "ElementCollection":
  """
  Adds persistent highlights for all elements in the collection to the page
  via the HighlightingService.
@@ -294,17 +398,17 @@ class ElementCollection(Generic[T]):
294
398
  color=color,
295
399
  group_by=group_by,
296
400
  label_format=label_format,
297
- include_attrs=include_attrs
401
+ include_attrs=include_attrs,
298
402
  # 'replace' flag is handled during the add call below
299
403
  )
300
404
 
301
405
  # 2. Add prepared highlights to the persistent service
302
406
  if not highlight_data_list:
303
- return self # Nothing to add
407
+ return self # Nothing to add
304
408
 
305
409
  # Get page and highlighter from the first element (assume uniform page)
306
410
  first_element = self._elements[0]
307
- if not hasattr(first_element, 'page') or not hasattr(first_element.page, '_highlighter'):
411
+ if not hasattr(first_element, "page") or not hasattr(first_element.page, "_highlighter"):
308
412
  logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
309
413
  return self
310
414
 
@@ -317,42 +421,48 @@ class ElementCollection(Generic[T]):
317
421
  if replace:
318
422
  # Identify all unique page indices in this operation
319
423
  for data in highlight_data_list:
320
- pages_to_clear.add(data['page_index'])
424
+ pages_to_clear.add(data["page_index"])
321
425
  # Clear those pages *before* adding new highlights
322
- logger.debug(f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}")
426
+ logger.debug(
427
+ f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}"
428
+ )
323
429
  for page_idx in pages_to_clear:
324
430
  highlighter.clear_page(page_idx)
325
431
 
326
432
  for data in highlight_data_list:
327
433
  # Call the appropriate service add method
328
434
  add_args = {
329
- "page_index": data['page_index'],
330
- "color": data['color'], # Color determined by _prepare
331
- "label": data['label'], # Label determined by _prepare
332
- "use_color_cycling": data.get('use_color_cycling', False), # Set by _prepare if distinct
333
- "element": data['element'],
334
- "include_attrs": data['include_attrs'],
435
+ "page_index": data["page_index"],
436
+ "color": data["color"], # Color determined by _prepare
437
+ "label": data["label"], # Label determined by _prepare
438
+ "use_color_cycling": data.get(
439
+ "use_color_cycling", False
440
+ ), # Set by _prepare if distinct
441
+ "element": data["element"],
442
+ "include_attrs": data["include_attrs"],
335
443
  # Internal call to service always appends, as clearing was handled above
336
- "existing": 'append'
444
+ "existing": "append",
337
445
  }
338
- if data.get('polygon'):
339
- add_args["polygon"] = data['polygon']
446
+ if data.get("polygon"):
447
+ add_args["polygon"] = data["polygon"]
340
448
  highlighter.add_polygon(**add_args)
341
- elif data.get('bbox'):
342
- add_args["bbox"] = data['bbox']
449
+ elif data.get("bbox"):
450
+ add_args["bbox"] = data["bbox"]
343
451
  highlighter.add(**add_args)
344
452
  else:
345
453
  logger.warning(f"Skipping highlight data, no bbox or polygon found: {data}")
346
454
 
347
455
  return self
348
456
 
349
- def _prepare_highlight_data(self,
350
- distinct: bool = False,
351
- label: Optional[str] = None,
352
- color: Optional[Union[Tuple, str]] = None,
353
- group_by: Optional[str] = None,
354
- label_format: Optional[str] = None,
355
- include_attrs: Optional[List[str]] = None) -> List[Dict]:
457
+ def _prepare_highlight_data(
458
+ self,
459
+ distinct: bool = False,
460
+ label: Optional[str] = None,
461
+ color: Optional[Union[Tuple, str]] = None,
462
+ group_by: Optional[str] = None,
463
+ label_format: Optional[str] = None,
464
+ include_attrs: Optional[List[str]] = None,
465
+ ) -> List[Dict]:
356
466
  """
357
467
  Determines the parameters for highlighting each element based on the strategy.
358
468
 
@@ -364,58 +474,64 @@ class ElementCollection(Generic[T]):
364
474
  Color and label determination happens here.
365
475
  """
366
476
  prepared_data = []
367
- if not self._elements: return prepared_data
477
+ if not self._elements:
478
+ return prepared_data
368
479
 
369
480
  # Need access to the HighlightingService to determine colors correctly.
370
481
  highlighter = None
371
482
  first_element = self._elements[0]
372
- if hasattr(first_element, 'page') and hasattr(first_element.page, '_highlighter'):
483
+ if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
373
484
  highlighter = first_element.page._highlighter
374
485
  else:
375
- logger.warning("Cannot determine highlight colors: HighlightingService not accessible from elements.")
486
+ logger.warning(
487
+ "Cannot determine highlight colors: HighlightingService not accessible from elements."
488
+ )
376
489
  return []
377
490
 
378
491
  if distinct:
379
492
  logger.debug("_prepare: Distinct highlighting strategy.")
380
493
  for element in self._elements:
381
494
  # Call the service's color determination logic
382
- final_color = highlighter._determine_highlight_color(label=None, color_input=None, use_color_cycling=True)
495
+ final_color = highlighter._determine_highlight_color(
496
+ label=None, color_input=None, use_color_cycling=True
497
+ )
383
498
  element_data = self._get_element_highlight_params(element, include_attrs)
384
499
  if element_data:
385
- element_data.update({
386
- 'color': final_color,
387
- 'label': None,
388
- 'use_color_cycling': True
389
- })
500
+ element_data.update(
501
+ {"color": final_color, "label": None, "use_color_cycling": True}
502
+ )
390
503
  prepared_data.append(element_data)
391
504
 
392
505
  elif label is not None:
393
506
  logger.debug(f"_prepare: Explicit label '{label}' strategy.")
394
- final_color = highlighter._determine_highlight_color(label=label, color_input=color, use_color_cycling=False)
507
+ final_color = highlighter._determine_highlight_color(
508
+ label=label, color_input=color, use_color_cycling=False
509
+ )
395
510
  for element in self._elements:
396
511
  element_data = self._get_element_highlight_params(element, include_attrs)
397
512
  if element_data:
398
- element_data.update({
399
- 'color': final_color,
400
- 'label': label
401
- })
513
+ element_data.update({"color": final_color, "label": label})
402
514
  prepared_data.append(element_data)
403
515
 
404
516
  elif group_by is not None:
405
517
  logger.debug("_prepare: Grouping by attribute strategy.")
406
518
  grouped_elements = self._group_elements_by_attr(group_by)
407
519
  for group_key, group_elements in grouped_elements.items():
408
- if not group_elements: continue
409
- group_label = self._format_group_label(group_key, label_format, group_elements[0], group_by)
410
- final_color = highlighter._determine_highlight_color(label=group_label, color_input=None, use_color_cycling=False)
411
- logger.debug(f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}")
520
+ if not group_elements:
521
+ continue
522
+ group_label = self._format_group_label(
523
+ group_key, label_format, group_elements[0], group_by
524
+ )
525
+ final_color = highlighter._determine_highlight_color(
526
+ label=group_label, color_input=None, use_color_cycling=False
527
+ )
528
+ logger.debug(
529
+ f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
530
+ )
412
531
  for element in group_elements:
413
532
  element_data = self._get_element_highlight_params(element, include_attrs)
414
533
  if element_data:
415
- element_data.update({
416
- 'color': final_color,
417
- 'label': group_label
418
- })
534
+ element_data.update({"color": final_color, "label": group_label})
419
535
  prepared_data.append(element_data)
420
536
  else:
421
537
  logger.debug("_prepare: Default grouping strategy.")
@@ -423,15 +539,21 @@ class ElementCollection(Generic[T]):
423
539
 
424
540
  if len(element_types) == 1:
425
541
  type_name = element_types.pop()
426
- base_name = type_name.replace("Element", "").replace("Region", "") if type_name != "Region" else "Region"
542
+ base_name = (
543
+ type_name.replace("Element", "").replace("Region", "")
544
+ if type_name != "Region"
545
+ else "Region"
546
+ )
427
547
  auto_label = f"{base_name} Elements" if base_name else "Elements"
428
548
  # Determine color *before* logging or using it
429
- final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
549
+ final_color = highlighter._determine_highlight_color(
550
+ label=auto_label, color_input=color, use_color_cycling=False
551
+ )
430
552
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
431
553
  for element in self._elements:
432
554
  element_data = self._get_element_highlight_params(element, include_attrs)
433
555
  if element_data:
434
- element_data.update({'color': final_color, 'label': auto_label})
556
+ element_data.update({"color": final_color, "label": auto_label})
435
557
  prepared_data.append(element_data)
436
558
  else:
437
559
  # Mixed types: Generate generic label and warn
@@ -442,26 +564,33 @@ class ElementCollection(Generic[T]):
442
564
  f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
443
565
  f"or 'distinct=True' for more specific highlighting."
444
566
  )
445
- final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
567
+ final_color = highlighter._determine_highlight_color(
568
+ label=auto_label, color_input=color, use_color_cycling=False
569
+ )
446
570
  # Determine color *before* logging or using it (already done above for this branch)
447
571
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
448
572
  for element in self._elements:
449
573
  element_data = self._get_element_highlight_params(element, include_attrs)
450
574
  if element_data:
451
- element_data.update({'color': final_color, 'label': auto_label})
575
+ element_data.update({"color": final_color, "label": auto_label})
452
576
  prepared_data.append(element_data)
453
577
 
454
578
  return prepared_data
455
579
 
456
- def _call_element_highlighter(self, element: T,
457
- color: Optional[Union[Tuple, str]],
458
- label: Optional[str],
459
- use_color_cycling: bool,
460
- include_attrs: Optional[List[str]],
461
- existing: str):
580
+ def _call_element_highlighter(
581
+ self,
582
+ element: T,
583
+ color: Optional[Union[Tuple, str]],
584
+ label: Optional[str],
585
+ use_color_cycling: bool,
586
+ include_attrs: Optional[List[str]],
587
+ existing: str,
588
+ ):
462
589
  """Low-level helper to call the appropriate HighlightingService method for an element."""
463
- if not hasattr(element, 'page') or not hasattr(element.page, '_highlighter'):
464
- logger.warning(f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}")
590
+ if not hasattr(element, "page") or not hasattr(element.page, "_highlighter"):
591
+ logger.warning(
592
+ f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}"
593
+ )
465
594
  return
466
595
 
467
596
  page = element.page
@@ -472,59 +601,68 @@ class ElementCollection(Generic[T]):
472
601
  "use_color_cycling": use_color_cycling,
473
602
  "include_attrs": include_attrs,
474
603
  "existing": existing,
475
- "element": element
604
+ "element": element,
476
605
  }
477
606
 
478
- is_polygon = getattr(element, 'has_polygon', False)
607
+ is_polygon = getattr(element, "has_polygon", False)
479
608
  geom_data = None
480
609
  add_method = None
481
610
 
482
611
  if is_polygon:
483
- geom_data = getattr(element, 'polygon', None)
612
+ geom_data = getattr(element, "polygon", None)
484
613
  if geom_data:
485
- args_for_highlighter['polygon'] = geom_data
614
+ args_for_highlighter["polygon"] = geom_data
486
615
  add_method = page._highlighter.add_polygon
487
616
  else:
488
- geom_data = getattr(element, 'bbox', None)
617
+ geom_data = getattr(element, "bbox", None)
489
618
  if geom_data:
490
- args_for_highlighter['bbox'] = geom_data
619
+ args_for_highlighter["bbox"] = geom_data
491
620
  add_method = page._highlighter.add
492
621
 
493
622
  if add_method and geom_data:
494
623
  try:
495
624
  add_method(**args_for_highlighter)
496
625
  except Exception as e:
497
- logger.error(f"Error calling highlighter method for element {element} on page {page.index}: {e}", exc_info=True)
626
+ logger.error(
627
+ f"Error calling highlighter method for element {element} on page {page.index}: {e}",
628
+ exc_info=True,
629
+ )
498
630
  elif not geom_data:
499
631
  logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")
500
632
 
501
- def _highlight_as_single_group(self, label: str,
502
- color: Optional[Union[Tuple, str]],
503
- include_attrs: Optional[List[str]],
504
- existing: str):
633
+ def _highlight_as_single_group(
634
+ self,
635
+ label: str,
636
+ color: Optional[Union[Tuple, str]],
637
+ include_attrs: Optional[List[str]],
638
+ existing: str,
639
+ ):
505
640
  """Highlights all elements with the same explicit label and color."""
506
641
  for element in self._elements:
507
642
  self._call_element_highlighter(
508
643
  element=element,
509
- color=color, # Use explicit color if provided
510
- label=label, # Use the explicit group label
511
- use_color_cycling=False, # Use consistent color for the label
644
+ color=color, # Use explicit color if provided
645
+ label=label, # Use the explicit group label
646
+ use_color_cycling=False, # Use consistent color for the label
512
647
  include_attrs=include_attrs,
513
- existing=existing
648
+ existing=existing,
514
649
  )
515
650
 
516
- def _highlight_grouped_by_attribute(self, group_by: str,
517
- label_format: Optional[str],
518
- include_attrs: Optional[List[str]],
519
- existing: str):
651
+ def _highlight_grouped_by_attribute(
652
+ self,
653
+ group_by: str,
654
+ label_format: Optional[str],
655
+ include_attrs: Optional[List[str]],
656
+ existing: str,
657
+ ):
520
658
  """Groups elements by attribute and highlights each group distinctly."""
521
659
  grouped_elements: Dict[Any, List[T]] = {}
522
660
  # Group elements by the specified attribute value
523
661
  for element in self._elements:
524
662
  try:
525
663
  group_key = getattr(element, group_by, None)
526
- if group_key is None: # Handle elements missing the attribute
527
- group_key = f"Missing '{group_by}'"
664
+ if group_key is None: # Handle elements missing the attribute
665
+ group_key = f"Missing '{group_by}'"
528
666
  # Ensure group_key is hashable (convert list/dict if necessary)
529
667
  if isinstance(group_key, (list, dict)):
530
668
  group_key = str(group_key)
@@ -533,41 +671,49 @@ class ElementCollection(Generic[T]):
533
671
  grouped_elements[group_key] = []
534
672
  grouped_elements[group_key].append(element)
535
673
  except AttributeError:
536
- logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
537
- group_key = f"Error accessing '{group_by}'"
538
- if group_key not in grouped_elements:
539
- grouped_elements[group_key] = []
540
- grouped_elements[group_key].append(element)
541
- except TypeError: # Handle unhashable types
542
- logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
543
- group_key = str(group_key)
544
- if group_key not in grouped_elements:
545
- grouped_elements[group_key] = []
546
- grouped_elements[group_key].append(element)
547
-
674
+ logger.warning(
675
+ f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
676
+ )
677
+ group_key = f"Error accessing '{group_by}'"
678
+ if group_key not in grouped_elements:
679
+ grouped_elements[group_key] = []
680
+ grouped_elements[group_key].append(element)
681
+ except TypeError: # Handle unhashable types
682
+ logger.warning(
683
+ f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
684
+ )
685
+ group_key = str(group_key)
686
+ if group_key not in grouped_elements:
687
+ grouped_elements[group_key] = []
688
+ grouped_elements[group_key].append(element)
548
689
 
549
690
  # Highlight each group
550
691
  for group_key, group_elements in grouped_elements.items():
551
- if not group_elements: continue
692
+ if not group_elements:
693
+ continue
552
694
 
553
695
  # Determine the label for this group
554
- first_element = group_elements[0] # Use first element for formatting
696
+ first_element = group_elements[0] # Use first element for formatting
555
697
  group_label = None
556
698
  if label_format:
557
699
  try:
558
700
  # Create a dict of element attributes for formatting
559
- element_attrs = first_element.__dict__.copy() # Start with element's dict
701
+ element_attrs = first_element.__dict__.copy() # Start with element's dict
560
702
  # Ensure the group_by key itself is present correctly
561
703
  element_attrs[group_by] = group_key
562
704
  group_label = label_format.format(**element_attrs)
563
705
  except KeyError as e:
564
- logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
706
+ logger.warning(
707
+ f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
708
+ )
565
709
  group_label = str(group_key)
566
710
  except Exception as format_e:
567
- logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
711
+ logger.warning(
712
+ f"Error formatting label '{label_format}': {format_e}. Using group key as label."
713
+ )
568
714
  group_label = str(group_key)
569
715
  else:
570
- group_label = str(group_key) # Use the attribute value as label
716
+ group_label = str(group_key) # Use the attribute value as label
571
717
 
572
718
  logger.debug(f" Highlighting group '{group_label}' ({len(group_elements)} elements)")
573
719
 
@@ -575,11 +721,11 @@ class ElementCollection(Generic[T]):
575
721
  for element in group_elements:
576
722
  self._call_element_highlighter(
577
723
  element=element,
578
- color=None, # Let ColorManager choose based on label
579
- label=group_label, # Use the derived group label
580
- use_color_cycling=False, # Use consistent color for the label
724
+ color=None, # Let ColorManager choose based on label
725
+ label=group_label, # Use the derived group label
726
+ use_color_cycling=False, # Use consistent color for the label
581
727
  include_attrs=include_attrs,
582
- existing=existing
728
+ existing=existing,
583
729
  )
584
730
 
585
731
  def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
@@ -589,116 +735,122 @@ class ElementCollection(Generic[T]):
589
735
  for element in self._elements:
590
736
  self._call_element_highlighter(
591
737
  element=element,
592
- color=None, # Let ColorManager cycle
593
- label=None, # No label for distinct elements
594
- use_color_cycling=True, # Force cycling
738
+ color=None, # Let ColorManager cycle
739
+ label=None, # No label for distinct elements
740
+ use_color_cycling=True, # Force cycling
595
741
  include_attrs=include_attrs,
596
- existing=existing
742
+ existing=existing,
743
+ )
744
+
745
+ def show(
746
+ self,
747
+ # --- Visualization Parameters ---
748
+ group_by: Optional[str] = None,
749
+ label: Optional[str] = None,
750
+ color: Optional[Union[Tuple, str]] = None,
751
+ label_format: Optional[str] = None,
752
+ distinct: bool = False,
753
+ include_attrs: Optional[List[str]] = None,
754
+ # --- Rendering Parameters ---
755
+ scale: float = 2.0,
756
+ labels: bool = True, # Use 'labels' consistent with service
757
+ legend_position: str = "right",
758
+ render_ocr: bool = False,
759
+ ) -> Optional["Image.Image"]:
760
+ """
761
+ Generates a temporary preview image highlighting elements in this collection
762
+ on their page, ignoring any persistent highlights.
763
+
764
+ Currently only supports collections where all elements are on the same page.
765
+
766
+ Allows grouping and coloring elements based on attributes, similar to the
767
+ persistent `highlight()` method, but only for this temporary view.
768
+
769
+ Args:
770
+ group_by: Attribute name to group elements by for distinct colors/labels.
771
+ label: Explicit label for all elements (overrides group_by).
772
+ color: Explicit color for all elements (if label used) or base color.
773
+ label_format: F-string to format group labels if group_by is used.
774
+ distinct: Highlight each element distinctly (overrides group_by/label).
775
+ include_attrs: Attributes to display on individual highlights.
776
+ scale: Scale factor for rendering image.
777
+ labels: Whether to include a legend for the temporary highlights.
778
+ legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
779
+ render_ocr: Whether to render OCR text.
780
+
781
+ Returns:
782
+ PIL Image object of the temporary preview, or None if rendering fails or
783
+ elements span multiple pages.
784
+
785
+ Raises:
786
+ ValueError: If the collection is empty or elements are on different pages.
787
+ """
788
+ if not self._elements:
789
+ raise ValueError("Cannot show an empty collection.")
790
+
791
+ # Check if elements are on multiple pages
792
+ if self._are_on_multiple_pages():
793
+ raise ValueError(
794
+ "show() currently only supports collections where all elements are on the same page."
795
+ )
796
+
797
+ # Get the page and highlighting service from the first element
798
+ first_element = self._elements[0]
799
+ if not hasattr(first_element, "page") or not first_element.page:
800
+ logger.warning("Cannot show collection: First element has no associated page.")
801
+ return None
802
+ page = first_element.page
803
+ if not hasattr(page, "pdf") or not page.pdf:
804
+ logger.warning("Cannot show collection: Page has no associated PDF object.")
805
+ return None
806
+
807
+ service = page._highlighter
808
+ if not service:
809
+ logger.warning("Cannot show collection: PDF object has no highlighting service.")
810
+ return None
811
+
812
+ # 1. Prepare temporary highlight data based on grouping parameters
813
+ # This returns a list of dicts, suitable for render_preview
814
+ highlight_data_list = self._prepare_highlight_data(
815
+ distinct=distinct,
816
+ label=label,
817
+ color=color,
818
+ group_by=group_by,
819
+ label_format=label_format,
820
+ include_attrs=include_attrs,
821
+ )
822
+
823
+ if not highlight_data_list:
824
+ logger.warning("No highlight data generated for show(). Rendering clean page.")
825
+ # Render the page without any temporary highlights
826
+ highlight_data_list = []
827
+
828
+ # 2. Call render_preview on the HighlightingService
829
+ try:
830
+ return service.render_preview(
831
+ page_index=page.index,
832
+ temporary_highlights=highlight_data_list,
833
+ scale=scale,
834
+ labels=labels, # Use 'labels'
835
+ legend_position=legend_position,
836
+ render_ocr=render_ocr,
597
837
  )
598
-
599
- def show(self,
600
- # --- Visualization Parameters ---
601
- group_by: Optional[str] = None,
602
- label: Optional[str] = None,
603
- color: Optional[Union[Tuple, str]] = None,
604
- label_format: Optional[str] = None,
605
- distinct: bool = False,
606
- include_attrs: Optional[List[str]] = None,
607
- # --- Rendering Parameters ---
608
- scale: float = 2.0,
609
- labels: bool = True, # Use 'labels' consistent with service
610
- legend_position: str = 'right',
611
- render_ocr: bool = False) -> Optional['Image.Image']:
612
- """
613
- Generates a temporary preview image highlighting elements in this collection
614
- on their page, ignoring any persistent highlights.
615
-
616
- Currently only supports collections where all elements are on the same page.
617
-
618
- Allows grouping and coloring elements based on attributes, similar to the
619
- persistent `highlight()` method, but only for this temporary view.
620
-
621
- Args:
622
- group_by: Attribute name to group elements by for distinct colors/labels.
623
- label: Explicit label for all elements (overrides group_by).
624
- color: Explicit color for all elements (if label used) or base color.
625
- label_format: F-string to format group labels if group_by is used.
626
- distinct: Highlight each element distinctly (overrides group_by/label).
627
- include_attrs: Attributes to display on individual highlights.
628
- scale: Scale factor for rendering image.
629
- labels: Whether to include a legend for the temporary highlights.
630
- legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
631
- render_ocr: Whether to render OCR text.
632
-
633
- Returns:
634
- PIL Image object of the temporary preview, or None if rendering fails or
635
- elements span multiple pages.
636
-
637
- Raises:
638
- ValueError: If the collection is empty or elements are on different pages.
639
- """
640
- if not self._elements:
641
- raise ValueError("Cannot show an empty collection.")
642
-
643
- # Check if elements are on multiple pages
644
- if self._are_on_multiple_pages():
645
- raise ValueError("show() currently only supports collections where all elements are on the same page.")
646
-
647
- # Get the page and highlighting service from the first element
648
- first_element = self._elements[0]
649
- if not hasattr(first_element, 'page') or not first_element.page:
650
- logger.warning("Cannot show collection: First element has no associated page.")
651
- return None
652
- page = first_element.page
653
- if not hasattr(page, 'pdf') or not page.pdf:
654
- logger.warning("Cannot show collection: Page has no associated PDF object.")
655
- return None
656
-
657
- service = page._highlighter
658
- if not service:
659
- logger.warning("Cannot show collection: PDF object has no highlighting service.")
660
- return None
661
-
662
- # 1. Prepare temporary highlight data based on grouping parameters
663
- # This returns a list of dicts, suitable for render_preview
664
- highlight_data_list = self._prepare_highlight_data(
665
- distinct=distinct,
666
- label=label,
667
- color=color,
668
- group_by=group_by,
669
- label_format=label_format,
670
- include_attrs=include_attrs
671
- )
672
-
673
- if not highlight_data_list:
674
- logger.warning("No highlight data generated for show(). Rendering clean page.")
675
- # Render the page without any temporary highlights
676
- highlight_data_list = []
677
-
678
- # 2. Call render_preview on the HighlightingService
679
- try:
680
- return service.render_preview(
681
- page_index=page.index,
682
- temporary_highlights=highlight_data_list,
683
- scale=scale,
684
- labels=labels, # Use 'labels'
685
- legend_position=legend_position,
686
- render_ocr=render_ocr
687
- )
688
- except Exception as e:
689
- logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
690
- return None
691
-
692
- def save(self,
693
- filename: str,
694
- scale: float = 2.0,
695
- width: Optional[int] = None,
696
- labels: bool = True,
697
- legend_position: str = 'right',
698
- render_ocr: bool = False) -> 'ElementCollection':
838
+ except Exception as e:
839
+ logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
840
+ return None
841
+
842
+ def save(
843
+ self,
844
+ filename: str,
845
+ scale: float = 2.0,
846
+ width: Optional[int] = None,
847
+ labels: bool = True,
848
+ legend_position: str = "right",
849
+ render_ocr: bool = False,
850
+ ) -> "ElementCollection":
699
851
  """
700
852
  Save the page with this collection's elements highlighted to an image file.
701
-
853
+
702
854
  Args:
703
855
  filename: Path to save the image to
704
856
  scale: Scale factor for rendering
@@ -706,32 +858,34 @@ class ElementCollection(Generic[T]):
706
858
  labels: Whether to include a legend for labels
707
859
  legend_position: Position of the legend
708
860
  render_ocr: Whether to render OCR text with white background boxes
709
-
861
+
710
862
  Returns:
711
863
  Self for method chaining
712
864
  """
713
865
  # Use to_image to generate and save the image
714
866
  self.to_image(
715
- path=filename,
867
+ path=filename,
716
868
  scale=scale,
717
869
  width=width,
718
- labels=labels,
870
+ labels=labels,
719
871
  legend_position=legend_position,
720
- render_ocr=render_ocr
872
+ render_ocr=render_ocr,
721
873
  )
722
874
  return self
723
-
724
- def to_image(self,
725
- path: Optional[str] = None,
726
- scale: float = 2.0,
727
- width: Optional[int] = None,
728
- labels: bool = True,
729
- legend_position: str = 'right',
730
- render_ocr: bool = False) -> Optional['Image.Image']:
875
+
876
+ def to_image(
877
+ self,
878
+ path: Optional[str] = None,
879
+ scale: float = 2.0,
880
+ width: Optional[int] = None,
881
+ labels: bool = True,
882
+ legend_position: str = "right",
883
+ render_ocr: bool = False,
884
+ ) -> Optional["Image.Image"]:
731
885
  """
732
886
  Generate an image of the page with this collection's elements highlighted,
733
887
  optionally saving it to a file.
734
-
888
+
735
889
  Args:
736
890
  path: Optional path to save the image to
737
891
  scale: Scale factor for rendering
@@ -739,21 +893,21 @@ class ElementCollection(Generic[T]):
739
893
  labels: Whether to include a legend for labels
740
894
  legend_position: Position of the legend
741
895
  render_ocr: Whether to render OCR text with white background boxes
742
-
896
+
743
897
  Returns:
744
898
  PIL Image of the page with elements highlighted, or None if no valid page
745
899
  """
746
900
  # Get the page from the first element (if available)
747
- if self._elements and hasattr(self._elements[0], 'page'):
901
+ if self._elements and hasattr(self._elements[0], "page"):
748
902
  page = self._elements[0].page
749
903
  # Generate the image using to_image
750
904
  return page.to_image(
751
- path=path,
905
+ path=path,
752
906
  scale=scale,
753
907
  width=width,
754
- labels=labels,
908
+ labels=labels,
755
909
  legend_position=legend_position,
756
- render_ocr=render_ocr
910
+ render_ocr=render_ocr,
757
911
  )
758
912
  return None
759
913
 
@@ -763,7 +917,7 @@ class ElementCollection(Generic[T]):
763
917
  for element in self._elements:
764
918
  try:
765
919
  group_key = getattr(element, group_by, None)
766
- if group_key is None: # Handle elements missing the attribute
920
+ if group_key is None: # Handle elements missing the attribute
767
921
  group_key = f"Missing '{group_by}'"
768
922
  # Ensure group_key is hashable (convert list/dict if necessary)
769
923
  if isinstance(group_key, (list, dict)):
@@ -773,13 +927,17 @@ class ElementCollection(Generic[T]):
773
927
  grouped_elements[group_key] = []
774
928
  grouped_elements[group_key].append(element)
775
929
  except AttributeError:
776
- logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
930
+ logger.warning(
931
+ f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
932
+ )
777
933
  group_key = f"Error accessing '{group_by}'"
778
934
  if group_key not in grouped_elements:
779
935
  grouped_elements[group_key] = []
780
936
  grouped_elements[group_key].append(element)
781
- except TypeError: # Handle unhashable types
782
- logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
937
+ except TypeError: # Handle unhashable types
938
+ logger.warning(
939
+ f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
940
+ )
783
941
  group_key = str(group_key)
784
942
  if group_key not in grouped_elements:
785
943
  grouped_elements[group_key] = []
@@ -787,48 +945,61 @@ class ElementCollection(Generic[T]):
787
945
 
788
946
  return grouped_elements
789
947
 
790
- def _format_group_label(self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str) -> str:
948
+ def _format_group_label(
949
+ self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
950
+ ) -> str:
791
951
  """Formats the label for a group based on the key and format string."""
792
952
  if label_format:
793
953
  try:
794
954
  element_attrs = sample_element.__dict__.copy()
795
- element_attrs[group_by_attr] = group_key # Ensure key is present
955
+ element_attrs[group_by_attr] = group_key # Ensure key is present
796
956
  return label_format.format(**element_attrs)
797
957
  except KeyError as e:
798
- logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
958
+ logger.warning(
959
+ f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
960
+ )
799
961
  return str(group_key)
800
962
  except Exception as format_e:
801
- logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
963
+ logger.warning(
964
+ f"Error formatting label '{label_format}': {format_e}. Using group key as label."
965
+ )
802
966
  return str(group_key)
803
967
  else:
804
968
  return str(group_key)
805
969
 
806
- def _get_element_highlight_params(self, element: T, include_attrs: Optional[List[str]]) -> Optional[Dict]:
970
+ def _get_element_highlight_params(
971
+ self, element: T, include_attrs: Optional[List[str]]
972
+ ) -> Optional[Dict]:
807
973
  """Extracts common parameters needed for highlighting a single element."""
808
- if not hasattr(element, 'page'): return None
974
+ if not hasattr(element, "page"):
975
+ return None
809
976
  page = element.page
810
977
 
811
978
  base_data = {
812
- 'page_index': page.index,
813
- 'element': element,
814
- 'include_attrs': include_attrs,
815
- 'attributes_to_draw': {},
816
- 'bbox': None,
817
- 'polygon': None
979
+ "page_index": page.index,
980
+ "element": element,
981
+ "include_attrs": include_attrs,
982
+ "attributes_to_draw": {},
983
+ "bbox": None,
984
+ "polygon": None,
818
985
  }
819
986
 
820
987
  # Extract geometry
821
- is_polygon = getattr(element, 'has_polygon', False)
988
+ is_polygon = getattr(element, "has_polygon", False)
822
989
  geom_data = None
823
990
  if is_polygon:
824
- geom_data = getattr(element, 'polygon', None)
825
- if geom_data: base_data['polygon'] = geom_data
991
+ geom_data = getattr(element, "polygon", None)
992
+ if geom_data:
993
+ base_data["polygon"] = geom_data
826
994
  else:
827
- geom_data = getattr(element, 'bbox', None)
828
- if geom_data: base_data['bbox'] = geom_data
995
+ geom_data = getattr(element, "bbox", None)
996
+ if geom_data:
997
+ base_data["bbox"] = geom_data
829
998
 
830
999
  if not geom_data:
831
- logger.warning(f"Cannot prepare highlight, no bbox or polygon found for element: {element}")
1000
+ logger.warning(
1001
+ f"Cannot prepare highlight, no bbox or polygon found for element: {element}"
1002
+ )
832
1003
  return None
833
1004
 
834
1005
  # Extract attributes if requested
@@ -837,13 +1008,15 @@ class ElementCollection(Generic[T]):
837
1008
  try:
838
1009
  attr_value = getattr(element, attr_name, None)
839
1010
  if attr_value is not None:
840
- base_data['attributes_to_draw'][attr_name] = attr_value
1011
+ base_data["attributes_to_draw"][attr_name] = attr_value
841
1012
  except AttributeError:
842
- logger.warning(f"Attribute '{attr_name}' not found on element {element} for include_attrs")
1013
+ logger.warning(
1014
+ f"Attribute '{attr_name}' not found on element {element} for include_attrs"
1015
+ )
843
1016
 
844
1017
  return base_data
845
1018
 
846
- def viewer(self, title: Optional[str] = None) -> Optional['widgets.DOMWidget']:
1019
+ def viewer(self, title: Optional[str] = None) -> Optional["widgets.DOMWidget"]:
847
1020
  """
848
1021
  Creates and returns an interactive ipywidget showing ONLY the elements
849
1022
  in this collection on their page background.
@@ -862,28 +1035,36 @@ class ElementCollection(Generic[T]):
862
1035
  try:
863
1036
  page = self.elements[0].page
864
1037
  # Check if the page object actually has the method
865
- if hasattr(page, 'viewer') and callable(page.viewer):
866
- final_title = title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
1038
+ if hasattr(page, "viewer") and callable(page.viewer):
1039
+ final_title = (
1040
+ title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
1041
+ )
867
1042
  # Call the page method, passing this collection's elements
868
1043
  return page.viewer(
869
1044
  elements_to_render=self.elements,
870
- title=final_title # Pass title if Page method accepts it
1045
+ title=final_title, # Pass title if Page method accepts it
871
1046
  )
872
1047
  else:
873
- logger.error("Page object is missing the 'viewer' method.")
874
- return None
1048
+ logger.error("Page object is missing the 'viewer' method.")
1049
+ return None
875
1050
  except AttributeError:
876
- logger.error("Cannot generate interactive viewer: Elements in collection lack 'page' attribute.")
1051
+ logger.error(
1052
+ "Cannot generate interactive viewer: Elements in collection lack 'page' attribute."
1053
+ )
877
1054
  return None
878
1055
  except IndexError:
879
- # Should be caught by the empty check, but just in case
880
- logger.error("Cannot generate interactive viewer: Collection unexpectedly became empty.")
881
- return None
1056
+ # Should be caught by the empty check, but just in case
1057
+ logger.error(
1058
+ "Cannot generate interactive viewer: Collection unexpectedly became empty."
1059
+ )
1060
+ return None
882
1061
  except Exception as e:
883
- logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
884
- return None
1062
+ logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
1063
+ return None
885
1064
 
886
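Per the `viewer()` docstring above, the collection delegates rendering to the first element's page and shows only the collection's own elements. A minimal sketch (the `PDF` loader, file path, and selector are assumptions; a Jupyter session with ipywidgets is required):

```python
# Sketch under assumptions: top-level PDF loader, Jupyter, ipywidgets installed.
from natural_pdf import PDF

pdf = PDF("example.pdf")  # hypothetical path
headings = pdf.pages[0].find_all("text[size>=14]")

# Shows ONLY the matched elements on their page background;
# returns None (and logs an error) if the page has no viewer support.
headings.viewer(title="Headings on page 1")
```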
- def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
+ def find_all(
+ self, selector: str, regex: bool = False, case: bool = True, **kwargs
+ ) -> "ElementCollection[T]":
  """
  Filter elements within this collection matching the selector.

@@ -903,21 +1084,21 @@ class ElementCollection(Generic[T]):
  selector_obj = parse_selector(selector)
  except Exception as e:
  logger.error(f"Error parsing selector '{selector}': {e}")
- return ElementCollection([]) # Return empty on parse error
+ return ElementCollection([]) # Return empty on parse error

  # Pass regex and case flags to selector function generator
- kwargs['regex'] = regex
- kwargs['case'] = case
+ kwargs["regex"] = regex
+ kwargs["case"] = case

  try:
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
  except Exception as e:
  logger.error(f"Error creating filter function for selector '{selector}': {e}")
- return ElementCollection([]) # Return empty on filter creation error
+ return ElementCollection([]) # Return empty on filter creation error

  matching_elements = [element for element in self._elements if filter_func(element)]

- # Note: Unlike Page.find_all, this doesn't re-sort.
+ # Note: Unlike Page.find_all, this doesn't re-sort.
  # Sorting should be done explicitly on the collection if needed.

  return ElementCollection(matching_elements)
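A hedged usage sketch of the collection-level filtering above: `regex` and `case` are forwarded into the selector filter, parse or filter errors yield an empty collection, and (per the note) results keep the collection's existing order. The loader call, file path, and selector are illustrative assumptions:

```python
from natural_pdf import PDF  # assumed top-level loader

pdf = PDF("example.pdf")
words = pdf.pages[0].find_all("text")  # an ElementCollection

# Case-insensitive regex filtering within the existing collection.
totals = words.find_all('text:contains("total")', regex=True, case=False)
print(f"{len(totals)} of {len(words)} elements matched")
```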
@@ -938,96 +1119,134 @@ class ElementCollection(Generic[T]):
  results = self.find_all(selector, regex=regex, case=case, **kwargs)
  return results.first

+ def correct_ocr(
+ self,
+ correction_callback: Callable[[Any], Optional[str]],
+ ) -> "ElementCollection":
+ """
+ Applies corrections to OCR-generated text elements within this collection
+ using a user-provided callback function.
+
+ Iterates through elements currently in the collection. If an element's
+ 'source' attribute starts with 'ocr', it calls the `correction_callback`
+ for that element, passing the element itself.
+
+ The `correction_callback` should contain the logic to:
+ 1. Determine if the element needs correction.
+ 2. Perform the correction (e.g., call an LLM).
+ 3. Return the new text (`str`) or `None`.
+
+ If the callback returns a string, the element's `.text` is updated in place.
+ Metadata updates (source, confidence, etc.) should happen within the callback.
+ Elements without a source starting with 'ocr' are skipped.
+
+ Args:
+ correction_callback: A function accepting an element and returning
+ `Optional[str]` (new text or None).
+
+ Returns:
+ Self for method chaining.
+ """
+ # Delegate to the utility function
+ _apply_ocr_correction_to_elements(
+ elements=self._elements,
+ correction_callback=correction_callback,
+ caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
+ )
+ return self # Return self for chaining
+
+
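A minimal callback sketch for the `correct_ocr` contract just added: the callback receives each OCR-sourced element and returns replacement text or `None`; non-OCR elements are skipped automatically. The specific substitutions and the `page` variable are illustrative assumptions:

```python
from typing import Optional

def fix_common_ocr_errors(element) -> Optional[str]:
    """Return corrected text, or None to leave the element unchanged."""
    text = element.text or ""
    corrected = text.replace("lnvoice", "Invoice").replace("0rder", "Order")
    return corrected if corrected != text else None

# Elements whose 'source' does not start with 'ocr' are skipped, so the whole
# text collection can be passed; the call returns self for chaining.
page.find_all("text").correct_ocr(fix_common_ocr_errors)
```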
  class PageCollection(Generic[P]):
  """
  A collection of PDF pages with cross-page operations.
-
+
  This class provides methods for working with multiple pages, such as finding
  elements across pages, extracting text from page ranges, and more.
  """
-
+
  def __init__(self, pages: List[P]):
  """
  Initialize a page collection.
-
+
  Args:
  pages: List of Page objects
  """
  self.pages = pages
-
+
  def __len__(self) -> int:
  """Return the number of pages in the collection."""
  return len(self.pages)
-
- def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
+
+ def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
  """Support indexing and slicing."""
  if isinstance(idx, slice):
  return PageCollection(self.pages[idx])
  return self.pages[idx]
-
+
  def __iter__(self) -> Iterator[P]:
  """Support iteration."""
  return iter(self.pages)
-
+
  def __repr__(self) -> str:
  """Return a string representation showing the page count."""
  return f"<PageCollection(count={len(self)})>"
-
+
  def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
  """
  Extract text from all pages in the collection.
-
+
  Args:
  keep_blank_chars: Whether to keep blank characters (default: True)
  apply_exclusions: Whether to apply exclusion regions (default: True)
  **kwargs: Additional extraction parameters
-
+
  Returns:
  Combined text from all pages
  """
  texts = []
  for page in self.pages:
  text = page.extract_text(
- keep_blank_chars=keep_blank_chars,
- apply_exclusions=apply_exclusions,
- **kwargs
+ keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
  )
  texts.append(text)
-
+
  return "\n".join(texts)

- # --- NEW METHOD ---
  def apply_ocr(
  self,
  engine: Optional[str] = None,
- options: Optional[OCROptions] = None,
+ # --- Common OCR Parameters (Direct Arguments) ---
  languages: Optional[List[str]] = None,
- min_confidence: Optional[float] = None,
+ min_confidence: Optional[float] = None, # Min confidence threshold
  device: Optional[str] = None,
- # Add other simple mode args if needed
- ) -> 'PageCollection[P]':
+ resolution: Optional[int] = None, # DPI for rendering
+ apply_exclusions: bool = True, # New parameter
+ # --- Engine-Specific Options ---
+ options: Optional[Any] = None, # e.g., EasyOCROptions(...)
+ ) -> "PageCollection[P]":
  """
  Applies OCR to all pages within this collection using batch processing.

- This delegates the work to the parent PDF object's `apply_ocr` method for efficiency. The OCR results (TextElements) are added directly
- to the respective Page objects within this collection.
+ This delegates the work to the parent PDF object's `apply_ocr` method.

  Args:
- engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
- Uses manager's default if None. Ignored if 'options' is provided.
- options: An specific Options object (e.g., EasyOCROptions) for
- advanced configuration. Overrides simple arguments.
- languages: List of language codes for simple mode.
- min_confidence: Minimum confidence threshold for simple mode.
- device: Device string ('cpu', 'cuda', etc.) for simple mode.
+ engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
+ languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
+ **Must be codes understood by the specific selected engine.**
+ No mapping is performed.
+ min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
+ device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
+ resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
+ apply_exclusions: If True (default), render page images for OCR with
+ excluded areas masked (whited out). If False, OCR
+ the raw page images without masking exclusions.
+ options: An engine-specific options object (e.g., EasyOCROptions) or dict.

  Returns:
  Self for method chaining.

  Raises:
- RuntimeError: If pages in the collection lack a parent PDF object
- or if the parent PDF object lacks the required
- `apply_ocr` method.
+ RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
  (Propagates exceptions from PDF.apply_ocr)
  """
  if not self.pages:
@@ -1036,43 +1255,43 @@ class PageCollection(Generic[P]):

  # Assume all pages share the same parent PDF object
  first_page = self.pages[0]
- if not hasattr(first_page, '_parent') or not first_page._parent:
+ if not hasattr(first_page, "_parent") or not first_page._parent:
  raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

  parent_pdf = first_page._parent

- # Updated check for renamed method
- if not hasattr(parent_pdf, 'apply_ocr') or not callable(parent_pdf.apply_ocr):
- raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
+ if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
+ raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")

  # Get the 0-based indices of the pages in this collection
  page_indices = [p.index for p in self.pages]

  logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

- # Delegate the batch call to the parent PDF object (using renamed method)
+ # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
  parent_pdf.apply_ocr(
  pages=page_indices,
  engine=engine,
- options=options,
  languages=languages,
- min_confidence=min_confidence,
- device=device
- # Pass any other relevant simple_kwargs here if added
+ min_confidence=min_confidence, # Pass the renamed parameter
+ device=device,
+ resolution=resolution,
+ apply_exclusions=apply_exclusions, # Pass down
+ options=options,
  )
  # The PDF method modifies the Page objects directly by adding elements.

- return self # Return self for chaining
+ return self # Return self for chaining

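A hedged sketch of batch OCR over a page range, using only the parameters documented in the new signature above; the engine choice, language codes, file path, and slice are illustrative, and `pdf.pages` is assumed to behave as a `PageCollection`:

```python
from natural_pdf import PDF  # assumed top-level loader

pdf = PDF("scanned-report.pdf")

# Slicing returns another PageCollection (see __getitem__), so OCR can be
# restricted to a range; the call delegates to PDF.apply_ocr and returns self.
pdf.pages[0:5].apply_ocr(
    engine="easyocr",
    languages=["en"],       # must be codes the selected engine understands
    min_confidence=0.5,
    resolution=300,         # DPI used when rendering pages for OCR
    apply_exclusions=True,  # mask excluded regions in the rendered images
)
```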
  def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
  """
  Find the first element matching the selector across all pages.
-
+
  Args:
  selector: CSS-like selector string
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
  **kwargs: Additional filter parameters
-
+
  Returns:
  First matching element or None
  """
@@ -1081,16 +1300,16 @@ class PageCollection(Generic[P]):
  if element:
  return element
  return None
-
+
  def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
  """
  Find all elements matching the selector across all pages.
-
+
  Args:
  selector: CSS-like selector string
  apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
  **kwargs: Additional filter parameters
-
+
  Returns:
  ElementCollection with matching elements from all pages
  """
@@ -1099,57 +1318,79 @@ class PageCollection(Generic[P]):
  elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
  if elements:
  all_elements.extend(elements.elements)
-
+
  return ElementCollection(all_elements)
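A short sketch of cross-page selection with the two methods above: `find()` returns the first hit in page order, while `find_all()` merges per-page results into one `ElementCollection`. The file, slice, and selector are illustrative assumptions:

```python
from natural_pdf import PDF  # assumed top-level loader

pdf = PDF("annual-report.pdf")
pages = pdf.pages[10:20]  # PageCollection slice

first_heading = pages.find("text[size>=16]")     # first match or None
all_headings = pages.find_all("text[size>=16]")  # ElementCollection
print(first_heading, len(all_headings))
```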
-
- # def debug_ocr(self, output_path):
- # """
- # Generate an interactive HTML debug report for OCR results.
-
- # This creates a single-file HTML report with:
- # - Side-by-side view of image regions and OCR text
- # - Confidence scores with color coding
- # - Editable correction fields
- # - Filtering and sorting options
- # - Export functionality for corrected text
-
- # Args:
- # output_path: Path to save the HTML report
-
- # Returns:
- # Path to the generated HTML file
- # """
- # from natural_pdf.utils.ocr import debug_ocr_to_html
- # return debug_ocr_to_html(self.pages, output_path)
-
- def get_sections(self,
- start_elements=None,
- end_elements=None,
- new_section_on_page_break=False,
- boundary_inclusion='both') -> List['Region']:
+
+ def correct_ocr(
+ self,
+ correction_callback: Callable[[Any], Optional[str]],
+ ) -> "PageCollection[P]":
+ """
+ Applies corrections to OCR-generated text elements across all pages
+ in this collection using a user-provided callback function.
+
+ This method delegates to the parent PDF's `correct_ocr` method,
+ targeting all pages within this collection.
+
+ Args:
+ correction_callback: A function that accepts a single argument (an element
+ object) and returns `Optional[str]` (new text or None).
+
+ Returns:
+ A dictionary containing aggregate statistics for the process across all pages:
+ {'elements_checked': total_checked, 'corrections_applied': total_applied}
+
+ Raises:
+ RuntimeError: If the collection is empty, pages lack a parent PDF reference,
+ or the parent PDF lacks the `correct_ocr` method.
+ """
+ if not self.pages:
+ logger.warning("Cannot correct OCR for an empty PageCollection.")
+
+ # Assume all pages share the same parent PDF object
+ parent_pdf = self.pages[0]._parent
+
+ page_indices = [p.index for p in self.pages]
+ logger.info(f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices}.")
+
+ # Delegate the call to the parent PDF object for the relevant pages
+ parent_pdf.correct_ocr(
+ correction_callback=correction_callback,
+ pages=page_indices
+ )
+
+ return self
+
+ def get_sections(
+ self,
+ start_elements=None,
+ end_elements=None,
+ new_section_on_page_break=False,
+ boundary_inclusion="both",
+ ) -> List["Region"]:
  """
  Extract sections from a page collection based on start/end elements.
-
+
  Args:
  start_elements: Elements or selector string that mark the start of sections
  end_elements: Elements or selector string that mark the end of sections
  new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
  boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
-
+
  Returns:
  List of Region objects representing the extracted sections
  """
  # Find start and end elements across all pages
  if isinstance(start_elements, str):
  start_elements = self.find_all(start_elements).elements
-
+
  if isinstance(end_elements, str):
  end_elements = self.find_all(end_elements).elements
-
+
  # If no start elements, return empty list
  if not start_elements:
  return []
-
+
  # If there are page break boundaries, we'll need to add them
  if new_section_on_page_break:
  # For each page boundary, create virtual "end" and "start" elements
@@ -1159,183 +1400,200 @@ class PageCollection(Generic[P]):
  # If end_elements is None, initialize it as an empty list
  if end_elements is None:
  end_elements = []
-
+
  # Create a region at the bottom of the page as an artificial end marker
  from natural_pdf.elements.region import Region
+
  bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
  bottom_region.is_page_boundary = True # Mark it as a special boundary
  end_elements.append(bottom_region)
-
+
  # Add a virtual "start" element at the top of the next page
  next_page = self.pages[i + 1]
  top_region = Region(next_page, (0, 0, next_page.width, 1))
  top_region.is_page_boundary = True # Mark it as a special boundary
  start_elements.append(top_region)
-
+
  # Get all elements from all pages and sort them in document order
  all_elements = []
  for page in self.pages:
  elements = page.get_elements()
  all_elements.extend(elements)
-
+
  # Sort by page index, then vertical position, then horizontal position
  all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
-
+
  # Mark section boundaries
  section_boundaries = []
-
+
  # Add start element boundaries
  for element in start_elements:
  if element in all_elements:
  idx = all_elements.index(element)
- section_boundaries.append({
- 'index': idx,
- 'element': element,
- 'type': 'start',
- 'page_idx': element.page.index
- })
- elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+ section_boundaries.append(
+ {
+ "index": idx,
+ "element": element,
+ "type": "start",
+ "page_idx": element.page.index,
+ }
+ )
+ elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
  # This is a virtual page boundary element
- section_boundaries.append({
- 'index': -1, # Special index for page boundaries
- 'element': element,
- 'type': 'start',
- 'page_idx': element.page.index
- })
-
+ section_boundaries.append(
+ {
+ "index": -1, # Special index for page boundaries
+ "element": element,
+ "type": "start",
+ "page_idx": element.page.index,
+ }
+ )
+
  # Add end element boundaries if provided
  if end_elements:
  for element in end_elements:
  if element in all_elements:
  idx = all_elements.index(element)
- section_boundaries.append({
- 'index': idx,
- 'element': element,
- 'type': 'end',
- 'page_idx': element.page.index
- })
- elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+ section_boundaries.append(
+ {
+ "index": idx,
+ "element": element,
+ "type": "end",
+ "page_idx": element.page.index,
+ }
+ )
+ elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
  # This is a virtual page boundary element
- section_boundaries.append({
- 'index': -1, # Special index for page boundaries
- 'element': element,
- 'type': 'end',
- 'page_idx': element.page.index
- })
-
+ section_boundaries.append(
+ {
+ "index": -1, # Special index for page boundaries
+ "element": element,
+ "type": "end",
+ "page_idx": element.page.index,
+ }
+ )
+
  # Sort boundaries by page index, then by actual document position
- section_boundaries.sort(key=lambda x: (x['page_idx'],
- x['index'] if x['index'] != -1 else
- (0 if x['type'] == 'start' else float('inf'))))
-
+ section_boundaries.sort(
+ key=lambda x: (
+ x["page_idx"],
+ x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
+ )
+ )
+
  # Generate sections
  sections = []
  current_start = None
-
+
  for i, boundary in enumerate(section_boundaries):
  # If it's a start boundary and we don't have a current start
- if boundary['type'] == 'start' and current_start is None:
+ if boundary["type"] == "start" and current_start is None:
  current_start = boundary
-
+
  # If it's an end boundary and we have a current start
- elif boundary['type'] == 'end' and current_start is not None:
+ elif boundary["type"] == "end" and current_start is not None:
  # Create a section from current_start to this boundary
- start_element = current_start['element']
- end_element = boundary['element']
-
+ start_element = current_start["element"]
+ end_element = boundary["element"]
+
  # If both elements are on the same page, use the page's get_section_between
  if start_element.page == end_element.page:
  section = start_element.page.get_section_between(
- start_element,
- end_element,
- boundary_inclusion
+ start_element, end_element, boundary_inclusion
  )
  sections.append(section)
  else:
  # Create a multi-page section
  from natural_pdf.elements.region import Region
-
+
  # Get the start and end pages
  start_page = start_element.page
  end_page = end_element.page
-
+
  # Create a combined region
  combined_region = Region(
- start_page,
- (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_element.top, start_page.width, start_page.height)
  )
  combined_region._spans_pages = True
  combined_region._page_range = (start_page.index, end_page.index)
  combined_region.start_element = start_element
  combined_region.end_element = end_element
-
+
  # Get all elements that fall within this multi-page region
  combined_elements = []
-
+
  # Get elements from the first page
- first_page_elements = [e for e in all_elements
- if e.page == start_page and e.top >= start_element.top]
+ first_page_elements = [
+ e
+ for e in all_elements
+ if e.page == start_page and e.top >= start_element.top
+ ]
  combined_elements.extend(first_page_elements)
-
+
  # Get elements from middle pages (if any)
  for page_idx in range(start_page.index + 1, end_page.index):
  middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
  combined_elements.extend(middle_page_elements)
-
+
  # Get elements from the last page
- last_page_elements = [e for e in all_elements
- if e.page == end_page and e.bottom <= end_element.bottom]
+ last_page_elements = [
+ e
+ for e in all_elements
+ if e.page == end_page and e.bottom <= end_element.bottom
+ ]
  combined_elements.extend(last_page_elements)
-
+
  # Store the elements in the combined region
  combined_region._multi_page_elements = combined_elements
-
+
  sections.append(combined_region)
-
+
  current_start = None
-
+
  # If it's another start boundary and we have a current start (for splitting by starts only)
- elif boundary['type'] == 'start' and current_start is not None and not end_elements:
+ elif boundary["type"] == "start" and current_start is not None and not end_elements:
  # Create a section from current_start to just before this boundary
- start_element = current_start['element']
-
+ start_element = current_start["element"]
+
  # Find the last element before this boundary on the same page
- if start_element.page == boundary['element'].page:
+ if start_element.page == boundary["element"].page:
  # Find elements on this page
  page_elements = [e for e in all_elements if e.page == start_element.page]
  # Sort by position
  page_elements.sort(key=lambda e: (e.top, e.x0))
-
+
  # Find the last element before the boundary
- end_idx = page_elements.index(boundary['element']) - 1 if boundary['element'] in page_elements else -1
+ end_idx = (
+ page_elements.index(boundary["element"]) - 1
+ if boundary["element"] in page_elements
+ else -1
+ )
  end_element = page_elements[end_idx] if end_idx >= 0 else None
-
+
  # Create the section
  section = start_element.page.get_section_between(
- start_element,
- end_element,
- boundary_inclusion
+ start_element, end_element, boundary_inclusion
  )
  sections.append(section)
  else:
  # Cross-page section - create from current_start to the end of its page
  from natural_pdf.elements.region import Region
+
  start_page = start_element.page
-
+
  region = Region(
- start_page,
- (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_element.top, start_page.width, start_page.height)
  )
  region.start_element = start_element
  sections.append(region)
-
+
  current_start = boundary
-
+
  # Handle the last section if we have a current start
  if current_start is not None:
- start_element = current_start['element']
+ start_element = current_start["element"]
  start_page = start_element.page
-
+
  if end_elements:
  # With end_elements, we need an explicit end - use the last element
  # on the last page of the collection
@@ -1343,59 +1601,63 @@ class PageCollection(Generic[P]):
  last_page_elements = [e for e in all_elements if e.page == last_page]
  last_page_elements.sort(key=lambda e: (e.top, e.x0))
  end_element = last_page_elements[-1] if last_page_elements else None
-
+
  # Create a multi-page section
  from natural_pdf.elements.region import Region
-
+
  if start_page == last_page:
  # Simple case - both on same page
  section = start_page.get_section_between(
- start_element,
- end_element,
- boundary_inclusion
+ start_element, end_element, boundary_inclusion
  )
  sections.append(section)
  else:
  # Create a multi-page section
  combined_region = Region(
- start_page,
- (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_element.top, start_page.width, start_page.height)
  )
  combined_region._spans_pages = True
  combined_region._page_range = (start_page.index, last_page.index)
  combined_region.start_element = start_element
  combined_region.end_element = end_element
-
+
  # Get all elements that fall within this multi-page region
  combined_elements = []
-
+
  # Get elements from the first page
- first_page_elements = [e for e in all_elements
- if e.page == start_page and e.top >= start_element.top]
+ first_page_elements = [
+ e
+ for e in all_elements
+ if e.page == start_page and e.top >= start_element.top
+ ]
  combined_elements.extend(first_page_elements)
-
+
  # Get elements from middle pages (if any)
  for page_idx in range(start_page.index + 1, last_page.index):
  middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
  combined_elements.extend(middle_page_elements)
-
+
  # Get elements from the last page
- last_page_elements = [e for e in all_elements
- if e.page == last_page and (end_element is None or e.bottom <= end_element.bottom)]
+ last_page_elements = [
+ e
+ for e in all_elements
+ if e.page == last_page
+ and (end_element is None or e.bottom <= end_element.bottom)
+ ]
  combined_elements.extend(last_page_elements)
-
+
  # Store the elements in the combined region
  combined_region._multi_page_elements = combined_elements
-
+
  sections.append(combined_region)
  else:
  # With start_elements only, create a section to the end of the current page
  from natural_pdf.elements.region import Region
+
  region = Region(
- start_page,
- (0, start_element.top, start_page.width, start_page.height)
+ start_page, (0, start_element.top, start_page.width, start_page.height)
  )
  region.start_element = start_element
  sections.append(region)
-
+
- return sections
+
+ return sections
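A hedged sketch of section extraction with the method above: start boundaries can be a selector string, `new_section_on_page_break` splits at the virtual top/bottom page regions, and `boundary_inclusion` controls whether boundary elements are kept. The selector, file, and the use of `Region.extract_text()` are assumptions consistent with the rest of the package:

```python
from natural_pdf import PDF  # assumed top-level loader

pdf = PDF("handbook.pdf")

sections = pdf.pages.get_sections(
    start_elements="text[size>=14]",   # each large heading starts a section
    new_section_on_page_break=False,
    boundary_inclusion="start",        # keep each section's own heading
)
for section in sections:
    print(section.extract_text()[:60])
```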