natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares two publicly released versions of the package and reflects the changes between them as they appear in the public registry. It is provided for informational purposes only.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,24 @@
  import logging
-
- from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Dict,
+     Generic,
+     Iterator,
+     List,
+     Optional,
+     Tuple,
+     TypeVar,
+     Union,
+ )
+
+ from pdfplumber.utils.geometry import objects_to_bbox
+
+ # New Imports
+ from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
+
+ from natural_pdf.elements.text import TextElement  # Needed for isinstance check
  from natural_pdf.ocr import OCROptions
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

@@ -10,35 +28,36 @@ if TYPE_CHECKING:
      from natural_pdf.core.page import Page
      from natural_pdf.elements.region import Region

- T = TypeVar('T')
- P = TypeVar('P', bound='Page')
+ T = TypeVar("T")
+ P = TypeVar("P", bound="Page")
+

  class ElementCollection(Generic[T]):
      """
      Collection of PDF elements with batch operations.
      """
-
+
      def __init__(self, elements: List[T]):
          """
          Initialize a collection of elements.
-
+
          Args:
              elements: List of Element objects
          """
          self._elements = elements or []
-
+
      def __len__(self) -> int:
          """Get the number of elements in the collection."""
          return len(self._elements)
-
-     def __getitem__(self, index: int) -> 'Element':
+
+     def __getitem__(self, index: int) -> "Element":
          """Get an element by index."""
          return self._elements[index]
-
+
      def __iter__(self):
          """Iterate over elements."""
          return iter(self._elements)
-
+
      def __repr__(self) -> str:
          """Return a string representation showing the element count."""
          element_type = "Mixed"
@@ -47,130 +66,130 @@ class ElementCollection(Generic[T]):
          if len(types) == 1:
              element_type = types.pop()
          return f"<ElementCollection[{element_type}](count={len(self)})>"
-
+
      @property
-     def elements(self) -> List['Element']:
+     def elements(self) -> List["Element"]:
          """Get the elements in this collection."""
          return self._elements
-
+
      @property
-     def first(self) -> Optional['Element']:
+     def first(self) -> Optional["Element"]:
          """Get the first element in the collection."""
          return self._elements[0] if self._elements else None
-
+
      @property
-     def last(self) -> Optional['Element']:
+     def last(self) -> Optional["Element"]:
          """Get the last element in the collection."""
          return self._elements[-1] if self._elements else None
-
-     def highest(self) -> Optional['Element']:
+
+     def highest(self) -> Optional["Element"]:
          """
          Get element with the smallest top y-coordinate (highest on page).
-
+
          Raises:
              ValueError: If elements are on multiple pages
-
+
          Returns:
              Element with smallest top value or None if empty
          """
          if not self._elements:
              return None
-
+
          # Check if elements are on multiple pages
          if self._are_on_multiple_pages():
              raise ValueError("Cannot determine highest element across multiple pages")
-
+
          return min(self._elements, key=lambda e: e.top)
-
-     def lowest(self) -> Optional['Element']:
+
+     def lowest(self) -> Optional["Element"]:
          """
          Get element with the largest bottom y-coordinate (lowest on page).
-
+
          Raises:
              ValueError: If elements are on multiple pages
-
+
          Returns:
              Element with largest bottom value or None if empty
          """
          if not self._elements:
              return None
-
+
          # Check if elements are on multiple pages
          if self._are_on_multiple_pages():
              raise ValueError("Cannot determine lowest element across multiple pages")
-
+
          return max(self._elements, key=lambda e: e.bottom)
-
-     def leftmost(self) -> Optional['Element']:
+
+     def leftmost(self) -> Optional["Element"]:
          """
          Get element with the smallest x0 coordinate (leftmost on page).
-
+
          Raises:
              ValueError: If elements are on multiple pages
-
+
          Returns:
              Element with smallest x0 value or None if empty
          """
          if not self._elements:
              return None
-
+
          # Check if elements are on multiple pages
          if self._are_on_multiple_pages():
              raise ValueError("Cannot determine leftmost element across multiple pages")
-
+
          return min(self._elements, key=lambda e: e.x0)
-
-     def rightmost(self) -> Optional['Element']:
+
+     def rightmost(self) -> Optional["Element"]:
          """
          Get element with the largest x1 coordinate (rightmost on page).
-
+
          Raises:
              ValueError: If elements are on multiple pages
-
+
          Returns:
              Element with largest x1 value or None if empty
          """
          if not self._elements:
              return None
-
+
          # Check if elements are on multiple pages
          if self._are_on_multiple_pages():
              raise ValueError("Cannot determine rightmost element across multiple pages")
-
+
          return max(self._elements, key=lambda e: e.x1)
-
+
      def _are_on_multiple_pages(self) -> bool:
          """
          Check if elements in this collection span multiple pages.
-
+
          Returns:
              True if elements are on different pages, False otherwise
          """
          if not self._elements:
              return False
-
+
          # Get the page index of the first element
-         if not hasattr(self._elements[0], 'page'):
+         if not hasattr(self._elements[0], "page"):
              return False
-
+
          first_page_idx = self._elements[0].page.index
-
+
          # Check if any element is on a different page
-         return any(hasattr(e, 'page') and e.page.index != first_page_idx for e in self._elements)
-
-     def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
+         return any(hasattr(e, "page") and e.page.index != first_page_idx for e in self._elements)
+
+     def exclude_regions(self, regions: List["Region"]) -> "ElementCollection":
          """
          Remove elements that are within any of the specified regions.
-
+
          Args:
              regions: List of Region objects to exclude
-
+
          Returns:
              New ElementCollection with filtered elements
          """
          if not regions:
              return ElementCollection(self._elements)
-
+
          filtered = []
          for element in self._elements:
              exclude = False
@@ -180,72 +199,156 @@ class ElementCollection(Generic[T]):
                      break
              if not exclude:
                  filtered.append(element)
-
+
          return ElementCollection(filtered)
-
+
      def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
          """
-         Extract text from all elements in the collection.
-
+         Extract text from all TextElements in the collection, optionally using
+         pdfplumber's layout engine if layout=True is specified.
+
          Args:
-             preserve_whitespace: Whether to keep blank characters (default: True)
-             use_exclusions: Whether to apply exclusion regions (default: True)
-             **kwargs: Additional extraction parameters
-
+             preserve_whitespace: Deprecated. Use layout=False for simple joining.
+             use_exclusions: Deprecated. Exclusions should be applied *before* creating
+                 the collection or by filtering the collection itself.
+             **kwargs: Additional layout parameters passed directly to pdfplumber's
+                 `chars_to_textmap` function ONLY if `layout=True` is passed.
+                 See Page.extract_text docstring for common parameters.
+                 If `layout=False` or omitted, performs a simple join.
+
          Returns:
-             Combined text from all elements
+             Combined text from elements, potentially with layout-based spacing.
          """
-         # Filter to just text-like elements
-         text_elements = [e for e in self._elements if hasattr(e, 'extract_text')]
-
-         # Sort elements in reading order (top-to-bottom, left-to-right)
-         sorted_elements = sorted(text_elements, key=lambda e: (e.top, e.x0))
-
-         # Extract text from each element
-         texts = []
-         for element in sorted_elements:
-             # Extract text with new parameter names
-             text = element.extract_text(preserve_whitespace=preserve_whitespace, use_exclusions=use_exclusions, **kwargs)
-
-             if text:
-                 texts.append(text)
-
-         return " ".join(texts)
-
-     def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
+         # Filter to just TextElements that likely have _char_dicts
+         text_elements = [
+             el
+             for el in self._elements
+             if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
+         ]
+
+         if not text_elements:
+             return ""
+
+         # Collect all character dictionaries
+         all_char_dicts = []
+         for el in text_elements:
+             all_char_dicts.extend(getattr(el, "_char_dicts", []))
+
+         if not all_char_dicts:
+             # Handle case where elements exist but have no char dicts
+             logger.warning(
+                 "ElementCollection.extract_text: No character dictionaries found in TextElements."
+             )
+             return " ".join(
+                 getattr(el, "text", "") for el in text_elements
+             )  # Fallback to simple join of word text
+
+         # Check if layout is requested
+         use_layout = kwargs.get("layout", False)
+
+         if use_layout:
+             logger.debug("ElementCollection.extract_text: Using layout=True path.")
+             # Layout requested: Use chars_to_textmap
+
+             # Prepare layout kwargs
+             layout_kwargs = {}
+             allowed_keys = set(WORD_EXTRACTOR_KWARGS) | set(TEXTMAP_KWARGS)
+             for key, value in kwargs.items():
+                 if key in allowed_keys:
+                     layout_kwargs[key] = value
+             layout_kwargs["layout"] = True  # Ensure layout is True
+
+             # Calculate overall bbox for the elements used
+             collection_bbox = objects_to_bbox(all_char_dicts)
+             coll_x0, coll_top, coll_x1, coll_bottom = collection_bbox
+             coll_width = coll_x1 - coll_x0
+             coll_height = coll_bottom - coll_top
+
+             # Set layout parameters based on collection bounds
+             # Warn if collection is sparse? TBD.
+             if "layout_bbox" not in layout_kwargs:
+                 layout_kwargs["layout_bbox"] = collection_bbox
+             if "layout_width" not in layout_kwargs:
+                 layout_kwargs["layout_width"] = coll_width
+             if "layout_height" not in layout_kwargs:
+                 layout_kwargs["layout_height"] = coll_height
+             # Set shifts relative to the collection's top-left
+             if "x_shift" not in layout_kwargs:
+                 layout_kwargs["x_shift"] = coll_x0
+             if "y_shift" not in layout_kwargs:
+                 layout_kwargs["y_shift"] = coll_top
+
+             try:
+                 # Sort chars by document order (page, top, x0)
+                 # Need page info on char dicts for multi-page collections
+                 # Assuming char dicts have 'page_number' from element creation
+                 all_char_dicts.sort(
+                     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+                 )
+                 textmap = chars_to_textmap(all_char_dicts, **layout_kwargs)
+                 result = textmap.as_string
+             except Exception as e:
+                 logger.error(
+                     f"ElementCollection: Error calling chars_to_textmap: {e}", exc_info=True
+                 )
+                 logger.warning(
+                     "ElementCollection: Falling back to simple text join due to layout error."
+                 )
+                 # Fallback sorting and joining
+                 all_char_dicts.sort(
+                     key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+                 )
+                 result = " ".join(c.get("text", "") for c in all_char_dicts)
+
+         else:
+             # Default: Simple join without layout
+             logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
+             # Sort chars by document order (page, top, x0)
+             all_char_dicts.sort(
+                 key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
+             )
+             # Simple join of character text
+             result = "".join(c.get("text", "") for c in all_char_dicts)
+             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+
+         return result
+
+     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
          """
          Filter elements using a function.
-
+
          Args:
              func: Function that takes an element and returns True to keep it
-
+
          Returns:
              New ElementCollection with filtered elements
          """
          return ElementCollection([e for e in self._elements if func(e)])
-
-     def sort(self, key=None, reverse=False) -> 'ElementCollection':
+
+     def sort(self, key=None, reverse=False) -> "ElementCollection":
          """
          Sort elements by the given key function.
-
+
          Args:
              key: Function to generate a key for sorting
              reverse: Whether to sort in descending order
-
+
          Returns:
              Self for method chaining
          """
          self._elements.sort(key=key, reverse=reverse)
          return self
-
-     def highlight(self,
-                   label: Optional[str] = None,
-                   color: Optional[Union[Tuple, str]] = None,
-                   group_by: Optional[str] = None,
-                   label_format: Optional[str] = None,
-                   distinct: bool = False,
-                   include_attrs: Optional[List[str]] = None,
-                   replace: bool = False) -> 'ElementCollection':
+
+     def highlight(
+         self,
+         label: Optional[str] = None,
+         color: Optional[Union[Tuple, str]] = None,
+         group_by: Optional[str] = None,
+         label_format: Optional[str] = None,
+         distinct: bool = False,
+         include_attrs: Optional[List[str]] = None,
+         replace: bool = False,
+     ) -> "ElementCollection":
          """
          Adds persistent highlights for all elements in the collection to the page
          via the HighlightingService.
@@ -294,17 +397,17 @@ class ElementCollection(Generic[T]):
              color=color,
              group_by=group_by,
              label_format=label_format,
-             include_attrs=include_attrs
+             include_attrs=include_attrs,
              # 'replace' flag is handled during the add call below
          )

          # 2. Add prepared highlights to the persistent service
          if not highlight_data_list:
-             return self # Nothing to add
+             return self  # Nothing to add

          # Get page and highlighter from the first element (assume uniform page)
          first_element = self._elements[0]
-         if not hasattr(first_element, 'page') or not hasattr(first_element.page, '_highlighter'):
+         if not hasattr(first_element, "page") or not hasattr(first_element.page, "_highlighter"):
              logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
              return self

@@ -317,42 +420,48 @@ class ElementCollection(Generic[T]):
          if replace:
              # Identify all unique page indices in this operation
              for data in highlight_data_list:
-                 pages_to_clear.add(data['page_index'])
+                 pages_to_clear.add(data["page_index"])
              # Clear those pages *before* adding new highlights
-             logger.debug(f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}")
+             logger.debug(
+                 f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}"
+             )
              for page_idx in pages_to_clear:
                  highlighter.clear_page(page_idx)

          for data in highlight_data_list:
              # Call the appropriate service add method
              add_args = {
-                 "page_index": data['page_index'],
-                 "color": data['color'], # Color determined by _prepare
-                 "label": data['label'], # Label determined by _prepare
-                 "use_color_cycling": data.get('use_color_cycling', False), # Set by _prepare if distinct
-                 "element": data['element'],
-                 "include_attrs": data['include_attrs'],
+                 "page_index": data["page_index"],
+                 "color": data["color"],  # Color determined by _prepare
+                 "label": data["label"],  # Label determined by _prepare
+                 "use_color_cycling": data.get(
+                     "use_color_cycling", False
+                 ),  # Set by _prepare if distinct
+                 "element": data["element"],
+                 "include_attrs": data["include_attrs"],
                  # Internal call to service always appends, as clearing was handled above
-                 "existing": 'append'
+                 "existing": "append",
              }
-             if data.get('polygon'):
-                 add_args["polygon"] = data['polygon']
+             if data.get("polygon"):
+                 add_args["polygon"] = data["polygon"]
                  highlighter.add_polygon(**add_args)
-             elif data.get('bbox'):
-                 add_args["bbox"] = data['bbox']
+             elif data.get("bbox"):
+                 add_args["bbox"] = data["bbox"]
                  highlighter.add(**add_args)
              else:
                  logger.warning(f"Skipping highlight data, no bbox or polygon found: {data}")

          return self

-     def _prepare_highlight_data(self,
-                                 distinct: bool = False,
-                                 label: Optional[str] = None,
-                                 color: Optional[Union[Tuple, str]] = None,
-                                 group_by: Optional[str] = None,
-                                 label_format: Optional[str] = None,
-                                 include_attrs: Optional[List[str]] = None) -> List[Dict]:
+     def _prepare_highlight_data(
+         self,
+         distinct: bool = False,
+         label: Optional[str] = None,
+         color: Optional[Union[Tuple, str]] = None,
+         group_by: Optional[str] = None,
+         label_format: Optional[str] = None,
+         include_attrs: Optional[List[str]] = None,
+     ) -> List[Dict]:
          """
          Determines the parameters for highlighting each element based on the strategy.

@@ -364,58 +473,64 @@ class ElementCollection(Generic[T]):
          Color and label determination happens here.
          """
          prepared_data = []
-         if not self._elements: return prepared_data
+         if not self._elements:
+             return prepared_data

          # Need access to the HighlightingService to determine colors correctly.
          highlighter = None
          first_element = self._elements[0]
-         if hasattr(first_element, 'page') and hasattr(first_element.page, '_highlighter'):
+         if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
              highlighter = first_element.page._highlighter
          else:
-             logger.warning("Cannot determine highlight colors: HighlightingService not accessible from elements.")
+             logger.warning(
+                 "Cannot determine highlight colors: HighlightingService not accessible from elements."
+             )
              return []

          if distinct:
              logger.debug("_prepare: Distinct highlighting strategy.")
              for element in self._elements:
                  # Call the service's color determination logic
-                 final_color = highlighter._determine_highlight_color(label=None, color_input=None, use_color_cycling=True)
+                 final_color = highlighter._determine_highlight_color(
+                     label=None, color_input=None, use_color_cycling=True
+                 )
                  element_data = self._get_element_highlight_params(element, include_attrs)
                  if element_data:
-                     element_data.update({
-                         'color': final_color,
-                         'label': None,
-                         'use_color_cycling': True
-                     })
+                     element_data.update(
+                         {"color": final_color, "label": None, "use_color_cycling": True}
+                     )
                      prepared_data.append(element_data)

          elif label is not None:
              logger.debug(f"_prepare: Explicit label '{label}' strategy.")
-             final_color = highlighter._determine_highlight_color(label=label, color_input=color, use_color_cycling=False)
+             final_color = highlighter._determine_highlight_color(
+                 label=label, color_input=color, use_color_cycling=False
+             )
              for element in self._elements:
                  element_data = self._get_element_highlight_params(element, include_attrs)
                  if element_data:
-                     element_data.update({
-                         'color': final_color,
-                         'label': label
-                     })
+                     element_data.update({"color": final_color, "label": label})
                      prepared_data.append(element_data)

          elif group_by is not None:
              logger.debug("_prepare: Grouping by attribute strategy.")
              grouped_elements = self._group_elements_by_attr(group_by)
              for group_key, group_elements in grouped_elements.items():
-                 if not group_elements: continue
-                 group_label = self._format_group_label(group_key, label_format, group_elements[0], group_by)
-                 final_color = highlighter._determine_highlight_color(label=group_label, color_input=None, use_color_cycling=False)
-                 logger.debug(f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}")
+                 if not group_elements:
+                     continue
+                 group_label = self._format_group_label(
+                     group_key, label_format, group_elements[0], group_by
+                 )
+                 final_color = highlighter._determine_highlight_color(
+                     label=group_label, color_input=None, use_color_cycling=False
+                 )
+                 logger.debug(
+                     f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
+                 )
                  for element in group_elements:
                      element_data = self._get_element_highlight_params(element, include_attrs)
                      if element_data:
-                         element_data.update({
-                             'color': final_color,
-                             'label': group_label
-                         })
+                         element_data.update({"color": final_color, "label": group_label})
                          prepared_data.append(element_data)
          else:
              logger.debug("_prepare: Default grouping strategy.")
@@ -423,15 +538,21 @@ class ElementCollection(Generic[T]):

              if len(element_types) == 1:
                  type_name = element_types.pop()
-                 base_name = type_name.replace("Element", "").replace("Region", "") if type_name != "Region" else "Region"
+                 base_name = (
+                     type_name.replace("Element", "").replace("Region", "")
+                     if type_name != "Region"
+                     else "Region"
+                 )
                  auto_label = f"{base_name} Elements" if base_name else "Elements"
                  # Determine color *before* logging or using it
-                 final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
+                 final_color = highlighter._determine_highlight_color(
+                     label=auto_label, color_input=color, use_color_cycling=False
+                 )
                  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
                  for element in self._elements:
                      element_data = self._get_element_highlight_params(element, include_attrs)
                      if element_data:
-                         element_data.update({'color': final_color, 'label': auto_label})
+                         element_data.update({"color": final_color, "label": auto_label})
                          prepared_data.append(element_data)
              else:
                  # Mixed types: Generate generic label and warn
@@ -442,26 +563,33 @@ class ElementCollection(Generic[T]):
                      f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
                      f"or 'distinct=True' for more specific highlighting."
                  )
-                 final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
+                 final_color = highlighter._determine_highlight_color(
+                     label=auto_label, color_input=color, use_color_cycling=False
+                 )
                  # Determine color *before* logging or using it (already done above for this branch)
                  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
                  for element in self._elements:
                      element_data = self._get_element_highlight_params(element, include_attrs)
                      if element_data:
-                         element_data.update({'color': final_color, 'label': auto_label})
+                         element_data.update({"color": final_color, "label": auto_label})
                          prepared_data.append(element_data)

          return prepared_data

-     def _call_element_highlighter(self, element: T,
-                                   color: Optional[Union[Tuple, str]],
-                                   label: Optional[str],
-                                   use_color_cycling: bool,
-                                   include_attrs: Optional[List[str]],
-                                   existing: str):
+     def _call_element_highlighter(
+         self,
+         element: T,
+         color: Optional[Union[Tuple, str]],
+         label: Optional[str],
+         use_color_cycling: bool,
+         include_attrs: Optional[List[str]],
+         existing: str,
+     ):
          """Low-level helper to call the appropriate HighlightingService method for an element."""
-         if not hasattr(element, 'page') or not hasattr(element.page, '_highlighter'):
-             logger.warning(f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}")
+         if not hasattr(element, "page") or not hasattr(element.page, "_highlighter"):
+             logger.warning(
+                 f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}"
+             )
              return

          page = element.page
@@ -472,59 +600,68 @@ class ElementCollection(Generic[T]):
              "use_color_cycling": use_color_cycling,
              "include_attrs": include_attrs,
              "existing": existing,
-             "element": element
+             "element": element,
          }

-         is_polygon = getattr(element, 'has_polygon', False)
+         is_polygon = getattr(element, "has_polygon", False)
          geom_data = None
          add_method = None

          if is_polygon:
-             geom_data = getattr(element, 'polygon', None)
+             geom_data = getattr(element, "polygon", None)
              if geom_data:
-                 args_for_highlighter['polygon'] = geom_data
+                 args_for_highlighter["polygon"] = geom_data
                  add_method = page._highlighter.add_polygon
          else:
-             geom_data = getattr(element, 'bbox', None)
+             geom_data = getattr(element, "bbox", None)
              if geom_data:
-                 args_for_highlighter['bbox'] = geom_data
+                 args_for_highlighter["bbox"] = geom_data
                  add_method = page._highlighter.add

          if add_method and geom_data:
              try:
                  add_method(**args_for_highlighter)
              except Exception as e:
-                 logger.error(f"Error calling highlighter method for element {element} on page {page.index}: {e}", exc_info=True)
+                 logger.error(
+                     f"Error calling highlighter method for element {element} on page {page.index}: {e}",
+                     exc_info=True,
+                 )
          elif not geom_data:
              logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")

-     def _highlight_as_single_group(self, label: str,
-                                    color: Optional[Union[Tuple, str]],
-                                    include_attrs: Optional[List[str]],
-                                    existing: str):
+     def _highlight_as_single_group(
+         self,
+         label: str,
+         color: Optional[Union[Tuple, str]],
+         include_attrs: Optional[List[str]],
+         existing: str,
+     ):
          """Highlights all elements with the same explicit label and color."""
          for element in self._elements:
              self._call_element_highlighter(
                  element=element,
-                 color=color, # Use explicit color if provided
-                 label=label, # Use the explicit group label
-                 use_color_cycling=False, # Use consistent color for the label
+                 color=color,  # Use explicit color if provided
+                 label=label,  # Use the explicit group label
+                 use_color_cycling=False,  # Use consistent color for the label
                  include_attrs=include_attrs,
-                 existing=existing
+                 existing=existing,
              )

-     def _highlight_grouped_by_attribute(self, group_by: str,
-                                         label_format: Optional[str],
-                                         include_attrs: Optional[List[str]],
-                                         existing: str):
+     def _highlight_grouped_by_attribute(
+         self,
+         group_by: str,
+         label_format: Optional[str],
+         include_attrs: Optional[List[str]],
+         existing: str,
+     ):
          """Groups elements by attribute and highlights each group distinctly."""
          grouped_elements: Dict[Any, List[T]] = {}
          # Group elements by the specified attribute value
          for element in self._elements:
              try:
                  group_key = getattr(element, group_by, None)
-                 if group_key is None: # Handle elements missing the attribute
-                     group_key = f"Missing '{group_by}'"
+                 if group_key is None:  # Handle elements missing the attribute
+                     group_key = f"Missing '{group_by}'"
                  # Ensure group_key is hashable (convert list/dict if necessary)
                  if isinstance(group_key, (list, dict)):
                      group_key = str(group_key)
@@ -533,41 +670,49 @@ class ElementCollection(Generic[T]):
                      grouped_elements[group_key] = []
                  grouped_elements[group_key].append(element)
              except AttributeError:
-                 logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
-                 group_key = f"Error accessing '{group_by}'"
-                 if group_key not in grouped_elements:
-                     grouped_elements[group_key] = []
-                 grouped_elements[group_key].append(element)
-             except TypeError: # Handle unhashable types
-                 logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
-                 group_key = str(group_key)
-                 if group_key not in grouped_elements:
-                     grouped_elements[group_key] = []
-                 grouped_elements[group_key].append(element)
-
+                 logger.warning(
+                     f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
+                 )
+                 group_key = f"Error accessing '{group_by}'"
+                 if group_key not in grouped_elements:
+                     grouped_elements[group_key] = []
+                 grouped_elements[group_key].append(element)
+             except TypeError:  # Handle unhashable types
+                 logger.warning(
+                     f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
+                 )
+                 group_key = str(group_key)
+                 if group_key not in grouped_elements:
+                     grouped_elements[group_key] = []
+                 grouped_elements[group_key].append(element)

          # Highlight each group
          for group_key, group_elements in grouped_elements.items():
-             if not group_elements: continue
+             if not group_elements:
+                 continue

              # Determine the label for this group
-             first_element = group_elements[0] # Use first element for formatting
+             first_element = group_elements[0]  # Use first element for formatting
              group_label = None
              if label_format:
                  try:
                      # Create a dict of element attributes for formatting
-                     element_attrs = first_element.__dict__.copy() # Start with element's dict
+                     element_attrs = first_element.__dict__.copy()  # Start with element's dict
                      # Ensure the group_by key itself is present correctly
                      element_attrs[group_by] = group_key
                      group_label = label_format.format(**element_attrs)
                  except KeyError as e:
-                     logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
+                     logger.warning(
+                         f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
+                     )
                      group_label = str(group_key)
                  except Exception as format_e:
-                     logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
+                     logger.warning(
+                         f"Error formatting label '{label_format}': {format_e}. Using group key as label."
+                     )
                      group_label = str(group_key)
              else:
-                 group_label = str(group_key) # Use the attribute value as label
+                 group_label = str(group_key)  # Use the attribute value as label

              logger.debug(f" Highlighting group '{group_label}' ({len(group_elements)} elements)")

@@ -575,11 +720,11 @@ class ElementCollection(Generic[T]):
              for element in group_elements:
                  self._call_element_highlighter(
                      element=element,
-                     color=None, # Let ColorManager choose based on label
-                     label=group_label, # Use the derived group label
-                     use_color_cycling=False, # Use consistent color for the label
+                     color=None,  # Let ColorManager choose based on label
+                     label=group_label,  # Use the derived group label
+                     use_color_cycling=False,  # Use consistent color for the label
                      include_attrs=include_attrs,
-                     existing=existing
+                     existing=existing,
                  )

      def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
@@ -589,116 +734,122 @@ class ElementCollection(Generic[T]):
          for element in self._elements:
              self._call_element_highlighter(
                  element=element,
-                 color=None, # Let ColorManager cycle
-                 label=None, # No label for distinct elements
-                 use_color_cycling=True, # Force cycling
+                 color=None,  # Let ColorManager cycle
+                 label=None,  # No label for distinct elements
+                 use_color_cycling=True,  # Force cycling
                  include_attrs=include_attrs,
-                 existing=existing
+                 existing=existing,
              )
-
-     def show(self,
-              # --- Visualization Parameters ---
-              group_by: Optional[str] = None,
-              label: Optional[str] = None,
-              color: Optional[Union[Tuple, str]] = None,
-              label_format: Optional[str] = None,
-              distinct: bool = False,
-              include_attrs: Optional[List[str]] = None,
-              # --- Rendering Parameters ---
-              scale: float = 2.0,
-              labels: bool = True, # Use 'labels' consistent with service
-              legend_position: str = 'right',
-              render_ocr: bool = False) -> Optional['Image.Image']:
-         """
-         Generates a temporary preview image highlighting elements in this collection
-         on their page, ignoring any persistent highlights.
-
-         Currently only supports collections where all elements are on the same page.
-
-         Allows grouping and coloring elements based on attributes, similar to the
-         persistent `highlight()` method, but only for this temporary view.
-
-         Args:
-             group_by: Attribute name to group elements by for distinct colors/labels.
-             label: Explicit label for all elements (overrides group_by).
-             color: Explicit color for all elements (if label used) or base color.
-             label_format: F-string to format group labels if group_by is used.
-             distinct: Highlight each element distinctly (overrides group_by/label).
-             include_attrs: Attributes to display on individual highlights.
-             scale: Scale factor for rendering image.
-             labels: Whether to include a legend for the temporary highlights.
-             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
-             render_ocr: Whether to render OCR text.
-
-         Returns:
-             PIL Image object of the temporary preview, or None if rendering fails or
-             elements span multiple pages.
-
-         Raises:
-             ValueError: If the collection is empty or elements are on different pages.
-         """
-         if not self._elements:
-             raise ValueError("Cannot show an empty collection.")
-
-         # Check if elements are on multiple pages
-         if self._are_on_multiple_pages():
-             raise ValueError("show() currently only supports collections where all elements are on the same page.")
-
-         # Get the page and highlighting service from the first element
-         first_element = self._elements[0]
-         if not hasattr(first_element, 'page') or not first_element.page:
-             logger.warning("Cannot show collection: First element has no associated page.")
-             return None
-         page = first_element.page
-         if not hasattr(page, 'pdf') or not page.pdf:
-             logger.warning("Cannot show collection: Page has no associated PDF object.")
-             return None
-
-         service = page._highlighter
-         if not service:
-             logger.warning("Cannot show collection: PDF object has no highlighting service.")
-             return None
-
-         # 1. Prepare temporary highlight data based on grouping parameters
-         # This returns a list of dicts, suitable for render_preview
-         highlight_data_list = self._prepare_highlight_data(
-             distinct=distinct,
-             label=label,
-             color=color,
-             group_by=group_by,
-             label_format=label_format,
-             include_attrs=include_attrs
-         )
-
-         if not highlight_data_list:
-             logger.warning("No highlight data generated for show(). Rendering clean page.")
-             # Render the page without any temporary highlights
-             highlight_data_list = []
-
-         # 2. Call render_preview on the HighlightingService
-         try:
-             return service.render_preview(
-                 page_index=page.index,
-                 temporary_highlights=highlight_data_list,
-                 scale=scale,
-                 labels=labels, # Use 'labels'
-                 legend_position=legend_position,
-                 render_ocr=render_ocr
-             )
-         except Exception as e:
-             logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
-             return None
-
-     def save(self,
-              filename: str,
-              scale: float = 2.0,
-              width: Optional[int] = None,
-              labels: bool = True,
-              legend_position: str = 'right',
-              render_ocr: bool = False) -> 'ElementCollection':
+
+     def show(
+         self,
+         # --- Visualization Parameters ---
+         group_by: Optional[str] = None,
+         label: Optional[str] = None,
+         color: Optional[Union[Tuple, str]] = None,
+         label_format: Optional[str] = None,
+         distinct: bool = False,
+         include_attrs: Optional[List[str]] = None,
+         # --- Rendering Parameters ---
+         scale: float = 2.0,
+         labels: bool = True,  # Use 'labels' consistent with service
+         legend_position: str = "right",
+         render_ocr: bool = False,
+     ) -> Optional["Image.Image"]:
+         """
+         Generates a temporary preview image highlighting elements in this collection
+         on their page, ignoring any persistent highlights.
+
+         Currently only supports collections where all elements are on the same page.
+
+         Allows grouping and coloring elements based on attributes, similar to the
+         persistent `highlight()` method, but only for this temporary view.
+
+         Args:
+             group_by: Attribute name to group elements by for distinct colors/labels.
+             label: Explicit label for all elements (overrides group_by).
+             color: Explicit color for all elements (if label used) or base color.
+             label_format: F-string to format group labels if group_by is used.
+             distinct: Highlight each element distinctly (overrides group_by/label).
+             include_attrs: Attributes to display on individual highlights.
+             scale: Scale factor for rendering image.
+             labels: Whether to include a legend for the temporary highlights.
+             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
+             render_ocr: Whether to render OCR text.
+
+         Returns:
+             PIL Image object of the temporary preview, or None if rendering fails or
+             elements span multiple pages.
+
+         Raises:
+             ValueError: If the collection is empty or elements are on different pages.
+         """
+         if not self._elements:
+             raise ValueError("Cannot show an empty collection.")
+
+         # Check if elements are on multiple pages
+         if self._are_on_multiple_pages():
+             raise ValueError(
+                 "show() currently only supports collections where all elements are on the same page."
+             )
+
+         # Get the page and highlighting service from the first element
+         first_element = self._elements[0]
+         if not hasattr(first_element, "page") or not first_element.page:
+             logger.warning("Cannot show collection: First element has no associated page.")
+             return None
+         page = first_element.page
+         if not hasattr(page, "pdf") or not page.pdf:
+             logger.warning("Cannot show collection: Page has no associated PDF object.")
+             return None
+
+         service = page._highlighter
+         if not service:
+             logger.warning("Cannot show collection: PDF object has no highlighting service.")
+             return None
+
+         # 1. Prepare temporary highlight data based on grouping parameters
+         # This returns a list of dicts, suitable for render_preview
+         highlight_data_list = self._prepare_highlight_data(
+             distinct=distinct,
+             label=label,
+             color=color,
+             group_by=group_by,
+             label_format=label_format,
+             include_attrs=include_attrs,
+         )
+
+         if not highlight_data_list:
+             logger.warning("No highlight data generated for show(). Rendering clean page.")
+             # Render the page without any temporary highlights
+             highlight_data_list = []
+
+         # 2. Call render_preview on the HighlightingService
+         try:
+             return service.render_preview(
+                 page_index=page.index,
+                 temporary_highlights=highlight_data_list,
+                 scale=scale,
+                 labels=labels,  # Use 'labels'
+                 legend_position=legend_position,
+                 render_ocr=render_ocr,
+             )
+         except Exception as e:
+             logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
+             return None
+
+     def save(
+         self,
+         filename: str,
+         scale: float = 2.0,
+         width: Optional[int] = None,
+         labels: bool = True,
+         legend_position: str = "right",
+         render_ocr: bool = False,
+     ) -> "ElementCollection":
          """
          Save the page with this collection's elements highlighted to an image file.
-
+
          Args:
              filename: Path to save the image to
              scale: Scale factor for rendering
@@ -706,32 +857,34 @@ class ElementCollection(Generic[T]):
              labels: Whether to include a legend for labels
              legend_position: Position of the legend
              render_ocr: Whether to render OCR text with white background boxes
-
+
          Returns:
              Self for method chaining
          """
          # Use to_image to generate and save the image
          self.to_image(
-             path=filename,
+             path=filename,
              scale=scale,
              width=width,
-             labels=labels,
+             labels=labels,
              legend_position=legend_position,
-             render_ocr=render_ocr
+             render_ocr=render_ocr,
          )
          return self
-
-     def to_image(self,
-                  path: Optional[str] = None,
-                  scale: float = 2.0,
-                  width: Optional[int] = None,
-                  labels: bool = True,
-                  legend_position: str = 'right',
-                  render_ocr: bool = False) -> Optional['Image.Image']:
+
+     def to_image(
+         self,
+         path: Optional[str] = None,
+         scale: float = 2.0,
+         width: Optional[int] = None,
+         labels: bool = True,
+         legend_position: str = "right",
+         render_ocr: bool = False,
+     ) -> Optional["Image.Image"]:
          """
          Generate an image of the page with this collection's elements highlighted,
          optionally saving it to a file.
-
+
          Args:
              path: Optional path to save the image to
              scale: Scale factor for rendering
@@ -739,21 +892,21 @@ class ElementCollection(Generic[T]):
              labels: Whether to include a legend for labels
              legend_position: Position of the legend
              render_ocr: Whether to render OCR text with white background boxes
-
+
          Returns:
              PIL Image of the page with elements highlighted, or None if no valid page
          """
          # Get the page from the first element (if available)
-         if self._elements and hasattr(self._elements[0], 'page'):
+         if self._elements and hasattr(self._elements[0], "page"):
              page = self._elements[0].page
              # Generate the image using to_image
              return page.to_image(
-                 path=path,
+                 path=path,
                  scale=scale,
                  width=width,
-                 labels=labels,
+                 labels=labels,
                  legend_position=legend_position,
-                 render_ocr=render_ocr
+                 render_ocr=render_ocr,
              )
          return None

@@ -763,7 +916,7 @@ class ElementCollection(Generic[T]):
          for element in self._elements:
              try:
                  group_key = getattr(element, group_by, None)
-                 if group_key is None: # Handle elements missing the attribute
+                 if group_key is None:  # Handle elements missing the attribute
                      group_key = f"Missing '{group_by}'"
                  # Ensure group_key is hashable (convert list/dict if necessary)
                  if isinstance(group_key, (list, dict)):
@@ -773,13 +926,17 @@ class ElementCollection(Generic[T]):
                      grouped_elements[group_key] = []
                  grouped_elements[group_key].append(element)
              except AttributeError:
-                 logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
+                 logger.warning(
+                     f"Attribute '{group_by}' not found on element {element}. Skipping grouping."
+                 )
                  group_key = f"Error accessing '{group_by}'"
                  if group_key not in grouped_elements:
                      grouped_elements[group_key] = []
                  grouped_elements[group_key].append(element)
-             except TypeError: # Handle unhashable types
-                 logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
+             except TypeError:  # Handle unhashable types
+                 logger.warning(
+                     f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation."
+                 )
                  group_key = str(group_key)
                  if group_key not in grouped_elements:
                      grouped_elements[group_key] = []
@@ -787,48 +944,61 @@ class ElementCollection(Generic[T]):

          return grouped_elements

-     def _format_group_label(self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str) -> str:
+     def _format_group_label(
+         self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str
+     ) -> str:
          """Formats the label for a group based on the key and format string."""
          if label_format:
              try:
                  element_attrs = sample_element.__dict__.copy()
-                 element_attrs[group_by_attr] = group_key # Ensure key is present
+                 element_attrs[group_by_attr] = group_key  # Ensure key is present
                  return label_format.format(**element_attrs)
              except KeyError as e:
-                 logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
+                 logger.warning(
+                     f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label."
+                 )
                  return str(group_key)
              except Exception as format_e:
-                 logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
+                 logger.warning(
+                     f"Error formatting label '{label_format}': {format_e}. Using group key as label."
+                 )
                  return str(group_key)
          else:
              return str(group_key)

-     def _get_element_highlight_params(self, element: T, include_attrs: Optional[List[str]]) -> Optional[Dict]:
+     def _get_element_highlight_params(
+         self, element: T, include_attrs: Optional[List[str]]
+     ) -> Optional[Dict]:
          """Extracts common parameters needed for highlighting a single element."""
-         if not hasattr(element, 'page'): return None
+         if not hasattr(element, "page"):
+             return None
          page = element.page

          base_data = {
-             'page_index': page.index,
-             'element': element,
-             'include_attrs': include_attrs,
-             'attributes_to_draw': {},
-             'bbox': None,
-             'polygon': None
+             "page_index": page.index,
+             "element": element,
+             "include_attrs": include_attrs,
+             "attributes_to_draw": {},
+             "bbox": None,
+             "polygon": None,
          }

          # Extract geometry
-         is_polygon = getattr(element, 'has_polygon', False)
+         is_polygon = getattr(element, "has_polygon", False)
          geom_data = None
          if is_polygon:
-             geom_data = getattr(element, 'polygon', None)
-             if geom_data: base_data['polygon'] = geom_data
+             geom_data = getattr(element, "polygon", None)
+             if geom_data:
+                 base_data["polygon"] = geom_data
          else:
-             geom_data = getattr(element, 'bbox', None)
-             if geom_data: base_data['bbox'] = geom_data
+             geom_data = getattr(element, "bbox", None)
+             if geom_data:
+                 base_data["bbox"] = geom_data

          if not geom_data:
-             logger.warning(f"Cannot prepare highlight, no bbox or polygon found for element: {element}")
+             logger.warning(
+                 f"Cannot prepare highlight, no bbox or polygon found for element: {element}"
+             )
              return None

          # Extract attributes if requested
@@ -837,13 +1007,15 @@ class ElementCollection(Generic[T]):
                  try:
                      attr_value = getattr(element, attr_name, None)
                      if attr_value is not None:
-                         base_data['attributes_to_draw'][attr_name] = attr_value
+                         base_data["attributes_to_draw"][attr_name] = attr_value
                  except AttributeError:
-                     logger.warning(f"Attribute '{attr_name}' not found on element {element} for include_attrs")
+                     logger.warning(
+                         f"Attribute '{attr_name}' not found on element {element} for include_attrs"
+                     )

          return base_data

-     def viewer(self, title: Optional[str] = None) -> Optional['widgets.DOMWidget']:
+     def viewer(self, title: Optional[str] = None) -> Optional["widgets.DOMWidget"]:
          """
          Creates and returns an interactive ipywidget showing ONLY the elements
          in this collection on their page background.
@@ -862,28 +1034,36 @@ class ElementCollection(Generic[T]):
862
1034
  try:
863
1035
  page = self.elements[0].page
864
1036
  # Check if the page object actually has the method
865
- if hasattr(page, 'viewer') and callable(page.viewer):
866
- final_title = title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
1037
+ if hasattr(page, "viewer") and callable(page.viewer):
1038
+ final_title = (
1039
+ title or f"Interactive Viewer for Collection ({len(self.elements)} elements)"
1040
+ )
867
1041
  # Call the page method, passing this collection's elements
868
1042
  return page.viewer(
869
1043
  elements_to_render=self.elements,
870
- title=final_title # Pass title if Page method accepts it
1044
+ title=final_title, # Pass title if Page method accepts it
871
1045
  )
872
1046
  else:
873
- logger.error("Page object is missing the 'viewer' method.")
874
- return None
1047
+ logger.error("Page object is missing the 'viewer' method.")
1048
+ return None
875
1049
  except AttributeError:
876
- logger.error("Cannot generate interactive viewer: Elements in collection lack 'page' attribute.")
1050
+ logger.error(
1051
+ "Cannot generate interactive viewer: Elements in collection lack 'page' attribute."
1052
+ )
877
1053
  return None
878
1054
  except IndexError:
879
- # Should be caught by the empty check, but just in case
880
- logger.error("Cannot generate interactive viewer: Collection unexpectedly became empty.")
881
- return None
1055
+ # Should be caught by the empty check, but just in case
1056
+ logger.error(
1057
+ "Cannot generate interactive viewer: Collection unexpectedly became empty."
1058
+ )
1059
+ return None
882
1060
  except Exception as e:
883
- logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
884
- return None
1061
+ logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
1062
+ return None
885
1063
 
886
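A minimal usage sketch for this collection viewer in a notebook. The PDF path and selector are illustrative assumptions, and the optional interactive-widget dependencies (ipywidgets) must be installed; otherwise the method logs an error and returns None as shown above.

```python
from natural_pdf import PDF

pdf = PDF("example.pdf")  # illustrative path
page = pdf.pages[0]

# Collect some elements, then view just those on the page background.
headings = page.find_all('text[size>=14]')  # selector is an assumption about the document
widget = headings.viewer(title="Possible headings")
widget  # display in Jupyter; None if viewer support is unavailable
```
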
-     def find_all(self, selector: str, regex: bool = False, case: bool = True, **kwargs) -> 'ElementCollection[T]':
+     def find_all(
+         self, selector: str, regex: bool = False, case: bool = True, **kwargs
+     ) -> "ElementCollection[T]":
          """
          Filter elements within this collection matching the selector.

@@ -903,21 +1083,21 @@ class ElementCollection(Generic[T]):
              selector_obj = parse_selector(selector)
          except Exception as e:
              logger.error(f"Error parsing selector '{selector}': {e}")
-             return ElementCollection([]) # Return empty on parse error
+             return ElementCollection([])  # Return empty on parse error

          # Pass regex and case flags to selector function generator
-         kwargs['regex'] = regex
-         kwargs['case'] = case
+         kwargs["regex"] = regex
+         kwargs["case"] = case

          try:
              filter_func = selector_to_filter_func(selector_obj, **kwargs)
          except Exception as e:
              logger.error(f"Error creating filter function for selector '{selector}': {e}")
-             return ElementCollection([]) # Return empty on filter creation error
+             return ElementCollection([])  # Return empty on filter creation error

          matching_elements = [element for element in self._elements if filter_func(element)]

-         # Note: Unlike Page.find_all, this doesn't re-sort.
+         # Note: Unlike Page.find_all, this doesn't re-sort.
          # Sorting should be done explicitly on the collection if needed.

          return ElementCollection(matching_elements)
@@ -938,65 +1118,63 @@ class ElementCollection(Generic[T]):
          results = self.find_all(selector, regex=regex, case=case, **kwargs)
          return results.first

+
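A small sketch of how this collection-level filtering is typically chained; note the comment in the diff that results are not re-sorted, unlike Page.find_all. The path and selectors are illustrative assumptions about a document.

```python
from natural_pdf import PDF

pdf = PDF("report.pdf")  # illustrative path
words = pdf.pages[0].find_all("text")

# Narrow an existing collection without going back to the page;
# the regex/case flags are forwarded into the selector filter.
matches = words.find_all('text:contains("total")', regex=False, case=False)
first = words.find('text:contains("Total")')
print(len(matches), first)
```
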
  class PageCollection(Generic[P]):
      """
      A collection of PDF pages with cross-page operations.
-
+
      This class provides methods for working with multiple pages, such as finding
      elements across pages, extracting text from page ranges, and more.
      """
-
+
      def __init__(self, pages: List[P]):
          """
          Initialize a page collection.
-
+
          Args:
              pages: List of Page objects
          """
          self.pages = pages
-
+
      def __len__(self) -> int:
          """Return the number of pages in the collection."""
          return len(self.pages)
-
-     def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
+
+     def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
          """Support indexing and slicing."""
          if isinstance(idx, slice):
              return PageCollection(self.pages[idx])
          return self.pages[idx]
-
+
      def __iter__(self) -> Iterator[P]:
          """Support iteration."""
          return iter(self.pages)
-
+
      def __repr__(self) -> str:
          """Return a string representation showing the page count."""
          return f"<PageCollection(count={len(self)})>"
-
+
      def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
          """
          Extract text from all pages in the collection.
-
+
          Args:
              keep_blank_chars: Whether to keep blank characters (default: True)
              apply_exclusions: Whether to apply exclusion regions (default: True)
              **kwargs: Additional extraction parameters
-
+
          Returns:
              Combined text from all pages
          """
          texts = []
          for page in self.pages:
              text = page.extract_text(
-                 keep_blank_chars=keep_blank_chars,
-                 apply_exclusions=apply_exclusions,
-                 **kwargs
+                 keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
              )
              texts.append(text)
-
+
          return "\n".join(texts)

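A quick sketch of pulling text from a page range through this API. It assumes `pdf.pages` exposes the sliceable PageCollection defined here; the file name is illustrative.

```python
from natural_pdf import PDF

pdf = PDF("annual-report.pdf")  # illustrative path

# Slicing returns another PageCollection, so a page range can be read in one call.
intro = pdf.pages[0:3]
text = intro.extract_text(apply_exclusions=True)
print(f"{len(intro)} pages, {len(text)} characters")
```
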
-     # --- NEW METHOD ---
      def apply_ocr(
          self,
          engine: Optional[str] = None,
@@ -1004,8 +1182,7 @@ class PageCollection(Generic[P]):
          languages: Optional[List[str]] = None,
          min_confidence: Optional[float] = None,
          device: Optional[str] = None,
-         # Add other simple mode args if needed
-     ) -> 'PageCollection[P]':
+     ) -> "PageCollection[P]":
          """
          Applies OCR to all pages within this collection using batch processing.

@@ -1036,14 +1213,14 @@ class PageCollection(Generic[P]):

          # Assume all pages share the same parent PDF object
          first_page = self.pages[0]
-         if not hasattr(first_page, '_parent') or not first_page._parent:
+         if not hasattr(first_page, "_parent") or not first_page._parent:
              raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

          parent_pdf = first_page._parent

          # Updated check for renamed method
-         if not hasattr(parent_pdf, 'apply_ocr') or not callable(parent_pdf.apply_ocr):
-             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
+         if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
+             raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")

          # Get the 0-based indices of the pages in this collection
          page_indices = [p.index for p in self.pages]
@@ -1057,22 +1234,22 @@ class PageCollection(Generic[P]):
              options=options,
              languages=languages,
              min_confidence=min_confidence,
-             device=device
+             device=device,
              # Pass any other relevant simple_kwargs here if added
          )
          # The PDF method modifies the Page objects directly by adding elements.

-         return self # Return self for chaining
+         return self  # Return self for chaining

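A usage sketch for this batch OCR entry point. It assumes the pages come from a PDF opened with this library and that an OCR engine extra (for example EasyOCR) is installed; the file name is illustrative.

```python
from natural_pdf import PDF

pdf = PDF("scanned.pdf")  # illustrative path

# OCR a sub-range of pages; the call returns the same collection for chaining.
pages = pdf.pages[0:5]
pages.apply_ocr(engine="easyocr", languages=["en"], min_confidence=0.5)

# OCR-produced text elements are added to the pages in place.
print(pages.extract_text()[:300])
```
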
      def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
          """
          Find the first element matching the selector across all pages.
-
+
          Args:
              selector: CSS-like selector string
              apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
              **kwargs: Additional filter parameters
-
+
          Returns:
              First matching element or None
          """
@@ -1081,16 +1258,16 @@ class PageCollection(Generic[P]):
              if element:
                  return element
          return None
-
+
      def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
          """
          Find all elements matching the selector across all pages.
-
+
          Args:
              selector: CSS-like selector string
              apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
              **kwargs: Additional filter parameters
-
+
          Returns:
              ElementCollection with matching elements from all pages
          """
@@ -1099,57 +1276,59 @@ class PageCollection(Generic[P]):
              elements = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
              if elements:
                  all_elements.extend(elements.elements)
-
+
          return ElementCollection(all_elements)
-
+
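For context, a sketch of the cross-page search these two methods provide. The selectors and path are illustrative assumptions about a document.

```python
from natural_pdf import PDF

pdf = PDF("filings.pdf")  # illustrative path
pages = pdf.pages[0:10]

# First match anywhere in the range, or every match pooled into one collection.
first_heading = pages.find('text[size>=12]')
all_amounts = pages.find_all('text:contains("$")')
print(first_heading, len(all_amounts))
```
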
      # def debug_ocr(self, output_path):
      #     """
      #     Generate an interactive HTML debug report for OCR results.
-
+
      #     This creates a single-file HTML report with:
      #     - Side-by-side view of image regions and OCR text
      #     - Confidence scores with color coding
      #     - Editable correction fields
      #     - Filtering and sorting options
      #     - Export functionality for corrected text
-
+
      #     Args:
      #         output_path: Path to save the HTML report
-
+
      #     Returns:
      #         Path to the generated HTML file
      #     """
      #     from natural_pdf.utils.ocr import debug_ocr_to_html
      #     return debug_ocr_to_html(self.pages, output_path)
-
-     def get_sections(self,
-                      start_elements=None,
-                      end_elements=None,
-                      new_section_on_page_break=False,
-                      boundary_inclusion='both') -> List['Region']:
+
+     def get_sections(
+         self,
+         start_elements=None,
+         end_elements=None,
+         new_section_on_page_break=False,
+         boundary_inclusion="both",
+     ) -> List["Region"]:
          """
          Extract sections from a page collection based on start/end elements.
-
+
          Args:
              start_elements: Elements or selector string that mark the start of sections
              end_elements: Elements or selector string that mark the end of sections
              new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
              boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
-
+
          Returns:
              List of Region objects representing the extracted sections
          """
          # Find start and end elements across all pages
          if isinstance(start_elements, str):
              start_elements = self.find_all(start_elements).elements
-
+
          if isinstance(end_elements, str):
              end_elements = self.find_all(end_elements).elements
-
+
          # If no start elements, return empty list
          if not start_elements:
              return []
-
+
          # If there are page break boundaries, we'll need to add them
          if new_section_on_page_break:
              # For each page boundary, create virtual "end" and "start" elements
@@ -1159,183 +1338,200 @@ class PageCollection(Generic[P]):
                  # If end_elements is None, initialize it as an empty list
                  if end_elements is None:
                      end_elements = []
-
+
                  # Create a region at the bottom of the page as an artificial end marker
                  from natural_pdf.elements.region import Region
+
                  bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
                  bottom_region.is_page_boundary = True  # Mark it as a special boundary
                  end_elements.append(bottom_region)
-
+
                  # Add a virtual "start" element at the top of the next page
                  next_page = self.pages[i + 1]
                  top_region = Region(next_page, (0, 0, next_page.width, 1))
                  top_region.is_page_boundary = True  # Mark it as a special boundary
                  start_elements.append(top_region)
-
+
          # Get all elements from all pages and sort them in document order
          all_elements = []
          for page in self.pages:
              elements = page.get_elements()
              all_elements.extend(elements)
-
+
          # Sort by page index, then vertical position, then horizontal position
          all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
-
+
          # Mark section boundaries
          section_boundaries = []
-
+
          # Add start element boundaries
          for element in start_elements:
              if element in all_elements:
                  idx = all_elements.index(element)
-                 section_boundaries.append({
-                     'index': idx,
-                     'element': element,
-                     'type': 'start',
-                     'page_idx': element.page.index
-                 })
-             elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+                 section_boundaries.append(
+                     {
+                         "index": idx,
+                         "element": element,
+                         "type": "start",
+                         "page_idx": element.page.index,
+                     }
+                 )
+             elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                  # This is a virtual page boundary element
-                 section_boundaries.append({
-                     'index': -1,  # Special index for page boundaries
-                     'element': element,
-                     'type': 'start',
-                     'page_idx': element.page.index
-                 })
-
+                 section_boundaries.append(
+                     {
+                         "index": -1,  # Special index for page boundaries
+                         "element": element,
+                         "type": "start",
+                         "page_idx": element.page.index,
+                     }
+                 )
+
          # Add end element boundaries if provided
          if end_elements:
              for element in end_elements:
                  if element in all_elements:
                      idx = all_elements.index(element)
-                     section_boundaries.append({
-                         'index': idx,
-                         'element': element,
-                         'type': 'end',
-                         'page_idx': element.page.index
-                     })
-                 elif hasattr(element, 'is_page_boundary') and element.is_page_boundary:
+                     section_boundaries.append(
+                         {
+                             "index": idx,
+                             "element": element,
+                             "type": "end",
+                             "page_idx": element.page.index,
+                         }
+                     )
+                 elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
                      # This is a virtual page boundary element
-                     section_boundaries.append({
-                         'index': -1,  # Special index for page boundaries
-                         'element': element,
-                         'type': 'end',
-                         'page_idx': element.page.index
-                     })
-
+                     section_boundaries.append(
+                         {
+                             "index": -1,  # Special index for page boundaries
+                             "element": element,
+                             "type": "end",
+                             "page_idx": element.page.index,
+                         }
+                     )
+
          # Sort boundaries by page index, then by actual document position
-         section_boundaries.sort(key=lambda x: (x['page_idx'],
-                                                x['index'] if x['index'] != -1 else
-                                                (0 if x['type'] == 'start' else float('inf'))))
-
+         section_boundaries.sort(
+             key=lambda x: (
+                 x["page_idx"],
+                 x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
+             )
+         )
+
          # Generate sections
          sections = []
          current_start = None
-
+
          for i, boundary in enumerate(section_boundaries):
              # If it's a start boundary and we don't have a current start
-             if boundary['type'] == 'start' and current_start is None:
+             if boundary["type"] == "start" and current_start is None:
                  current_start = boundary
-
+
              # If it's an end boundary and we have a current start
-             elif boundary['type'] == 'end' and current_start is not None:
+             elif boundary["type"] == "end" and current_start is not None:
                  # Create a section from current_start to this boundary
-                 start_element = current_start['element']
-                 end_element = boundary['element']
-
+                 start_element = current_start["element"]
+                 end_element = boundary["element"]
+
                  # If both elements are on the same page, use the page's get_section_between
                  if start_element.page == end_element.page:
                      section = start_element.page.get_section_between(
-                         start_element,
-                         end_element,
-                         boundary_inclusion
+                         start_element, end_element, boundary_inclusion
                      )
                      sections.append(section)
                  else:
                      # Create a multi-page section
                      from natural_pdf.elements.region import Region
-
+
                      # Get the start and end pages
                      start_page = start_element.page
                      end_page = end_element.page
-
+
                      # Create a combined region
                      combined_region = Region(
-                         start_page,
-                         (0, start_element.top, start_page.width, start_page.height)
+                         start_page, (0, start_element.top, start_page.width, start_page.height)
                      )
                      combined_region._spans_pages = True
                      combined_region._page_range = (start_page.index, end_page.index)
                      combined_region.start_element = start_element
                      combined_region.end_element = end_element
-
+
                      # Get all elements that fall within this multi-page region
                      combined_elements = []
-
+
                      # Get elements from the first page
-                     first_page_elements = [e for e in all_elements
-                                            if e.page == start_page and e.top >= start_element.top]
+                     first_page_elements = [
+                         e
+                         for e in all_elements
+                         if e.page == start_page and e.top >= start_element.top
+                     ]
                      combined_elements.extend(first_page_elements)
-
+
                      # Get elements from middle pages (if any)
                      for page_idx in range(start_page.index + 1, end_page.index):
                          middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
                          combined_elements.extend(middle_page_elements)
-
+
                      # Get elements from the last page
-                     last_page_elements = [e for e in all_elements
-                                           if e.page == end_page and e.bottom <= end_element.bottom]
+                     last_page_elements = [
+                         e
+                         for e in all_elements
+                         if e.page == end_page and e.bottom <= end_element.bottom
+                     ]
                      combined_elements.extend(last_page_elements)
-
+
                      # Store the elements in the combined region
                      combined_region._multi_page_elements = combined_elements
-
+
                      sections.append(combined_region)
-
+
                  current_start = None
-
+
              # If it's another start boundary and we have a current start (for splitting by starts only)
-             elif boundary['type'] == 'start' and current_start is not None and not end_elements:
+             elif boundary["type"] == "start" and current_start is not None and not end_elements:
                  # Create a section from current_start to just before this boundary
-                 start_element = current_start['element']
-
+                 start_element = current_start["element"]
+
                  # Find the last element before this boundary on the same page
-                 if start_element.page == boundary['element'].page:
+                 if start_element.page == boundary["element"].page:
                      # Find elements on this page
                      page_elements = [e for e in all_elements if e.page == start_element.page]
                      # Sort by position
                      page_elements.sort(key=lambda e: (e.top, e.x0))
-
+
                      # Find the last element before the boundary
-                     end_idx = page_elements.index(boundary['element']) - 1 if boundary['element'] in page_elements else -1
+                     end_idx = (
+                         page_elements.index(boundary["element"]) - 1
+                         if boundary["element"] in page_elements
+                         else -1
+                     )
                      end_element = page_elements[end_idx] if end_idx >= 0 else None
-
+
                      # Create the section
                      section = start_element.page.get_section_between(
-                         start_element,
-                         end_element,
-                         boundary_inclusion
+                         start_element, end_element, boundary_inclusion
                      )
                      sections.append(section)
                  else:
                      # Cross-page section - create from current_start to the end of its page
                      from natural_pdf.elements.region import Region
+
                      start_page = start_element.page
-
+
                      region = Region(
-                         start_page,
-                         (0, start_element.top, start_page.width, start_page.height)
+                         start_page, (0, start_element.top, start_page.width, start_page.height)
                      )
                      region.start_element = start_element
                      sections.append(region)
-
+
                  current_start = boundary
-
+
          # Handle the last section if we have a current start
          if current_start is not None:
-             start_element = current_start['element']
+             start_element = current_start["element"]
              start_page = start_element.page
-
+
              if end_elements:
                  # With end_elements, we need an explicit end - use the last element
                  # on the last page of the collection
@@ -1343,59 +1539,63 @@ class PageCollection(Generic[P]):
                  last_page_elements = [e for e in all_elements if e.page == last_page]
                  last_page_elements.sort(key=lambda e: (e.top, e.x0))
                  end_element = last_page_elements[-1] if last_page_elements else None
-
+
                  # Create a multi-page section
                  from natural_pdf.elements.region import Region
-
+
                  if start_page == last_page:
                      # Simple case - both on same page
                      section = start_page.get_section_between(
-                         start_element,
-                         end_element,
-                         boundary_inclusion
+                         start_element, end_element, boundary_inclusion
                      )
                      sections.append(section)
                  else:
                      # Create a multi-page section
                      combined_region = Region(
-                         start_page,
-                         (0, start_element.top, start_page.width, start_page.height)
+                         start_page, (0, start_element.top, start_page.width, start_page.height)
                      )
                      combined_region._spans_pages = True
                      combined_region._page_range = (start_page.index, last_page.index)
                      combined_region.start_element = start_element
                      combined_region.end_element = end_element
-
+
                      # Get all elements that fall within this multi-page region
                      combined_elements = []
-
+
                      # Get elements from the first page
-                     first_page_elements = [e for e in all_elements
-                                            if e.page == start_page and e.top >= start_element.top]
+                     first_page_elements = [
+                         e
+                         for e in all_elements
+                         if e.page == start_page and e.top >= start_element.top
+                     ]
                      combined_elements.extend(first_page_elements)
-
+
                      # Get elements from middle pages (if any)
                      for page_idx in range(start_page.index + 1, last_page.index):
                          middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
                          combined_elements.extend(middle_page_elements)
-
+
                      # Get elements from the last page
-                     last_page_elements = [e for e in all_elements
-                                           if e.page == last_page and (end_element is None or e.bottom <= end_element.bottom)]
+                     last_page_elements = [
+                         e
+                         for e in all_elements
+                         if e.page == last_page
+                         and (end_element is None or e.bottom <= end_element.bottom)
+                     ]
                      combined_elements.extend(last_page_elements)
-
+
                      # Store the elements in the combined region
                      combined_region._multi_page_elements = combined_elements
-
+
                      sections.append(combined_region)
              else:
                  # With start_elements only, create a section to the end of the current page
                  from natural_pdf.elements.region import Region
+
                  region = Region(
-                     start_page,
-                     (0, start_element.top, start_page.width, start_page.height)
+                     start_page, (0, start_element.top, start_page.width, start_page.height)
                  )
                  region.start_element = start_element
                  sections.append(region)
-
-         return sections
+
+         return sections
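
Finally, a sketch of how this section extraction is typically driven from selectors. It assumes `pdf.pages` is the PageCollection defined here; the heading selector and file name are assumptions about a particular document, not fixed API values.

```python
from natural_pdf import PDF

pdf = PDF("inspection-report.pdf")  # illustrative path

# Split the document wherever a large heading starts a new section;
# each returned Region can have its text extracted or be rendered.
sections = pdf.pages.get_sections(
    start_elements='text[size>=14]',  # selector is an assumption about the layout
    boundary_inclusion="start",
)
for section in sections:
    print(section.extract_text()[:80])
```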