natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52):
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1345 @@
1
+ import logging
2
+
3
+ from typing import List, Optional, Dict, Any, Union, Callable, TypeVar, Generic, Iterator, Tuple, TYPE_CHECKING
4
+ from natural_pdf.ocr import OCROptions
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ if TYPE_CHECKING:
9
+ from natural_pdf.core.page import Page
10
+ from natural_pdf.elements.region import Region
11
+
12
+ T = TypeVar('T')
13
+ P = TypeVar('P', bound='Page')
14
+
15
+ class ElementCollection(Generic[T]):
16
+ """
17
+ Collection of PDF elements with batch operations.
18
+ """
19
+
20
+ def __init__(self, elements: List[T]):
21
+ """
22
+ Initialize a collection of elements.
23
+
24
+ Args:
25
+ elements: List of Element objects
26
+ """
27
+ self._elements = elements or []
28
+
29
+ def __len__(self) -> int:
30
+ """Get the number of elements in the collection."""
31
+ return len(self._elements)
32
+
33
+ def __getitem__(self, index: int) -> 'Element':
34
+ """Get an element by index."""
35
+ return self._elements[index]
36
+
37
+ def __iter__(self):
38
+ """Iterate over elements."""
39
+ return iter(self._elements)
40
+
41
+ def __repr__(self) -> str:
42
+ """Return a string representation showing the element count."""
43
+ element_type = "Mixed"
44
+ if self._elements:
45
+ types = set(type(el).__name__ for el in self._elements)
46
+ if len(types) == 1:
47
+ element_type = types.pop()
48
+ return f"<ElementCollection[{element_type}](count={len(self)})>"
49
+
50
+ @property
51
+ def elements(self) -> List['Element']:
52
+ """Get the elements in this collection."""
53
+ return self._elements
54
+
55
+ @property
56
+ def first(self) -> Optional['Element']:
57
+ """Get the first element in the collection."""
58
+ return self._elements[0] if self._elements else None
59
+
60
+ @property
61
+ def last(self) -> Optional['Element']:
62
+ """Get the last element in the collection."""
63
+ return self._elements[-1] if self._elements else None
64
+
65
+ def highest(self) -> Optional['Element']:
66
+ """
67
+ Get element with the smallest top y-coordinate (highest on page).
68
+
69
+ Raises:
70
+ ValueError: If elements are on multiple pages
71
+
72
+ Returns:
73
+ Element with smallest top value or None if empty
74
+ """
75
+ if not self._elements:
76
+ return None
77
+
78
+ # Check if elements are on multiple pages
79
+ if self._are_on_multiple_pages():
80
+ raise ValueError("Cannot determine highest element across multiple pages")
81
+
82
+ return min(self._elements, key=lambda e: e.top)
83
+
84
+ def lowest(self) -> Optional['Element']:
85
+ """
86
+ Get element with the largest bottom y-coordinate (lowest on page).
87
+
88
+ Raises:
89
+ ValueError: If elements are on multiple pages
90
+
91
+ Returns:
92
+ Element with largest bottom value or None if empty
93
+ """
94
+ if not self._elements:
95
+ return None
96
+
97
+ # Check if elements are on multiple pages
98
+ if self._are_on_multiple_pages():
99
+ raise ValueError("Cannot determine lowest element across multiple pages")
100
+
101
+ return max(self._elements, key=lambda e: e.bottom)
102
+
103
+ def leftmost(self) -> Optional['Element']:
104
+ """
105
+ Get element with the smallest x0 coordinate (leftmost on page).
106
+
107
+ Raises:
108
+ ValueError: If elements are on multiple pages
109
+
110
+ Returns:
111
+ Element with smallest x0 value or None if empty
112
+ """
113
+ if not self._elements:
114
+ return None
115
+
116
+ # Check if elements are on multiple pages
117
+ if self._are_on_multiple_pages():
118
+ raise ValueError("Cannot determine leftmost element across multiple pages")
119
+
120
+ return min(self._elements, key=lambda e: e.x0)
121
+
122
+ def rightmost(self) -> Optional['Element']:
123
+ """
124
+ Get element with the largest x1 coordinate (rightmost on page).
125
+
126
+ Raises:
127
+ ValueError: If elements are on multiple pages
128
+
129
+ Returns:
130
+ Element with largest x1 value or None if empty
131
+ """
132
+ if not self._elements:
133
+ return None
134
+
135
+ # Check if elements are on multiple pages
136
+ if self._are_on_multiple_pages():
137
+ raise ValueError("Cannot determine rightmost element across multiple pages")
138
+
139
+ return max(self._elements, key=lambda e: e.x1)
140
+
141
+ def _are_on_multiple_pages(self) -> bool:
142
+ """
143
+ Check if elements in this collection span multiple pages.
144
+
145
+ Returns:
146
+ True if elements are on different pages, False otherwise
147
+ """
148
+ if not self._elements:
149
+ return False
150
+
151
+ # Get the page index of the first element
152
+ if not hasattr(self._elements[0], 'page'):
153
+ return False
154
+
155
+ first_page_idx = self._elements[0].page.index
156
+
157
+ # Check if any element is on a different page
158
+ return any(hasattr(e, 'page') and e.page.index != first_page_idx for e in self._elements)
159
+
160
+ def exclude_regions(self, regions: List['Region']) -> 'ElementCollection':
161
+ """
162
+ Remove elements that are within any of the specified regions.
163
+
164
+ Args:
165
+ regions: List of Region objects to exclude
166
+
167
+ Returns:
168
+ New ElementCollection with filtered elements
169
+ """
170
+ if not regions:
171
+ return ElementCollection(self._elements)
172
+
173
+ filtered = []
174
+ for element in self._elements:
175
+ exclude = False
176
+ for region in regions:
177
+ if region._is_element_in_region(element):
178
+ exclude = True
179
+ break
180
+ if not exclude:
181
+ filtered.append(element)
182
+
183
+ return ElementCollection(filtered)
184
+
185
+ def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
186
+ """
187
+ Extract text from all elements in the collection.
188
+
189
+ Args:
190
+ preserve_whitespace: Whether to keep blank characters (default: True)
191
+ use_exclusions: Whether to apply exclusion regions (default: True)
192
+ **kwargs: Additional extraction parameters
193
+
194
+ Returns:
195
+ Combined text from all elements
196
+ """
197
+ # Filter to just text-like elements
198
+ text_elements = [e for e in self._elements if hasattr(e, 'extract_text')]
199
+
200
+ # Sort elements in reading order (top-to-bottom, left-to-right)
201
+ sorted_elements = sorted(text_elements, key=lambda e: (e.top, e.x0))
202
+
203
+ # Extract text from each element
204
+ texts = []
205
+ for element in sorted_elements:
206
+ # Extract text with new parameter names
207
+ text = element.extract_text(preserve_whitespace=preserve_whitespace, use_exclusions=use_exclusions, **kwargs)
208
+
209
+ if text:
210
+ texts.append(text)
211
+
212
+ return " ".join(texts)
213
+
214
+ def filter(self, func: Callable[['Element'], bool]) -> 'ElementCollection':
215
+ """
216
+ Filter elements using a function.
217
+
218
+ Args:
219
+ func: Function that takes an element and returns True to keep it
220
+
221
+ Returns:
222
+ New ElementCollection with filtered elements
223
+ """
224
+ return ElementCollection([e for e in self._elements if func(e)])
225
+
226
+ def sort(self, key=None, reverse=False) -> 'ElementCollection':
227
+ """
228
+ Sort elements by the given key function.
229
+
230
+ Args:
231
+ key: Function to generate a key for sorting
232
+ reverse: Whether to sort in descending order
233
+
234
+ Returns:
235
+ Self for method chaining
236
+ """
237
+ self._elements.sort(key=key, reverse=reverse)
238
+ return self
239
+
240
+ def highlight(self,
241
+ label: Optional[str] = None,
242
+ color: Optional[Union[Tuple, str]] = None,
243
+ group_by: Optional[str] = None,
244
+ label_format: Optional[str] = None,
245
+ distinct: bool = False,
246
+ include_attrs: Optional[List[str]] = None,
247
+ replace: bool = False) -> 'ElementCollection':
248
+ """
249
+ Adds persistent highlights for all elements in the collection to the page
250
+ via the HighlightingService.
251
+
252
+ By default, this APPENDS highlights to any existing ones on the page.
253
+ To replace existing highlights, set `replace=True`.
254
+
255
+ Uses grouping logic based on parameters (defaulting to grouping by type).
256
+
257
+ Args:
258
+ label: Optional explicit label for the entire collection. If provided,
259
+ all elements are highlighted as a single group with this label,
260
+ ignoring 'group_by' and the default type-based grouping.
261
+ color: Optional explicit color for the highlight (tuple/string). Applied
262
+ consistently if 'label' is provided or if grouping occurs.
263
+ group_by: Optional attribute name present on the elements. If provided
264
+ (and 'label' is None), elements will be grouped based on the
265
+ value of this attribute, and each group will be highlighted
266
+ with a distinct label and color.
267
+ label_format: Optional Python f-string to format the group label when
268
+ 'group_by' is used. Can reference element attributes
269
+ (e.g., "Type: {region_type}, Conf: {confidence:.2f}").
270
+ If None, the attribute value itself is used as the label.
271
+ distinct: If True, bypasses all grouping and highlights each element
272
+ individually with cycling colors (the previous default behavior).
273
+ (default: False)
274
+ include_attrs: List of attribute names from the element to display directly
275
+ on the highlight itself (distinct from group label).
276
+ replace: If True, existing highlights on the affected page(s)
277
+ are cleared before adding these highlights.
278
+ If False (default), highlights are appended to existing ones.
279
+
280
+ Returns:
281
+ Self for method chaining
282
+
283
+ Raises:
284
+ AttributeError: If 'group_by' is provided but the attribute doesn't exist
285
+ on some elements.
286
+ ValueError: If 'label_format' is provided but contains invalid keys for
287
+ element attributes.
288
+ """
289
+ # 1. Prepare the highlight data based on parameters
290
+ highlight_data_list = self._prepare_highlight_data(
291
+ distinct=distinct,
292
+ label=label,
293
+ color=color,
294
+ group_by=group_by,
295
+ label_format=label_format,
296
+ include_attrs=include_attrs
297
+ # 'replace' flag is handled during the add call below
298
+ )
299
+
300
+ # 2. Add prepared highlights to the persistent service
301
+ if not highlight_data_list:
302
+ return self # Nothing to add
303
+
304
+ # Get page and highlighter from the first element (assume uniform page)
305
+ first_element = self._elements[0]
306
+ if not hasattr(first_element, 'page') or not hasattr(first_element.page, '_highlighter'):
307
+ logger.warning("Cannot highlight collection: Elements lack page or highlighter access.")
308
+ return self
309
+
310
+ page = first_element.page
311
+ highlighter = page._highlighter
312
+
313
+ # Use a set to track pages affected if replacing
314
+ pages_to_clear = set()
315
+ # Check the 'replace' flag. If True, we replace.
316
+ if replace:
317
+ # Identify all unique page indices in this operation
318
+ for data in highlight_data_list:
319
+ pages_to_clear.add(data['page_index'])
320
+ # Clear those pages *before* adding new highlights
321
+ logger.debug(f"Highlighting with replace=True. Clearing highlights for pages: {pages_to_clear}")
322
+ for page_idx in pages_to_clear:
323
+ highlighter.clear_page(page_idx)
324
+
325
+ for data in highlight_data_list:
326
+ # Call the appropriate service add method
327
+ add_args = {
328
+ "page_index": data['page_index'],
329
+ "color": data['color'], # Color determined by _prepare
330
+ "label": data['label'], # Label determined by _prepare
331
+ "use_color_cycling": data.get('use_color_cycling', False), # Set by _prepare if distinct
332
+ "element": data['element'],
333
+ "include_attrs": data['include_attrs'],
334
+ # Internal call to service always appends, as clearing was handled above
335
+ "existing": 'append'
336
+ }
337
+ if data.get('polygon'):
338
+ add_args["polygon"] = data['polygon']
339
+ highlighter.add_polygon(**add_args)
340
+ elif data.get('bbox'):
341
+ add_args["bbox"] = data['bbox']
342
+ highlighter.add(**add_args)
343
+ else:
344
+ logger.warning(f"Skipping highlight data, no bbox or polygon found: {data}")
345
+
346
+ return self
347
+
348
def _prepare_highlight_data(self,
                            distinct: bool = False,
                            label: Optional[str] = None,
                            color: Optional[Union[Tuple, str]] = None,
                            group_by: Optional[str] = None,
                            label_format: Optional[str] = None,
                            include_attrs: Optional[List[str]] = None) -> List[Dict]:
    """
    Determines the parameters for highlighting each element based on the strategy.

    Exactly one strategy is applied, chosen in this order of precedence:

    1. ``distinct`` - one color per element (color cycling), no label.
       ``label``, ``color`` and ``group_by`` are not used.
    2. ``label`` - every element shares the explicit label and one color
       (``color`` is passed to the service as the requested color).
    3. ``group_by`` - elements are grouped by attribute value; each group
       gets its own label and color. ``color`` is not used here.
    4. default - a label is derived from the element type name (or
       "Mixed Elements" for heterogeneous collections, with a warning).

    Does not add anything to the HighlightingService; the service is only
    consulted (via ``_determine_highlight_color``) to pick colors.

    Args:
        distinct: Highlight each element individually with cycling colors.
        label: Explicit label shared by all elements.
        color: Explicit color input for the 'label' and default strategies.
        group_by: Attribute name to group elements by.
        label_format: Format string for group labels (group_by strategy).
        include_attrs: Forwarded to ``_get_element_highlight_params``.

    Returns:
        List of dictionaries, one per highlight. Each dictionary comes from
        ``_get_element_highlight_params`` (defined outside this excerpt;
        presumably supplies page_index, bbox/polygon, element and
        include_attrs - confirm there) and is extended here with 'color',
        'label' and, for the distinct strategy, 'use_color_cycling'.
        Returns an empty list when the collection is empty or when the
        highlighter cannot be reached from the first element.
    """
    prepared_data = []
    if not self._elements: return prepared_data

    # Need access to the HighlightingService to determine colors correctly.
    # It is looked up through the first element's page only.
    highlighter = None
    first_element = self._elements[0]
    if hasattr(first_element, 'page') and hasattr(first_element.page, '_highlighter'):
        highlighter = first_element.page._highlighter
    else:
        logger.warning("Cannot determine highlight colors: HighlightingService not accessible from elements.")
        return []

    if distinct:
        logger.debug("_prepare: Distinct highlighting strategy.")
        for element in self._elements:
            # Ask the service for a color once per element, before the
            # element's parameters are looked up (so a color is requested
            # even for elements that end up being skipped).
            final_color = highlighter._determine_highlight_color(label=None, color_input=None, use_color_cycling=True)
            element_data = self._get_element_highlight_params(element, include_attrs)
            # Elements for which no parameters were produced are skipped.
            if element_data:
                element_data.update({
                    'color': final_color,
                    'label': None,
                    'use_color_cycling': True
                })
                prepared_data.append(element_data)

    elif label is not None:
        logger.debug(f"_prepare: Explicit label '{label}' strategy.")
        # One color for the whole collection, determined once up front.
        final_color = highlighter._determine_highlight_color(label=label, color_input=color, use_color_cycling=False)
        for element in self._elements:
            element_data = self._get_element_highlight_params(element, include_attrs)
            if element_data:
                element_data.update({
                    'color': final_color,
                    'label': label
                })
                prepared_data.append(element_data)

    elif group_by is not None:
        logger.debug("_prepare: Grouping by attribute strategy.")
        grouped_elements = self._group_elements_by_attr(group_by)
        for group_key, group_elements in grouped_elements.items():
            if not group_elements: continue
            # The first element of the group serves as the sample for label formatting.
            group_label = self._format_group_label(group_key, label_format, group_elements[0], group_by)
            # color_input is None here: the caller's 'color' is deliberately
            # not forwarded, leaving the color choice per group label to the service.
            final_color = highlighter._determine_highlight_color(label=group_label, color_input=None, use_color_cycling=False)
            logger.debug(f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}")
            for element in group_elements:
                element_data = self._get_element_highlight_params(element, include_attrs)
                if element_data:
                    element_data.update({
                        'color': final_color,
                        'label': group_label
                    })
                    prepared_data.append(element_data)
    else:
        logger.debug("_prepare: Default grouping strategy.")
        element_types = set(type(el).__name__ for el in self._elements)

        if len(element_types) == 1:
            type_name = element_types.pop()
            # Strip the "Element"/"Region" suffixes from the class name,
            # except that a class named exactly "Region" keeps its name.
            base_name = type_name.replace("Element", "").replace("Region", "") if type_name != "Region" else "Region"
            auto_label = f"{base_name} Elements" if base_name else "Elements"
            # Determine color *before* logging or using it
            final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
            logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
            for element in self._elements:
                element_data = self._get_element_highlight_params(element, include_attrs)
                if element_data:
                    element_data.update({'color': final_color, 'label': auto_label})
                    prepared_data.append(element_data)
        else:
            # Mixed types: Generate generic label and warn
            type_names_str = ", ".join(sorted(list(element_types)))
            auto_label = "Mixed Elements"
            logger.warning(
                f"Highlighting collection with mixed element types ({type_names_str}) "
                f"using generic label '{auto_label}'. Consider using 'label', 'group_by', "
                f"or 'distinct=True' for more specific highlighting."
            )
            # All elements share one color for the generic label.
            final_color = highlighter._determine_highlight_color(label=auto_label, color_input=color, use_color_cycling=False)
            logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
            for element in self._elements:
                element_data = self._get_element_highlight_params(element, include_attrs)
                if element_data:
                    element_data.update({'color': final_color, 'label': auto_label})
                    prepared_data.append(element_data)

    return prepared_data
454
+
455
+ def _call_element_highlighter(self, element: T,
456
+ color: Optional[Union[Tuple, str]],
457
+ label: Optional[str],
458
+ use_color_cycling: bool,
459
+ include_attrs: Optional[List[str]],
460
+ existing: str):
461
+ """Low-level helper to call the appropriate HighlightingService method for an element."""
462
+ if not hasattr(element, 'page') or not hasattr(element.page, '_highlighter'):
463
+ logger.warning(f"Cannot highlight element, missing 'page' attribute or page lacks highlighter access: {element}")
464
+ return
465
+
466
+ page = element.page
467
+ args_for_highlighter = {
468
+ "page_index": page.index,
469
+ "color": color,
470
+ "label": label,
471
+ "use_color_cycling": use_color_cycling,
472
+ "include_attrs": include_attrs,
473
+ "existing": existing,
474
+ "element": element
475
+ }
476
+
477
+ is_polygon = getattr(element, 'has_polygon', False)
478
+ geom_data = None
479
+ add_method = None
480
+
481
+ if is_polygon:
482
+ geom_data = getattr(element, 'polygon', None)
483
+ if geom_data:
484
+ args_for_highlighter['polygon'] = geom_data
485
+ add_method = page._highlighter.add_polygon
486
+ else:
487
+ geom_data = getattr(element, 'bbox', None)
488
+ if geom_data:
489
+ args_for_highlighter['bbox'] = geom_data
490
+ add_method = page._highlighter.add
491
+
492
+ if add_method and geom_data:
493
+ try:
494
+ add_method(**args_for_highlighter)
495
+ except Exception as e:
496
+ logger.error(f"Error calling highlighter method for element {element} on page {page.index}: {e}", exc_info=True)
497
+ elif not geom_data:
498
+ logger.warning(f"Cannot highlight element, no bbox or polygon found: {element}")
499
+
500
+ def _highlight_as_single_group(self, label: str,
501
+ color: Optional[Union[Tuple, str]],
502
+ include_attrs: Optional[List[str]],
503
+ existing: str):
504
+ """Highlights all elements with the same explicit label and color."""
505
+ for element in self._elements:
506
+ self._call_element_highlighter(
507
+ element=element,
508
+ color=color, # Use explicit color if provided
509
+ label=label, # Use the explicit group label
510
+ use_color_cycling=False, # Use consistent color for the label
511
+ include_attrs=include_attrs,
512
+ existing=existing
513
+ )
514
+
515
+ def _highlight_grouped_by_attribute(self, group_by: str,
516
+ label_format: Optional[str],
517
+ include_attrs: Optional[List[str]],
518
+ existing: str):
519
+ """Groups elements by attribute and highlights each group distinctly."""
520
+ grouped_elements: Dict[Any, List[T]] = {}
521
+ # Group elements by the specified attribute value
522
+ for element in self._elements:
523
+ try:
524
+ group_key = getattr(element, group_by, None)
525
+ if group_key is None: # Handle elements missing the attribute
526
+ group_key = f"Missing '{group_by}'"
527
+ # Ensure group_key is hashable (convert list/dict if necessary)
528
+ if isinstance(group_key, (list, dict)):
529
+ group_key = str(group_key)
530
+
531
+ if group_key not in grouped_elements:
532
+ grouped_elements[group_key] = []
533
+ grouped_elements[group_key].append(element)
534
+ except AttributeError:
535
+ logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
536
+ group_key = f"Error accessing '{group_by}'"
537
+ if group_key not in grouped_elements:
538
+ grouped_elements[group_key] = []
539
+ grouped_elements[group_key].append(element)
540
+ except TypeError: # Handle unhashable types
541
+ logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
542
+ group_key = str(group_key)
543
+ if group_key not in grouped_elements:
544
+ grouped_elements[group_key] = []
545
+ grouped_elements[group_key].append(element)
546
+
547
+
548
+ # Highlight each group
549
+ for group_key, group_elements in grouped_elements.items():
550
+ if not group_elements: continue
551
+
552
+ # Determine the label for this group
553
+ first_element = group_elements[0] # Use first element for formatting
554
+ group_label = None
555
+ if label_format:
556
+ try:
557
+ # Create a dict of element attributes for formatting
558
+ element_attrs = first_element.__dict__.copy() # Start with element's dict
559
+ # Ensure the group_by key itself is present correctly
560
+ element_attrs[group_by] = group_key
561
+ group_label = label_format.format(**element_attrs)
562
+ except KeyError as e:
563
+ logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
564
+ group_label = str(group_key)
565
+ except Exception as format_e:
566
+ logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
567
+ group_label = str(group_key)
568
+ else:
569
+ group_label = str(group_key) # Use the attribute value as label
570
+
571
+ logger.debug(f" Highlighting group '{group_label}' ({len(group_elements)} elements)")
572
+
573
+ # Highlight all elements in this group with the derived label
574
+ for element in group_elements:
575
+ self._call_element_highlighter(
576
+ element=element,
577
+ color=None, # Let ColorManager choose based on label
578
+ label=group_label, # Use the derived group label
579
+ use_color_cycling=False, # Use consistent color for the label
580
+ include_attrs=include_attrs,
581
+ existing=existing
582
+ )
583
+
584
+ def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
585
+ """DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
586
+ # This method is no longer called directly by the main highlight path.
587
+ # The distinct logic is handled within _prepare_highlight_data.
588
+ for element in self._elements:
589
+ self._call_element_highlighter(
590
+ element=element,
591
+ color=None, # Let ColorManager cycle
592
+ label=None, # No label for distinct elements
593
+ use_color_cycling=True, # Force cycling
594
+ include_attrs=include_attrs,
595
+ existing=existing
596
+ )
597
+
598
+ def show(self,
599
+ # --- Visualization Parameters ---
600
+ group_by: Optional[str] = None,
601
+ label: Optional[str] = None,
602
+ color: Optional[Union[Tuple, str]] = None,
603
+ label_format: Optional[str] = None,
604
+ distinct: bool = False,
605
+ include_attrs: Optional[List[str]] = None,
606
+ # --- Rendering Parameters ---
607
+ scale: float = 2.0,
608
+ labels: bool = True, # Use 'labels' consistent with service
609
+ legend_position: str = 'right',
610
+ render_ocr: bool = False) -> Optional['Image.Image']:
611
+ """
612
+ Generates a temporary preview image highlighting elements in this collection
613
+ on their page, ignoring any persistent highlights.
614
+
615
+ Currently only supports collections where all elements are on the same page.
616
+
617
+ Allows grouping and coloring elements based on attributes, similar to the
618
+ persistent `highlight()` method, but only for this temporary view.
619
+
620
+ Args:
621
+ group_by: Attribute name to group elements by for distinct colors/labels.
622
+ label: Explicit label for all elements (overrides group_by).
623
+ color: Explicit color for all elements (if label used) or base color.
624
+ label_format: F-string to format group labels if group_by is used.
625
+ distinct: Highlight each element distinctly (overrides group_by/label).
626
+ include_attrs: Attributes to display on individual highlights.
627
+ scale: Scale factor for rendering image.
628
+ labels: Whether to include a legend for the temporary highlights.
629
+ legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
630
+ render_ocr: Whether to render OCR text.
631
+
632
+ Returns:
633
+ PIL Image object of the temporary preview, or None if rendering fails or
634
+ elements span multiple pages.
635
+
636
+ Raises:
637
+ ValueError: If the collection is empty or elements are on different pages.
638
+ """
639
+ if not self._elements:
640
+ raise ValueError("Cannot show an empty collection.")
641
+
642
+ # Check if elements are on multiple pages
643
+ if self._are_on_multiple_pages():
644
+ raise ValueError("show() currently only supports collections where all elements are on the same page.")
645
+
646
+ # Get the page and highlighting service from the first element
647
+ first_element = self._elements[0]
648
+ if not hasattr(first_element, 'page') or not first_element.page:
649
+ logger.warning("Cannot show collection: First element has no associated page.")
650
+ return None
651
+ page = first_element.page
652
+ if not hasattr(page, 'pdf') or not page.pdf:
653
+ logger.warning("Cannot show collection: Page has no associated PDF object.")
654
+ return None
655
+
656
+ service = page._highlighter
657
+ if not service:
658
+ logger.warning("Cannot show collection: PDF object has no highlighting service.")
659
+ return None
660
+
661
+ # 1. Prepare temporary highlight data based on grouping parameters
662
+ # This returns a list of dicts, suitable for render_preview
663
+ highlight_data_list = self._prepare_highlight_data(
664
+ distinct=distinct,
665
+ label=label,
666
+ color=color,
667
+ group_by=group_by,
668
+ label_format=label_format,
669
+ include_attrs=include_attrs
670
+ )
671
+
672
+ if not highlight_data_list:
673
+ logger.warning("No highlight data generated for show(). Rendering clean page.")
674
+ # Render the page without any temporary highlights
675
+ highlight_data_list = []
676
+
677
+ # 2. Call render_preview on the HighlightingService
678
+ try:
679
+ return service.render_preview(
680
+ page_index=page.index,
681
+ temporary_highlights=highlight_data_list,
682
+ scale=scale,
683
+ labels=labels, # Use 'labels'
684
+ legend_position=legend_position,
685
+ render_ocr=render_ocr
686
+ )
687
+ except Exception as e:
688
+ logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
689
+ return None
690
+
691
+ def save(self,
692
+ filename: str,
693
+ scale: float = 2.0,
694
+ width: Optional[int] = None,
695
+ labels: bool = True,
696
+ legend_position: str = 'right',
697
+ render_ocr: bool = False) -> 'ElementCollection':
698
+ """
699
+ Save the page with this collection's elements highlighted to an image file.
700
+
701
+ Args:
702
+ filename: Path to save the image to
703
+ scale: Scale factor for rendering
704
+ width: Optional width for the output image in pixels
705
+ labels: Whether to include a legend for labels
706
+ legend_position: Position of the legend
707
+ render_ocr: Whether to render OCR text with white background boxes
708
+
709
+ Returns:
710
+ Self for method chaining
711
+ """
712
+ # Use to_image to generate and save the image
713
+ self.to_image(
714
+ path=filename,
715
+ scale=scale,
716
+ width=width,
717
+ labels=labels,
718
+ legend_position=legend_position,
719
+ render_ocr=render_ocr
720
+ )
721
+ return self
722
+
723
+ def to_image(self,
724
+ path: Optional[str] = None,
725
+ scale: float = 2.0,
726
+ width: Optional[int] = None,
727
+ labels: bool = True,
728
+ legend_position: str = 'right',
729
+ render_ocr: bool = False) -> Optional['Image.Image']:
730
+ """
731
+ Generate an image of the page with this collection's elements highlighted,
732
+ optionally saving it to a file.
733
+
734
+ Args:
735
+ path: Optional path to save the image to
736
+ scale: Scale factor for rendering
737
+ width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
738
+ labels: Whether to include a legend for labels
739
+ legend_position: Position of the legend
740
+ render_ocr: Whether to render OCR text with white background boxes
741
+
742
+ Returns:
743
+ PIL Image of the page with elements highlighted, or None if no valid page
744
+ """
745
+ # Get the page from the first element (if available)
746
+ if self._elements and hasattr(self._elements[0], 'page'):
747
+ page = self._elements[0].page
748
+ # Generate the image using to_image
749
+ return page.to_image(
750
+ path=path,
751
+ scale=scale,
752
+ width=width,
753
+ labels=labels,
754
+ legend_position=legend_position,
755
+ render_ocr=render_ocr
756
+ )
757
+ return None
758
+
759
+ def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
760
+ """Groups elements by the specified attribute."""
761
+ grouped_elements: Dict[Any, List[T]] = {}
762
+ for element in self._elements:
763
+ try:
764
+ group_key = getattr(element, group_by, None)
765
+ if group_key is None: # Handle elements missing the attribute
766
+ group_key = f"Missing '{group_by}'"
767
+ # Ensure group_key is hashable (convert list/dict if necessary)
768
+ if isinstance(group_key, (list, dict)):
769
+ group_key = str(group_key)
770
+
771
+ if group_key not in grouped_elements:
772
+ grouped_elements[group_key] = []
773
+ grouped_elements[group_key].append(element)
774
+ except AttributeError:
775
+ logger.warning(f"Attribute '{group_by}' not found on element {element}. Skipping grouping.")
776
+ group_key = f"Error accessing '{group_by}'"
777
+ if group_key not in grouped_elements:
778
+ grouped_elements[group_key] = []
779
+ grouped_elements[group_key].append(element)
780
+ except TypeError: # Handle unhashable types
781
+ logger.warning(f"Attribute value for '{group_by}' on {element} is unhashable ({type(group_key)}). Using string representation.")
782
+ group_key = str(group_key)
783
+ if group_key not in grouped_elements:
784
+ grouped_elements[group_key] = []
785
+ grouped_elements[group_key].append(element)
786
+
787
+ return grouped_elements
788
+
789
+ def _format_group_label(self, group_key: Any, label_format: Optional[str], sample_element: T, group_by_attr: str) -> str:
790
+ """Formats the label for a group based on the key and format string."""
791
+ if label_format:
792
+ try:
793
+ element_attrs = sample_element.__dict__.copy()
794
+ element_attrs[group_by_attr] = group_key # Ensure key is present
795
+ return label_format.format(**element_attrs)
796
+ except KeyError as e:
797
+ logger.warning(f"Invalid key '{e}' in label_format '{label_format}'. Using group key as label.")
798
+ return str(group_key)
799
+ except Exception as format_e:
800
+ logger.warning(f"Error formatting label '{label_format}': {format_e}. Using group key as label.")
801
+ return str(group_key)
802
+ else:
803
+ return str(group_key)
804
+
805
+ def _get_element_highlight_params(self, element: T, include_attrs: Optional[List[str]]) -> Optional[Dict]:
806
+ """Extracts common parameters needed for highlighting a single element."""
807
+ if not hasattr(element, 'page'): return None
808
+ page = element.page
809
+
810
+ base_data = {
811
+ 'page_index': page.index,
812
+ 'element': element,
813
+ 'include_attrs': include_attrs,
814
+ 'attributes_to_draw': {},
815
+ 'bbox': None,
816
+ 'polygon': None
817
+ }
818
+
819
+ # Extract geometry
820
+ is_polygon = getattr(element, 'has_polygon', False)
821
+ geom_data = None
822
+ if is_polygon:
823
+ geom_data = getattr(element, 'polygon', None)
824
+ if geom_data: base_data['polygon'] = geom_data
825
+ else:
826
+ geom_data = getattr(element, 'bbox', None)
827
+ if geom_data: base_data['bbox'] = geom_data
828
+
829
+ if not geom_data:
830
+ logger.warning(f"Cannot prepare highlight, no bbox or polygon found for element: {element}")
831
+ return None
832
+
833
+ # Extract attributes if requested
834
+ if include_attrs:
835
+ for attr_name in include_attrs:
836
+ try:
837
+ attr_value = getattr(element, attr_name, None)
838
+ if attr_value is not None:
839
+ base_data['attributes_to_draw'][attr_name] = attr_value
840
+ except AttributeError:
841
+ logger.warning(f"Attribute '{attr_name}' not found on element {element} for include_attrs")
842
+
843
+ return base_data
844
+
845
def viewer(self, title: Optional[str] = None) -> Optional['widgets.DOMWidget']:
    """
    Open the page-level interactive viewer restricted to this collection.

    The page is taken from the first element and its ``viewer`` method is
    called with this collection's elements as ``elements_to_render``.

    Args:
        title: Optional title for the viewer. When omitted (or falsy), a
            default mentioning the element count is used.

    Returns:
        Whatever ``page.viewer`` returns, or None when the collection is
        empty, the page has no callable ``viewer``, or any error occurs
        (errors are logged, not raised).
    """
    if not self.elements:
        logger.warning("Cannot generate interactive viewer for empty collection.")
        return None

    # All elements are assumed to live on the first element's page.
    try:
        owner_page = self.elements[0].page

        if not hasattr(owner_page, 'viewer') or not callable(owner_page.viewer):
            logger.error("Page object is missing the 'viewer' method.")
            return None

        if title:
            viewer_title = title
        else:
            viewer_title = f"Interactive Viewer for Collection ({len(self.elements)} elements)"

        return owner_page.viewer(
            elements_to_render=self.elements,
            title=viewer_title
        )
    except AttributeError:
        logger.error("Cannot generate interactive viewer: Elements in collection lack 'page' attribute.")
    except IndexError:
        # Defensive: the emptiness check above should make this unreachable.
        logger.error("Cannot generate interactive viewer: Collection unexpectedly became empty.")
    except Exception as e:
        logger.error(f"Error creating interactive viewer from collection: {e}", exc_info=True)
    return None
884
+
885
class PageCollection(Generic[P]):
    """
    An ordered group of PDF pages offering cross-page operations.

    The collection wraps a list of page objects and forwards work to them:
    text extraction, element search, OCR and section extraction are each
    implemented by calling the corresponding method on the pages (or on
    their parent PDF) and combining the results.
    """

    def __init__(self, pages: List[P]):
        """
        Initialize a page collection.

        Args:
            pages: List of Page objects. The list is stored as given (it is
                not copied), so later changes to it are visible here.
        """
        self.pages = pages
901
+
902
+ def __len__(self) -> int:
903
+ """Return the number of pages in the collection."""
904
+ return len(self.pages)
905
+
906
+ def __getitem__(self, idx) -> Union[P, 'PageCollection[P]']:
907
+ """Support indexing and slicing."""
908
+ if isinstance(idx, slice):
909
+ return PageCollection(self.pages[idx])
910
+ return self.pages[idx]
911
+
912
+ def __iter__(self) -> Iterator[P]:
913
+ """Support iteration."""
914
+ return iter(self.pages)
915
+
916
+ def __repr__(self) -> str:
917
+ """Return a string representation showing the page count."""
918
+ return f"<PageCollection(count={len(self)})>"
919
+
920
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
    """
    Extract text from every page and join it into one string.

    Args:
        keep_blank_chars: Forwarded to each page's ``extract_text`` (default: True)
        apply_exclusions: Forwarded to each page's ``extract_text`` (default: True)
        **kwargs: Additional extraction parameters, forwarded unchanged

    Returns:
        The per-page texts in page order, separated by a single newline.
    """
    per_page_text = [
        page.extract_text(
            keep_blank_chars=keep_blank_chars,
            apply_exclusions=apply_exclusions,
            **kwargs
        )
        for page in self.pages
    ]
    return "\n".join(per_page_text)
942
+
943
+ # --- NEW METHOD ---
944
def apply_ocr(
    self,
    engine: Optional[str] = None,
    options: Optional[OCROptions] = None,
    languages: Optional[List[str]] = None,
    min_confidence: Optional[float] = None,
    device: Optional[str] = None,
    # Add other simple mode args if needed
) -> 'PageCollection[P]':
    """
    Run OCR on every page of this collection in one batch.

    The request is handed to the parent PDF object's `apply_ocr_to_pages`
    method together with the `index` of each page in this collection. The
    parent is taken from the first page's `_parent` attribute; all pages are
    assumed to share it.

    Args:
        engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
        options: A specific Options object (e.g., EasyOCROptions) for
                 advanced configuration.
        languages: List of language codes for simple mode.
        min_confidence: Minimum confidence threshold for simple mode.
        device: Device string ('cpu', 'cuda', etc.) for simple mode.

    All arguments are forwarded unchanged; how they interact (for example
    whether `options` overrides the simple arguments) is decided by the
    parent PDF's `apply_ocr_to_pages`.

    Returns:
        Self for method chaining. An empty collection is returned untouched
        after a warning.

    Raises:
        RuntimeError: If the first page lacks a parent PDF reference, or the
                      parent has no callable `apply_ocr_to_pages`.
                      (Exceptions from PDF.apply_ocr_to_pages propagate.)
    """
    if not self.pages:
        logger.warning("Cannot apply OCR to an empty PageCollection.")
        return self

    # The first page stands in for the whole collection when locating the PDF.
    parent_pdf = getattr(self.pages[0], '_parent', None)
    if not parent_pdf:
        raise RuntimeError("Pages in this collection do not have a parent PDF reference.")

    batch_ocr = getattr(parent_pdf, 'apply_ocr_to_pages', None)
    if not callable(batch_ocr):
        raise RuntimeError("Parent PDF object does not have the required 'apply_ocr_to_pages' method.")

    # Indices are read from each page's `index` attribute.
    page_indices = [member.index for member in self.pages]

    logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")

    # The parent's return value is ignored; this method only returns self.
    batch_ocr(
        pages=page_indices,
        engine=engine,
        options=options,
        languages=languages,
        min_confidence=min_confidence,
        device=device
    )

    return self
1010
+
1011
def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional[T]:
    """
    Return the first element matching the selector, searching page by page.

    Pages are queried in order and the search stops at the first page whose
    ``find`` returns a truthy result.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Forwarded to each page's ``find`` (default: True)
        **kwargs: Additional filter parameters, forwarded unchanged

    Returns:
        First matching element or None
    """
    # Lazy generator: later pages are only queried if earlier ones miss.
    per_page_hits = (
        page.find(selector, apply_exclusions=apply_exclusions, **kwargs)
        for page in self.pages
    )
    return next((hit for hit in per_page_hits if hit), None)
1028
+
1029
def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> ElementCollection:
    """
    Gather every element matching the selector from all pages.

    Each page's ``find_all`` is called in order; the ``elements`` of every
    truthy result are concatenated.

    Args:
        selector: CSS-like selector string
        apply_exclusions: Forwarded to each page's ``find_all`` (default: True)
        **kwargs: Additional filter parameters, forwarded unchanged

    Returns:
        ElementCollection wrapping the matches from all pages, in page order
    """
    matches = []
    for page in self.pages:
        page_matches = page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
        if not page_matches:
            continue
        matches += page_matches.elements

    return ElementCollection(matches)
1048
+
1049
+ # def debug_ocr(self, output_path):
1050
+ # """
1051
+ # Generate an interactive HTML debug report for OCR results.
1052
+
1053
+ # This creates a single-file HTML report with:
1054
+ # - Side-by-side view of image regions and OCR text
1055
+ # - Confidence scores with color coding
1056
+ # - Editable correction fields
1057
+ # - Filtering and sorting options
1058
+ # - Export functionality for corrected text
1059
+
1060
+ # Args:
1061
+ # output_path: Path to save the HTML report
1062
+
1063
+ # Returns:
1064
+ # Path to the generated HTML file
1065
+ # """
1066
+ # from natural_pdf.utils.ocr import debug_ocr_to_html
1067
+ # return debug_ocr_to_html(self.pages, output_path)
1068
+
1069
def get_sections(self,
                 start_elements=None,
                 end_elements=None,
                 new_section_on_page_break=False,
                 boundary_inclusion='both') -> List['Region']:
    """
    Extract sections from a page collection based on start/end elements.

    Boundaries are ordered by page index and then by position among the
    collection's elements (sorted by page index, top, x0). Each start is
    paired with the next end. When no end elements are in play, each start
    instead closes the previous section. A start left open at the end is
    closed at the last element of the last page (when end elements exist) or
    at the bottom of its own page (otherwise).

    The caller's ``start_elements`` / ``end_elements`` sequences are never
    modified; virtual page-boundary markers are added to private copies.

    Args:
        start_elements: Elements or selector string that mark the start of sections
        end_elements: Elements or selector string that mark the end of sections
        new_section_on_page_break: Whether to start a new section at page boundaries (default: False).
            Note that this adds virtual end markers, so the "split on starts
            only" behaviour no longer applies once it takes effect.
        boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both').
            Passed to ``page.get_section_between`` for single-page sections.

    Returns:
        List of Region objects representing the extracted sections. Sections
        spanning pages are Regions anchored on the start page that carry
        ``_spans_pages``, ``_page_range``, ``start_element``, ``end_element``
        and ``_multi_page_elements``.
    """
    # Selector strings are resolved against every page in the collection.
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements).elements

    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements).elements

    # If no start elements, return empty list
    if not start_elements:
        return []

    # Work on copies: page-break markers are appended below, and that must
    # not leak into the caller's lists or a collection's internal list.
    start_elements = list(start_elements)
    if end_elements is not None:
        end_elements = list(end_elements)

    if new_section_on_page_break and len(self.pages) > 1:
        # Imported here rather than at module level, as in the original code.
        from natural_pdf.elements.region import Region

        if end_elements is None:
            end_elements = []

        # For each page boundary, add a virtual "end" strip at the bottom of
        # the page and a virtual "start" strip at the top of the next page.
        for page, next_page in zip(self.pages, self.pages[1:]):
            bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
            bottom_region.is_page_boundary = True  # Mark it as a special boundary
            end_elements.append(bottom_region)

            top_region = Region(next_page, (0, 0, next_page.width, 1))
            top_region.is_page_boundary = True  # Mark it as a special boundary
            start_elements.append(top_region)

    # Get all elements from all pages and sort them in document order
    all_elements = []
    for page in self.pages:
        all_elements.extend(page.get_elements())
    all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

    def _boundaries_for(elements, boundary_type):
        """Build boundary records for the elements that can be positioned."""
        records = []
        for element in elements:
            try:
                index = all_elements.index(element)
            except ValueError:
                # Not a real page element: only virtual page-boundary
                # markers are kept, flagged with the special index -1.
                if not getattr(element, 'is_page_boundary', False):
                    continue
                index = -1
            records.append({
                'index': index,
                'element': element,
                'type': boundary_type,
                'page_idx': element.page.index
            })
        return records

    def _multi_page_section(start_element, end_element, end_page):
        """Build a Region on the start page annotated as spanning to end_page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        combined_region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        combined_region._spans_pages = True
        combined_region._page_range = (start_page.index, end_page.index)
        combined_region.start_element = start_element
        combined_region.end_element = end_element

        # First page: everything from the start element downwards.
        combined_elements = [e for e in all_elements
                             if e.page == start_page and e.top >= start_element.top]

        # Middle pages (if any): everything.
        for page_idx in range(start_page.index + 1, end_page.index):
            combined_elements.extend(e for e in all_elements if e.page.index == page_idx)

        # Last page: everything down to the end element (or the whole page
        # when there is no end element).
        combined_elements.extend(
            e for e in all_elements
            if e.page == end_page and (end_element is None or e.bottom <= end_element.bottom)
        )

        combined_region._multi_page_elements = combined_elements
        return combined_region

    def _rest_of_page_section(start_element):
        """Build a Region from the start element's top to the bottom of its page."""
        from natural_pdf.elements.region import Region

        start_page = start_element.page
        region = Region(
            start_page,
            (0, start_element.top, start_page.width, start_page.height)
        )
        region.start_element = start_element
        return region

    # Starts are recorded before ends so that ties keep that order in the
    # stable sort below.
    section_boundaries = _boundaries_for(start_elements, 'start')
    if end_elements:
        section_boundaries.extend(_boundaries_for(end_elements, 'end'))

    # Sort boundaries by page index, then by document position. Virtual
    # starts sort to the top of their page, virtual ends to the bottom.
    section_boundaries.sort(key=lambda x: (x['page_idx'],
                                           x['index'] if x['index'] != -1 else
                                           (0 if x['type'] == 'start' else float('inf'))))

    # Generate sections
    sections = []
    current_start = None

    for boundary in section_boundaries:
        # A start boundary opens a section when none is open.
        if boundary['type'] == 'start' and current_start is None:
            current_start = boundary

        # An end boundary closes the open section.
        elif boundary['type'] == 'end' and current_start is not None:
            start_element = current_start['element']
            end_element = boundary['element']

            if start_element.page == end_element.page:
                sections.append(start_element.page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                sections.append(_multi_page_section(start_element, end_element, end_element.page))

            current_start = None

        # Another start while a section is open (splitting by starts only).
        elif boundary['type'] == 'start' and current_start is not None and not end_elements:
            start_element = current_start['element']
            next_start = boundary['element']

            if start_element.page == next_start.page:
                # End the section at the element just before the next start.
                page_elements = [e for e in all_elements if e.page == start_element.page]
                page_elements.sort(key=lambda e: (e.top, e.x0))

                end_idx = page_elements.index(next_start) - 1 if next_start in page_elements else -1
                end_element = page_elements[end_idx] if end_idx >= 0 else None

                sections.append(start_element.page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                # Next start is on another page: run to the end of this page.
                sections.append(_rest_of_page_section(start_element))

            current_start = boundary

    # Handle the last section if a start is still open
    if current_start is not None:
        start_element = current_start['element']
        start_page = start_element.page

        if end_elements:
            # With end_elements, an explicit end is needed: use the last
            # element on the last page of the collection.
            last_page = self.pages[-1]
            last_page_elements = [e for e in all_elements if e.page == last_page]
            last_page_elements.sort(key=lambda e: (e.top, e.x0))
            end_element = last_page_elements[-1] if last_page_elements else None

            if start_page == last_page:
                sections.append(start_page.get_section_between(
                    start_element,
                    end_element,
                    boundary_inclusion
                ))
            else:
                sections.append(_multi_page_section(start_element, end_element, last_page))
        else:
            # With start_elements only, run to the end of the current page
            sections.append(_rest_of_page_section(start_element))

    return sections