natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -11,6 +11,7 @@ from typing import (
11
11
  Iterable,
12
12
  Iterator,
13
13
  List,
14
+ Literal,
14
15
  Optional,
15
16
  Sequence,
16
17
  Tuple,
@@ -32,6 +33,9 @@ from natural_pdf.classification.manager import ClassificationManager
32
33
  from natural_pdf.classification.mixin import ClassificationMixin
33
34
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
34
35
  from natural_pdf.core.pdf import PDF
36
+
37
+ # Add Visualizable import
38
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
35
39
  from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
36
40
  from natural_pdf.elements.base import Element
37
41
  from natural_pdf.elements.region import Region
@@ -40,6 +44,7 @@ from natural_pdf.export.mixin import ExportMixin
40
44
  from natural_pdf.ocr import OCROptions
41
45
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
42
46
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
47
+ from natural_pdf.text_mixin import TextMixin
43
48
 
44
49
  # Potentially lazy imports for optional dependencies needed in save_pdf
45
50
  try:
@@ -66,6 +71,7 @@ if TYPE_CHECKING:
66
71
  from natural_pdf.core.pdf import PDF # ---> ADDED PDF type hint
67
72
  from natural_pdf.elements.region import Region
68
73
  from natural_pdf.elements.text import TextElement # Ensure TextElement is imported
74
+ from natural_pdf.flows.flow import Flow
69
75
 
70
76
  T = TypeVar("T")
71
77
  P = TypeVar("P", bound="Page")
@@ -79,6 +85,7 @@ class ElementCollection(
79
85
  DirectionalCollectionMixin,
80
86
  DescribeMixin,
81
87
  InspectMixin,
88
+ Visualizable,
82
89
  MutableSequence,
83
90
  ):
84
91
  """Collection of PDF elements with batch operations.
@@ -168,13 +175,234 @@ class ElementCollection(
168
175
  """
169
176
  self._elements = elements or []
170
177
 
178
+ def _get_render_specs(
179
+ self,
180
+ mode: Literal["show", "render"] = "show",
181
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
182
+ highlights: Optional[List[Dict[str, Any]]] = None,
183
+ crop: Union[bool, Literal["content"]] = False,
184
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
185
+ group_by: Optional[str] = None,
186
+ bins: Optional[Union[int, List[float]]] = None,
187
+ annotate: Optional[List[str]] = None,
188
+ **kwargs,
189
+ ) -> List[RenderSpec]:
190
+ """Get render specifications for this element collection.
191
+
192
+ Args:
193
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
194
+ color: Default color for highlights in show mode (or colormap name when using group_by)
195
+ highlights: Additional highlight groups to show
196
+ crop: Whether to crop to element bounds
197
+ crop_bbox: Explicit crop bounds
198
+ group_by: Attribute to group elements by for color mapping
199
+ bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
200
+ annotate: List of attribute names to display on highlights
201
+ **kwargs: Additional parameters
202
+
203
+ Returns:
204
+ List of RenderSpec objects, one per page with elements
205
+ """
206
+ if not self._elements:
207
+ return []
208
+
209
+ # Group elements by page
210
+ elements_by_page = {}
211
+ for elem in self._elements:
212
+ if hasattr(elem, "page"):
213
+ page = elem.page
214
+ if page not in elements_by_page:
215
+ elements_by_page[page] = []
216
+ elements_by_page[page].append(elem)
217
+
218
+ if not elements_by_page:
219
+ return []
220
+
221
+ # Create RenderSpec for each page
222
+ specs = []
223
+ for page, page_elements in elements_by_page.items():
224
+ spec = RenderSpec(page=page)
225
+
226
+ # Handle cropping
227
+ if crop_bbox:
228
+ spec.crop_bbox = crop_bbox
229
+ elif crop == "content" or crop is True:
230
+ # Calculate bounds of elements on this page
231
+ x_coords = []
232
+ y_coords = []
233
+ for elem in page_elements:
234
+ if hasattr(elem, "bbox") and elem.bbox:
235
+ x0, y0, x1, y1 = elem.bbox
236
+ x_coords.extend([x0, x1])
237
+ y_coords.extend([y0, y1])
238
+
239
+ if x_coords and y_coords:
240
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
241
+
242
+ # Add highlights in show mode
243
+ if mode == "show":
244
+ # Handle group_by parameter for quantitative/categorical grouping
245
+ if group_by is not None:
246
+ # Use the improved highlighting logic from _prepare_highlight_data
247
+ prepared_highlights = self._prepare_highlight_data(
248
+ group_by=group_by, color=color, bins=bins, annotate=annotate, **kwargs
249
+ )
250
+
251
+ # Check if we have quantitative metadata to preserve
252
+ quantitative_metadata = None
253
+ for highlight_data in prepared_highlights:
254
+ if (
255
+ "quantitative_metadata" in highlight_data
256
+ and highlight_data["quantitative_metadata"]
257
+ ):
258
+ quantitative_metadata = highlight_data["quantitative_metadata"]
259
+ break
260
+
261
+ # Add highlights from prepared data
262
+ for highlight_data in prepared_highlights:
263
+ # Only add elements from this page
264
+ elem = highlight_data.get("element")
265
+ if elem and hasattr(elem, "page") and elem.page == page:
266
+ # Create the highlight dict manually to preserve quantitative metadata
267
+ highlight_dict = {
268
+ "element": elem,
269
+ "color": highlight_data.get("color"),
270
+ "label": highlight_data.get("label"),
271
+ }
272
+
273
+ # Add quantitative metadata to the first highlight
274
+ if quantitative_metadata and not any(
275
+ h.get("quantitative_metadata") for h in spec.highlights
276
+ ):
277
+ highlight_dict["quantitative_metadata"] = quantitative_metadata
278
+
279
+ # Add annotate if provided in the prepared data
280
+ if "annotate" in highlight_data:
281
+ highlight_dict["annotate"] = highlight_data["annotate"]
282
+ if "attributes_to_draw" in highlight_data:
283
+ highlight_dict["attributes_to_draw"] = highlight_data[
284
+ "attributes_to_draw"
285
+ ]
286
+
287
+ # Extract geometry from element
288
+ if (
289
+ hasattr(elem, "polygon")
290
+ and hasattr(elem, "has_polygon")
291
+ and elem.has_polygon
292
+ ):
293
+ highlight_dict["polygon"] = elem.polygon
294
+ elif hasattr(elem, "bbox"):
295
+ highlight_dict["bbox"] = elem.bbox
296
+
297
+ spec.highlights.append(highlight_dict)
298
+ else:
299
+ # Default behavior when no group_by is specified
300
+ # Determine if all elements are of the same type
301
+ element_types = set(type(elem).__name__ for elem in page_elements)
302
+
303
+ if len(element_types) == 1:
304
+ # All elements are the same type - use a single label
305
+ type_name = element_types.pop()
306
+ # Generate a clean label from the type name
307
+ base_name = (
308
+ type_name.replace("Element", "").replace("Region", "")
309
+ if type_name != "Region"
310
+ else "Region"
311
+ )
312
+ # Handle special cases for common types
313
+ if base_name == "Text":
314
+ shared_label = "Text Elements"
315
+ elif base_name == "table_cell" or (
316
+ hasattr(page_elements[0], "region_type")
317
+ and page_elements[0].region_type == "table_cell"
318
+ ):
319
+ shared_label = "Table Cells"
320
+ elif base_name == "table":
321
+ shared_label = "Tables"
322
+ else:
323
+ shared_label = f"{base_name} Elements" if base_name else "Elements"
324
+
325
+ # Add all elements with the same label (no color cycling)
326
+ for elem in page_elements:
327
+ # Get element highlight params with annotate
328
+ element_data = self._get_element_highlight_params(elem, annotate)
329
+ if element_data:
330
+ # Use add_highlight with basic params
331
+ spec.add_highlight(
332
+ element=elem,
333
+ color=color, # Use provided color or None
334
+ label=shared_label,
335
+ )
336
+ # Update last highlight with attributes if present
337
+ if element_data.get("attributes_to_draw") and spec.highlights:
338
+ spec.highlights[-1]["attributes_to_draw"] = element_data[
339
+ "attributes_to_draw"
340
+ ]
341
+ else:
342
+ # Mixed types - use individual labels (existing behavior)
343
+ for elem in page_elements:
344
+ # Get element highlight params with annotate
345
+ element_data = self._get_element_highlight_params(elem, annotate)
346
+ if element_data:
347
+ spec.add_highlight(
348
+ element=elem,
349
+ color=color,
350
+ label=getattr(elem, "text", None) or str(elem),
351
+ )
352
+ # Update last highlight with attributes if present
353
+ if element_data.get("attributes_to_draw") and spec.highlights:
354
+ spec.highlights[-1]["attributes_to_draw"] = element_data[
355
+ "attributes_to_draw"
356
+ ]
357
+
358
+ # Add additional highlight groups if provided
359
+ if highlights:
360
+ for group in highlights:
361
+ group_elements = group.get("elements", [])
362
+ group_color = group.get("color", color)
363
+ group_label = group.get("label")
364
+
365
+ # Only add elements from this page
366
+ for elem in group_elements:
367
+ if hasattr(elem, "page") and elem.page == page:
368
+ spec.add_highlight(
369
+ element=elem, color=group_color, label=group_label
370
+ )
371
+
372
+ specs.append(spec)
373
+
374
+ return specs
375
+
376
+ def _get_highlighter(self):
377
+ """Get the highlighting service for rendering.
378
+
379
+ For ElementCollection, we get it from the first element's page.
380
+ """
381
+ if not self._elements:
382
+ raise RuntimeError("Cannot get highlighter from empty ElementCollection")
383
+
384
+ # Try to get highlighter from first element's page
385
+ for elem in self._elements:
386
+ if hasattr(elem, "page") and hasattr(elem.page, "_highlighter"):
387
+ return elem.page._highlighter
388
+
389
+ # If no elements have pages, we can't render
390
+ raise RuntimeError(
391
+ "Cannot find HighlightingService. ElementCollection elements don't have page access."
392
+ )
393
+
171
394
  def __len__(self) -> int:
172
395
  """Get the number of elements in the collection."""
173
396
  return len(self._elements)
174
397
 
175
- def __getitem__(self, index: int) -> "Element":
176
- """Get an element by index."""
177
- return self._elements[index]
398
+ def __getitem__(self, index: Union[int, slice]) -> Union["Element", "ElementCollection"]:
399
+ """Get an element by index or a collection by slice."""
400
+ if isinstance(index, slice):
401
+ # Return a new ElementCollection for slices
402
+ return ElementCollection(self._elements[index])
403
+ else:
404
+ # Return the element for integer indices
405
+ return self._elements[index]
178
406
 
179
407
  def __repr__(self) -> str:
180
408
  """Return a string representation showing the element count."""
@@ -420,6 +648,7 @@ class ElementCollection(
420
648
  # Apply content filtering if provided
421
649
  if content_filter is not None:
422
650
  from natural_pdf.utils.text_extraction import _apply_content_filter
651
+
423
652
  all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
424
653
 
425
654
  # Check if layout is requested
@@ -531,8 +760,9 @@ class ElementCollection(
531
760
  group_by: Optional[str] = None,
532
761
  label_format: Optional[str] = None,
533
762
  distinct: bool = False,
534
- include_attrs: Optional[List[str]] = None,
763
+ annotate: Optional[List[str]] = None,
535
764
  replace: bool = False,
765
+ bins: Optional[Union[int, List[float]]] = None,
536
766
  ) -> "ElementCollection":
537
767
  """
538
768
  Adds persistent highlights for all elements in the collection to the page
@@ -550,12 +780,15 @@ class ElementCollection(
550
780
  label: Optional explicit label for the entire collection. If provided,
551
781
  all elements are highlighted as a single group with this label,
552
782
  ignoring 'group_by' and the default type-based grouping.
553
- color: Optional explicit color for the highlight (tuple/string). Applied
554
- consistently if 'label' is provided or if grouping occurs.
783
+ color: Optional explicit color for the highlight (tuple/string), or
784
+ matplotlib colormap name for quantitative group_by (e.g., 'viridis', 'plasma',
785
+ 'inferno', 'coolwarm', 'RdBu'). Applied consistently if 'label' is provided
786
+ or if grouping occurs.
555
787
  group_by: Optional attribute name present on the elements. If provided
556
788
  (and 'label' is None), elements will be grouped based on the
557
789
  value of this attribute, and each group will be highlighted
558
- with a distinct label and color.
790
+ with a distinct label and color. Automatically detects quantitative
791
+ data and uses gradient colormaps when appropriate.
559
792
  label_format: Optional Python f-string to format the group label when
560
793
  'group_by' is used. Can reference element attributes
561
794
  (e.g., "Type: {region_type}, Conf: {confidence:.2f}").
@@ -563,11 +796,14 @@ class ElementCollection(
563
796
  distinct: If True, bypasses all grouping and highlights each element
564
797
  individually with cycling colors (the previous default behavior).
565
798
  (default: False)
566
- include_attrs: List of attribute names from the element to display directly
567
- on the highlight itself (distinct from group label).
799
+ annotate: List of attribute names from the element to display directly
800
+ on the highlight itself (distinct from group label).
568
801
  replace: If True, existing highlights on the affected page(s)
569
802
  are cleared before adding these highlights.
570
803
  If False (default), highlights are appended to existing ones.
804
+ bins: Optional binning specification for quantitative data when using group_by.
805
+ Can be an integer (number of equal-width bins) or a list of bin edges.
806
+ Only used when group_by contains quantitative data.
571
807
 
572
808
  Returns:
573
809
  Self for method chaining
@@ -589,7 +825,8 @@ class ElementCollection(
589
825
  color=color,
590
826
  group_by=group_by,
591
827
  label_format=label_format,
592
- include_attrs=include_attrs,
828
+ annotate=annotate,
829
+ bins=bins,
593
830
  # 'replace' flag is handled during the add call below
594
831
  )
595
832
 
@@ -630,7 +867,7 @@ class ElementCollection(
630
867
  "use_color_cycling", False
631
868
  ), # Set by _prepare if distinct
632
869
  "element": data["element"],
633
- "include_attrs": data["include_attrs"],
870
+ "annotate": data["annotate"],
634
871
  # Internal call to service always appends, as clearing was handled above
635
872
  "existing": "append",
636
873
  }
@@ -652,7 +889,8 @@ class ElementCollection(
652
889
  color: Optional[Union[Tuple, str]] = None,
653
890
  group_by: Optional[str] = None,
654
891
  label_format: Optional[str] = None,
655
- include_attrs: Optional[List[str]] = None,
892
+ annotate: Optional[List[str]] = None,
893
+ bins: Optional[Union[int, List[float]]] = None,
656
894
  ) -> List[Dict]:
657
895
  """
658
896
  Determines the parameters for highlighting each element based on the strategy.
@@ -661,7 +899,7 @@ class ElementCollection(
661
899
 
662
900
  Returns:
663
901
  List of dictionaries, each containing parameters for a single highlight
664
- (e.g., page_index, bbox/polygon, color, label, element, include_attrs, attributes_to_draw).
902
+ (e.g., page_index, bbox/polygon, color, label, element, annotate, attributes_to_draw).
665
903
  Color and label determination happens here.
666
904
  """
667
905
  prepared_data = []
@@ -669,11 +907,25 @@ class ElementCollection(
669
907
  return prepared_data
670
908
 
671
909
  # Need access to the HighlightingService to determine colors correctly.
910
+ # Use highlighting protocol to find a valid service from any element
672
911
  highlighter = None
673
- first_element = self._elements[0]
674
- if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
675
- highlighter = first_element.page._highlighter
676
- else:
912
+
913
+ for element in self._elements:
914
+ # Try direct page access first (for regular elements)
915
+ if hasattr(element, "page") and hasattr(element.page, "_highlighter"):
916
+ highlighter = element.page._highlighter
917
+ break
918
+ # Try highlighting protocol for FlowRegions and other complex elements
919
+ elif hasattr(element, "get_highlight_specs"):
920
+ specs = element.get_highlight_specs()
921
+ for spec in specs:
922
+ if "page" in spec and hasattr(spec["page"], "_highlighter"):
923
+ highlighter = spec["page"]._highlighter
924
+ break
925
+ if highlighter:
926
+ break
927
+
928
+ if not highlighter:
677
929
  logger.warning(
678
930
  "Cannot determine highlight colors: HighlightingService not accessible from elements."
679
931
  )
@@ -686,7 +938,7 @@ class ElementCollection(
686
938
  final_color = highlighter._determine_highlight_color(
687
939
  label=None, color_input=None, use_color_cycling=True
688
940
  )
689
- element_data = self._get_element_highlight_params(element, include_attrs)
941
+ element_data = self._get_element_highlight_params(element, annotate)
690
942
  if element_data:
691
943
  element_data.update(
692
944
  {"color": final_color, "label": None, "use_color_cycling": True}
@@ -699,7 +951,7 @@ class ElementCollection(
699
951
  label=label, color_input=color, use_color_cycling=False
700
952
  )
701
953
  for element in self._elements:
702
- element_data = self._get_element_highlight_params(element, include_attrs)
954
+ element_data = self._get_element_highlight_params(element, annotate)
703
955
  if element_data:
704
956
  element_data.update({"color": final_color, "label": label})
705
957
  prepared_data.append(element_data)
@@ -707,23 +959,84 @@ class ElementCollection(
707
959
  elif group_by is not None:
708
960
  logger.debug("_prepare: Grouping by attribute strategy.")
709
961
  grouped_elements = self._group_elements_by_attr(group_by)
962
+
963
+ # Collect all values for quantitative detection
964
+ all_values = []
710
965
  for group_key, group_elements in grouped_elements.items():
711
- if not group_elements:
712
- continue
713
- group_label = self._format_group_label(
714
- group_key, label_format, group_elements[0], group_by
715
- )
716
- final_color = highlighter._determine_highlight_color(
717
- label=group_label, color_input=None, use_color_cycling=False
718
- )
719
- logger.debug(
720
- f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
966
+ if group_elements:
967
+ all_values.append(group_key)
968
+
969
+ # Import the quantitative detection function
970
+ from natural_pdf.utils.visualization import (
971
+ create_quantitative_color_mapping,
972
+ detect_quantitative_data,
973
+ )
974
+
975
+ # Determine if we should use quantitative color mapping
976
+ use_quantitative = detect_quantitative_data(all_values)
977
+
978
+ if use_quantitative:
979
+ logger.debug(" _prepare: Using quantitative color mapping.")
980
+ # Use quantitative color mapping with specified colormap
981
+ colormap_name = color if isinstance(color, str) else "viridis"
982
+ value_to_color = create_quantitative_color_mapping(
983
+ all_values, colormap=colormap_name, bins=bins
721
984
  )
722
- for element in group_elements:
723
- element_data = self._get_element_highlight_params(element, include_attrs)
724
- if element_data:
725
- element_data.update({"color": final_color, "label": group_label})
726
- prepared_data.append(element_data)
985
+
986
+ # Store quantitative metadata for colorbar creation
987
+ quantitative_metadata = {
988
+ "values": all_values,
989
+ "colormap": colormap_name,
990
+ "bins": bins,
991
+ "attribute": group_by,
992
+ }
993
+
994
+ for group_key, group_elements in grouped_elements.items():
995
+ if not group_elements:
996
+ continue
997
+ group_label = self._format_group_label(
998
+ group_key, label_format, group_elements[0], group_by
999
+ )
1000
+
1001
+ # Get quantitative color for this value
1002
+ final_color = value_to_color.get(group_key)
1003
+ if final_color is None:
1004
+ # Fallback to traditional color assignment
1005
+ final_color = highlighter._determine_highlight_color(
1006
+ label=group_label, color_input=None, use_color_cycling=False
1007
+ )
1008
+
1009
+ logger.debug(
1010
+ f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
1011
+ )
1012
+ for element in group_elements:
1013
+ element_data = self._get_element_highlight_params(element, annotate)
1014
+ if element_data:
1015
+ element_data.update({"color": final_color, "label": group_label})
1016
+ # Add quantitative metadata to the first element in each group
1017
+ if not any("quantitative_metadata" in pd for pd in prepared_data):
1018
+ element_data["quantitative_metadata"] = quantitative_metadata
1019
+ prepared_data.append(element_data)
1020
+ else:
1021
+ logger.debug(" _prepare: Using categorical color mapping.")
1022
+ # Use traditional categorical color mapping
1023
+ for group_key, group_elements in grouped_elements.items():
1024
+ if not group_elements:
1025
+ continue
1026
+ group_label = self._format_group_label(
1027
+ group_key, label_format, group_elements[0], group_by
1028
+ )
1029
+ final_color = highlighter._determine_highlight_color(
1030
+ label=group_label, color_input=None, use_color_cycling=False
1031
+ )
1032
+ logger.debug(
1033
+ f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
1034
+ )
1035
+ for element in group_elements:
1036
+ element_data = self._get_element_highlight_params(element, annotate)
1037
+ if element_data:
1038
+ element_data.update({"color": final_color, "label": group_label})
1039
+ prepared_data.append(element_data)
727
1040
  else:
728
1041
  logger.debug("_prepare: Default grouping strategy.")
729
1042
  element_types = set(type(el).__name__ for el in self._elements)
@@ -742,7 +1055,7 @@ class ElementCollection(
742
1055
  )
743
1056
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
744
1057
  for element in self._elements:
745
- element_data = self._get_element_highlight_params(element, include_attrs)
1058
+ element_data = self._get_element_highlight_params(element, annotate)
746
1059
  if element_data:
747
1060
  element_data.update({"color": final_color, "label": auto_label})
748
1061
  prepared_data.append(element_data)
@@ -761,7 +1074,7 @@ class ElementCollection(
761
1074
  # Determine color *before* logging or using it (already done above for this branch)
762
1075
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
763
1076
  for element in self._elements:
764
- element_data = self._get_element_highlight_params(element, include_attrs)
1077
+ element_data = self._get_element_highlight_params(element, annotate)
765
1078
  if element_data:
766
1079
  element_data.update({"color": final_color, "label": auto_label})
767
1080
  prepared_data.append(element_data)
@@ -774,7 +1087,7 @@ class ElementCollection(
774
1087
  color: Optional[Union[Tuple, str]],
775
1088
  label: Optional[str],
776
1089
  use_color_cycling: bool,
777
- include_attrs: Optional[List[str]],
1090
+ annotate: Optional[List[str]],
778
1091
  existing: str,
779
1092
  ):
780
1093
  """Low-level helper to call the appropriate HighlightingService method for an element."""
@@ -790,7 +1103,7 @@ class ElementCollection(
790
1103
  "color": color,
791
1104
  "label": label,
792
1105
  "use_color_cycling": use_color_cycling,
793
- "include_attrs": include_attrs,
1106
+ "annotate": annotate,
794
1107
  "existing": existing,
795
1108
  "element": element,
796
1109
  }
@@ -825,7 +1138,7 @@ class ElementCollection(
825
1138
  self,
826
1139
  label: str,
827
1140
  color: Optional[Union[Tuple, str]],
828
- include_attrs: Optional[List[str]],
1141
+ annotate: Optional[List[str]],
829
1142
  existing: str,
830
1143
  ):
831
1144
  """Highlights all elements with the same explicit label and color."""
@@ -835,7 +1148,7 @@ class ElementCollection(
835
1148
  color=color, # Use explicit color if provided
836
1149
  label=label, # Use the explicit group label
837
1150
  use_color_cycling=False, # Use consistent color for the label
838
- include_attrs=include_attrs,
1151
+ annotate=annotate,
839
1152
  existing=existing,
840
1153
  )
841
1154
 
@@ -843,7 +1156,7 @@ class ElementCollection(
843
1156
  self,
844
1157
  group_by: str,
845
1158
  label_format: Optional[str],
846
- include_attrs: Optional[List[str]],
1159
+ annotate: Optional[List[str]],
847
1160
  existing: str,
848
1161
  ):
849
1162
  """Groups elements by attribute and highlights each group distinctly."""
@@ -915,11 +1228,11 @@ class ElementCollection(
915
1228
  color=None, # Let ColorManager choose based on label
916
1229
  label=group_label, # Use the derived group label
917
1230
  use_color_cycling=False, # Use consistent color for the label
918
- include_attrs=include_attrs,
1231
+ annotate=annotate,
919
1232
  existing=existing,
920
1233
  )
921
1234
 
922
- def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
1235
+ def _highlight_distinctly(self, annotate: Optional[List[str]], existing: str):
923
1236
  """DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
924
1237
  # This method is no longer called directly by the main highlight path.
925
1238
  # The distinct logic is handled within _prepare_highlight_data.
@@ -929,152 +1242,191 @@ class ElementCollection(
929
1242
  color=None, # Let ColorManager cycle
930
1243
  label=None, # No label for distinct elements
931
1244
  use_color_cycling=True, # Force cycling
932
- include_attrs=include_attrs,
1245
+ annotate=annotate,
933
1246
  existing=existing,
934
1247
  )
935
1248
 
936
- def show(
1249
+ def _render_multipage_highlights(
937
1250
  self,
938
- # --- Visualization Parameters ---
939
- group_by: Optional[str] = None,
940
- label: Optional[str] = None,
941
- color: Optional[Union[Tuple, str]] = None,
942
- label_format: Optional[str] = None,
943
- distinct: bool = False,
944
- include_attrs: Optional[List[str]] = None,
945
- # --- Rendering Parameters ---
946
- resolution: Optional[float] = None,
947
- labels: bool = True, # Use 'labels' consistent with service
948
- legend_position: str = "right",
949
- render_ocr: bool = False,
950
- width: Optional[int] = None, # Add width parameter
951
- page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
952
- crop: bool = False, # NEW: If True, crop output to element bounds
953
- ) -> Optional["Image.Image"]:
954
- """
955
- Generates a temporary preview image highlighting elements in this collection
956
- on their page, ignoring any persistent highlights.
957
-
958
- Currently only supports collections where all elements are on the same page
959
- of the same PDF.
960
-
961
- Allows grouping and coloring elements based on attributes, similar to the
962
- persistent `highlight()` method, but only for this temporary view.
963
-
964
- Args:
965
- group_by: Attribute name to group elements by for distinct colors/labels.
966
- label: Explicit label for all elements (overrides group_by).
967
- color: Explicit color for all elements (if label used) or base color.
968
- label_format: F-string to format group labels if group_by is used.
969
- distinct: Highlight each element distinctly (overrides group_by/label).
970
- include_attrs: Attributes to display on individual highlights.
971
- resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
972
- labels: Whether to include a legend for the temporary highlights.
973
- legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
974
- render_ocr: Whether to render OCR text.
975
- width: Optional width for the output image in pixels.
976
- crop: If True, crop the resulting image to the tight bounding box
977
- containing all elements in the collection. The elements are
978
- still highlighted first, then the image is cropped.
979
-
980
- Returns:
981
- PIL Image object of the temporary preview, or None if rendering fails or
982
- elements span multiple pages/PDFs.
1251
+ specs_by_page,
1252
+ resolution,
1253
+ width,
1254
+ labels,
1255
+ legend_position,
1256
+ group_by,
1257
+ label,
1258
+ color,
1259
+ label_format,
1260
+ distinct,
1261
+ annotate,
1262
+ render_ocr,
1263
+ crop,
1264
+ stack_direction="vertical",
1265
+ stack_gap=5,
1266
+ stack_background_color=(255, 255, 255),
1267
+ ):
1268
+ """Render highlights across multiple pages and stack them."""
1269
+ from PIL import Image
983
1270
 
984
- Raises:
985
- ValueError: If the collection is empty or elements are on different pages/PDFs.
986
- """
987
- # Apply global options as defaults, but allow explicit parameters to override
988
- import natural_pdf
1271
+ # Sort pages by index for consistent output
1272
+ sorted_pages = sorted(
1273
+ specs_by_page.keys(), key=lambda p: p.index if hasattr(p, "index") else 0
1274
+ )
989
1275
 
990
- # Use global options if parameters are not explicitly set
991
- if width is None:
992
- width = natural_pdf.options.image.width
993
- if resolution is None:
994
- if natural_pdf.options.image.resolution is not None:
995
- resolution = natural_pdf.options.image.resolution
996
- else:
997
- resolution = 144 # Default resolution when none specified
1276
+ page_images = []
998
1277
 
999
- if not self._elements:
1000
- raise ValueError("Cannot show an empty collection.")
1278
+ for page in sorted_pages:
1279
+ element_specs = specs_by_page[page]
1001
1280
 
1002
- # Check if elements are on multiple PDFs
1003
- if self._are_on_multiple_pdfs():
1004
- raise ValueError(
1005
- "show() currently only supports collections where all elements are from the same PDF."
1006
- )
1281
+ # Get highlighter service from the page
1282
+ if not hasattr(page, "_highlighter"):
1283
+ logger.warning(
1284
+ f"Page {getattr(page, 'number', '?')} has no highlighter service, skipping"
1285
+ )
1286
+ continue
1007
1287
 
1008
- # Check if elements are on multiple pages
1009
- if self._are_on_multiple_pages():
1010
- raise ValueError(
1011
- "show() currently only supports collections where all elements are on the same page."
1012
- )
1288
+ service = page._highlighter
1013
1289
 
1014
- # Get the page and highlighting service from the first element
1015
- first_element = self._elements[0]
1016
- if not hasattr(first_element, "page") or not first_element.page:
1017
- logger.warning("Cannot show collection: First element has no associated page.")
1018
- return None
1019
- page = first_element.page
1020
- if not hasattr(page, "pdf") or not page.pdf:
1021
- logger.warning("Cannot show collection: Page has no associated PDF object.")
1022
- return None
1290
+ # Prepare highlight data for this page
1291
+ highlight_data_list = []
1023
1292
 
1024
- service = page._highlighter
1025
- if not service:
1026
- logger.warning("Cannot show collection: PDF object has no highlighting service.")
1027
- return None
1293
+ for element_idx, spec in element_specs:
1294
+ # Use the element index to generate consistent colors/labels across pages
1295
+ element = spec.get(
1296
+ "element",
1297
+ self._elements[element_idx] if element_idx < len(self._elements) else None,
1298
+ )
1028
1299
 
1029
- # 1. Prepare temporary highlight data based on grouping parameters
1030
- # This returns a list of dicts, suitable for render_preview
1031
- highlight_data_list = self._prepare_highlight_data(
1032
- distinct=distinct,
1033
- label=label,
1034
- color=color,
1035
- group_by=group_by,
1036
- label_format=label_format,
1037
- include_attrs=include_attrs,
1038
- )
1300
+ # Prepare highlight data based on grouping parameters
1301
+ if distinct:
1302
+ # Use cycling colors for distinct mode
1303
+ element_color = None # Let the highlighter service pick from palette
1304
+ use_color_cycling = True
1305
+ element_label = (
1306
+ f"Element_{element_idx + 1}"
1307
+ if label is None
1308
+ else f"{label}_{element_idx + 1}"
1309
+ )
1310
+ elif label:
1311
+ # Explicit label for all elements
1312
+ element_color = color
1313
+ use_color_cycling = color is None
1314
+ element_label = label
1315
+ elif group_by and element:
1316
+ # Group by attribute
1317
+ try:
1318
+ group_key = getattr(element, group_by, None)
1319
+ element_label = self._format_group_label(
1320
+ group_key, label_format, element, group_by
1321
+ )
1322
+ element_color = None # Let service assign color by group
1323
+ use_color_cycling = True
1324
+ except:
1325
+ element_label = f"Element_{element_idx + 1}"
1326
+ element_color = color
1327
+ use_color_cycling = color is None
1328
+ else:
1329
+ # Default behavior
1330
+ element_color = color
1331
+ use_color_cycling = color is None
1332
+ element_label = f"Element_{element_idx + 1}"
1333
+
1334
+ # Build highlight data
1335
+ highlight_item = {
1336
+ "page_index": spec["page_index"],
1337
+ "bbox": spec["bbox"],
1338
+ "polygon": spec.get("polygon"),
1339
+ "color": element_color,
1340
+ "label": element_label if labels else None,
1341
+ "use_color_cycling": use_color_cycling,
1342
+ }
1343
+
1344
+ # Add attributes if requested
1345
+ if annotate and element:
1346
+ highlight_item["attributes_to_draw"] = {}
1347
+ for attr_name in annotate:
1348
+ try:
1349
+ attr_value = getattr(element, attr_name, None)
1350
+ if attr_value is not None:
1351
+ highlight_item["attributes_to_draw"][attr_name] = attr_value
1352
+ except:
1353
+ pass
1039
1354
 
1040
- if not highlight_data_list:
1041
- logger.warning("No highlight data generated for show(). Rendering clean page.")
1042
- # Render the page without any temporary highlights
1043
- highlight_data_list = []
1355
+ highlight_data_list.append(highlight_item)
1044
1356
 
1045
- # 2. Call render_preview on the HighlightingService
1046
- try:
1047
- # Calculate crop bounding box in PDF coordinates if crop is requested
1357
+ # Calculate crop bbox if requested
1048
1358
  crop_bbox = None
1049
1359
  if crop:
1050
1360
  try:
1051
- crop_bbox = (
1052
- min(el.x0 for el in self._elements),
1053
- min(el.top for el in self._elements),
1054
- max(el.x1 for el in self._elements),
1055
- max(el.bottom for el in self._elements),
1056
- )
1361
+ # Get bboxes from all specs on this page
1362
+ bboxes = [spec["bbox"] for _, spec in element_specs if spec.get("bbox")]
1363
+ if bboxes:
1364
+ crop_bbox = (
1365
+ min(bbox[0] for bbox in bboxes),
1366
+ min(bbox[1] for bbox in bboxes),
1367
+ max(bbox[2] for bbox in bboxes),
1368
+ max(bbox[3] for bbox in bboxes),
1369
+ )
1057
1370
  except Exception as bbox_err:
1058
- logger.error(
1059
- f"Error determining crop bbox for collection show: {bbox_err}",
1060
- exc_info=True,
1061
- )
1371
+ logger.error(f"Error determining crop bbox: {bbox_err}")
1062
1372
 
1063
- img = service.render_preview(
1064
- page_index=page.index,
1065
- temporary_highlights=highlight_data_list,
1066
- resolution=resolution,
1067
- width=width, # Pass the width parameter
1068
- labels=labels, # Use 'labels'
1069
- legend_position=legend_position,
1070
- render_ocr=render_ocr,
1071
- crop_bbox=crop_bbox,
1072
- )
1073
- return img
1074
- except Exception as e:
1075
- logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
1373
+ # Render this page
1374
+ try:
1375
+ img = service.render_preview(
1376
+ page_index=page.index,
1377
+ temporary_highlights=highlight_data_list,
1378
+ resolution=resolution,
1379
+ width=width,
1380
+ labels=labels,
1381
+ legend_position=legend_position,
1382
+ render_ocr=render_ocr,
1383
+ crop_bbox=crop_bbox,
1384
+ )
1385
+
1386
+ if img:
1387
+ page_images.append(img)
1388
+ except Exception as e:
1389
+ logger.error(
1390
+ f"Error rendering page {getattr(page, 'number', '?')}: {e}", exc_info=True
1391
+ )
1392
+
1393
+ if not page_images:
1394
+ logger.warning("Failed to render any pages")
1076
1395
  return None
1077
1396
 
1397
+ if len(page_images) == 1:
1398
+ return page_images[0]
1399
+
1400
+ # Stack the images
1401
+ if stack_direction == "vertical":
1402
+ final_width = max(img.width for img in page_images)
1403
+ final_height = (
1404
+ sum(img.height for img in page_images) + (len(page_images) - 1) * stack_gap
1405
+ )
1406
+
1407
+ stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
1408
+
1409
+ current_y = 0
1410
+ for img in page_images:
1411
+ # Center horizontally
1412
+ x_offset = (final_width - img.width) // 2
1413
+ stacked_image.paste(img, (x_offset, current_y))
1414
+ current_y += img.height + stack_gap
1415
+ else: # horizontal
1416
+ final_width = sum(img.width for img in page_images) + (len(page_images) - 1) * stack_gap
1417
+ final_height = max(img.height for img in page_images)
1418
+
1419
+ stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
1420
+
1421
+ current_x = 0
1422
+ for img in page_images:
1423
+ # Center vertically
1424
+ y_offset = (final_height - img.height) // 2
1425
+ stacked_image.paste(img, (current_x, y_offset))
1426
+ current_x += img.width + stack_gap
1427
+
1428
+ return stacked_image
1429
+
1078
1430
  def save(
1079
1431
  self,
1080
1432
  filename: str,
@@ -1110,8 +1462,8 @@ class ElementCollection(
1110
1462
  else:
1111
1463
  resolution = 144 # Default resolution when none specified
1112
1464
 
1113
- # Use to_image to generate and save the image
1114
- self.to_image(
1465
+ # Use export() to save the image
1466
+ self.export(
1115
1467
  path=filename,
1116
1468
  resolution=resolution,
1117
1469
  width=width,
@@ -1121,42 +1473,6 @@ class ElementCollection(
1121
1473
  )
1122
1474
  return self
1123
1475
 
1124
- def to_image(
1125
- self,
1126
- path: Optional[str] = None,
1127
- resolution: Optional[float] = None,
1128
- width: Optional[int] = None,
1129
- labels: bool = True,
1130
- legend_position: str = "right",
1131
- render_ocr: bool = False,
1132
- ) -> Optional["Image.Image"]:
1133
- """
1134
- Generate an image of the page with this collection's elements highlighted,
1135
- optionally saving it to a file.
1136
-
1137
- Args:
1138
- path: Optional path to save the image to
1139
- resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
1140
- width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
1141
- labels: Whether to include a legend for labels
1142
- legend_position: Position of the legend
1143
- render_ocr: Whether to render OCR text with white background boxes
1144
-
1145
- Returns:
1146
- PIL Image of the page with elements highlighted, or None if no valid page
1147
- """
1148
- # Get the page from the first element (if available)
1149
- if self._elements and hasattr(self._elements[0], "page"):
1150
- page = self._elements[0].page
1151
- # Generate the image using to_image
1152
- return page.to_image(
1153
- path=path,
1154
- resolution=resolution,
1155
- width=width,
1156
- labels=labels,
1157
- legend_position=legend_position,
1158
- render_ocr=render_ocr,
1159
- )
1160
1476
  return None
1161
1477
 
1162
1478
  def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
@@ -1216,17 +1532,57 @@ class ElementCollection(
1216
1532
  return str(group_key)
1217
1533
 
1218
1534
  def _get_element_highlight_params(
1219
- self, element: T, include_attrs: Optional[List[str]]
1535
+ self, element: T, annotate: Optional[List[str]]
1220
1536
  ) -> Optional[Dict]:
1221
1537
  """Extracts common parameters needed for highlighting a single element."""
1538
+ # For FlowRegions and other complex elements, use highlighting protocol
1539
+ if hasattr(element, "get_highlight_specs"):
1540
+ specs = element.get_highlight_specs()
1541
+ if not specs:
1542
+ logger.warning(f"Element {element} returned no highlight specs")
1543
+ return None
1544
+
1545
+ # For now, we'll use the first spec for the prepared data
1546
+ # The actual rendering will use all specs
1547
+ first_spec = specs[0]
1548
+ page = first_spec["page"]
1549
+
1550
+ base_data = {
1551
+ "page_index": first_spec["page_index"],
1552
+ "element": element,
1553
+ "annotate": annotate,
1554
+ "attributes_to_draw": {},
1555
+ "bbox": first_spec.get("bbox"),
1556
+ "polygon": first_spec.get("polygon"),
1557
+ "multi_spec": len(specs) > 1, # Flag to indicate multiple specs
1558
+ "all_specs": specs, # Store all specs for rendering
1559
+ }
1560
+
1561
+ # Extract attributes if requested
1562
+ if annotate:
1563
+ for attr_name in annotate:
1564
+ try:
1565
+ attr_value = getattr(element, attr_name, None)
1566
+ if attr_value is not None:
1567
+ base_data["attributes_to_draw"][attr_name] = attr_value
1568
+ except AttributeError:
1569
+ logger.warning(
1570
+ f"Attribute '{attr_name}' not found on element {element} for annotate"
1571
+ )
1572
+
1573
+ return base_data
1574
+
1575
+ # Fallback for regular elements with direct page access
1222
1576
  if not hasattr(element, "page"):
1577
+ logger.warning(f"Element {element} has no page attribute and no highlighting protocol")
1223
1578
  return None
1579
+
1224
1580
  page = element.page
1225
1581
 
1226
1582
  base_data = {
1227
1583
  "page_index": page.index,
1228
1584
  "element": element,
1229
- "include_attrs": include_attrs,
1585
+ "annotate": annotate,
1230
1586
  "attributes_to_draw": {},
1231
1587
  "bbox": None,
1232
1588
  "polygon": None,
@@ -1251,15 +1607,15 @@ class ElementCollection(
1251
1607
  return None
1252
1608
 
1253
1609
  # Extract attributes if requested
1254
- if include_attrs:
1255
- for attr_name in include_attrs:
1610
+ if annotate:
1611
+ for attr_name in annotate:
1256
1612
  try:
1257
1613
  attr_value = getattr(element, attr_name, None)
1258
1614
  if attr_value is not None:
1259
1615
  base_data["attributes_to_draw"][attr_name] = attr_value
1260
1616
  except AttributeError:
1261
1617
  logger.warning(
1262
- f"Attribute '{attr_name}' not found on element {element} for include_attrs"
1618
+ f"Attribute '{attr_name}' not found on element {element} for annotate"
1263
1619
  )
1264
1620
 
1265
1621
  return base_data
@@ -1416,7 +1772,7 @@ class ElementCollection(
1416
1772
 
1417
1773
  def correct_ocr(
1418
1774
  self,
1419
- correction_callback: Callable[[Any], Optional[str]],
1775
+ transform: Callable[[Any], Optional[str]],
1420
1776
  max_workers: Optional[int] = None,
1421
1777
  ) -> "ElementCollection":
1422
1778
  """
@@ -1425,10 +1781,10 @@ class ElementCollection(
1425
1781
  in parallel if `max_workers` is specified.
1426
1782
 
1427
1783
  Iterates through elements currently in the collection. If an element's
1428
- 'source' attribute starts with 'ocr', it calls the `correction_callback`
1784
+ 'source' attribute starts with 'ocr', it calls the `transform`
1429
1785
  for that element, passing the element itself.
1430
1786
 
1431
- The `correction_callback` should contain the logic to:
1787
+ The `transform` should contain the logic to:
1432
1788
  1. Determine if the element needs correction.
1433
1789
  2. Perform the correction (e.g., call an LLM).
1434
1790
  3. Return the new text (`str`) or `None`.
@@ -1438,8 +1794,8 @@ class ElementCollection(
1438
1794
  Elements without a source starting with 'ocr' are skipped.
1439
1795
 
1440
1796
  Args:
1441
- correction_callback: A function accepting an element and returning
1442
- `Optional[str]` (new text or None).
1797
+ transform: A function accepting an element and returning
1798
+ `Optional[str]` (new text or None).
1443
1799
  max_workers: The maximum number of worker threads to use for parallel
1444
1800
  correction on each page. If None, defaults are used.
1445
1801
 
@@ -1449,7 +1805,7 @@ class ElementCollection(
1449
1805
  # Delegate to the utility function
1450
1806
  _apply_ocr_correction_to_elements(
1451
1807
  elements=self._elements,
1452
- correction_callback=correction_callback,
1808
+ correction_callback=transform,
1453
1809
  caller_info=f"ElementCollection(len={len(self._elements)})", # Pass caller info
1454
1810
  max_workers=max_workers,
1455
1811
  )
@@ -1696,9 +2052,7 @@ class ElementCollection(
1696
2052
  image_path = image_dir / image_filename
1697
2053
 
1698
2054
  # Save image
1699
- element.to_image(
1700
- path=str(image_path), resolution=image_resolution, include_highlights=True
1701
- )
2055
+ element.show(path=str(image_path), resolution=image_resolution)
1702
2056
 
1703
2057
  # Add relative path to data
1704
2058
  element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
@@ -1986,8 +2340,8 @@ class ElementCollection(
1986
2340
  # ------------------------------------------------------------------
1987
2341
  def apply_ocr(
1988
2342
  self,
1989
- *,
1990
2343
  function: Optional[Callable[["Region"], Optional[str]]] = None,
2344
+ *,
1991
2345
  show_progress: bool = True,
1992
2346
  **kwargs,
1993
2347
  ) -> "ElementCollection":
@@ -2043,1154 +2397,3 @@ class ElementCollection(
2043
2397
  return self
2044
2398
 
2045
2399
  # ------------------------------------------------------------------
2046
-
2047
-
2048
- class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2049
- """
2050
- Represents a collection of Page objects, often from a single PDF document.
2051
- Provides methods for batch operations on these pages.
2052
- """
2053
-
2054
- def __init__(self, pages: Union[List[P], Sequence[P]]):
2055
- """
2056
- Initialize a page collection.
2057
-
2058
- Args:
2059
- pages: List or sequence of Page objects (can be lazy)
2060
- """
2061
- # Store the sequence as-is to preserve lazy behavior
2062
- # Only convert to list if we need list-specific operations
2063
- if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
2064
- self.pages = pages
2065
- else:
2066
- # Fallback for non-sequence types
2067
- self.pages = list(pages)
2068
-
2069
- def __len__(self) -> int:
2070
- """Return the number of pages in the collection."""
2071
- return len(self.pages)
2072
-
2073
- def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
2074
- """Support indexing and slicing."""
2075
- if isinstance(idx, slice):
2076
- return PageCollection(self.pages[idx])
2077
- return self.pages[idx]
2078
-
2079
- def __iter__(self) -> Iterator[P]:
2080
- """Support iteration."""
2081
- return iter(self.pages)
2082
-
2083
- def __repr__(self) -> str:
2084
- """Return a string representation showing the page count."""
2085
- return f"<PageCollection(count={len(self)})>"
2086
-
2087
- def _get_items_for_apply(self) -> Iterator[P]:
2088
- """
2089
- Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
2090
-
2091
- Returns an iterator that yields pages on-demand rather than materializing
2092
- all pages at once, maintaining the lazy loading behavior.
2093
- """
2094
- return iter(self.pages)
2095
-
2096
- def _get_page_indices(self) -> List[int]:
2097
- """
2098
- Get page indices without forcing materialization of pages.
2099
-
2100
- Returns:
2101
- List of page indices for the pages in this collection.
2102
- """
2103
- # Handle different types of page sequences efficiently
2104
- if hasattr(self.pages, '_indices'):
2105
- # If it's a _LazyPageList (or slice), get indices directly
2106
- return list(self.pages._indices)
2107
- else:
2108
- # Fallback: if pages are already materialized, get indices normally
2109
- # This will force materialization but only if pages aren't lazy
2110
- return [p.index for p in self.pages]
2111
-
2112
- def extract_text(
2113
- self,
2114
- keep_blank_chars: bool = True,
2115
- apply_exclusions: bool = True,
2116
- strip: Optional[bool] = None,
2117
- **kwargs,
2118
- ) -> str:
2119
- """
2120
- Extract text from all pages in the collection.
2121
-
2122
- Args:
2123
- keep_blank_chars: Whether to keep blank characters (default: True)
2124
- apply_exclusions: Whether to apply exclusion regions (default: True)
2125
- strip: Whether to strip whitespace from the extracted text.
2126
- **kwargs: Additional extraction parameters
2127
-
2128
- Returns:
2129
- Combined text from all pages
2130
- """
2131
- texts = []
2132
- for page in self.pages:
2133
- text = page.extract_text(
2134
- keep_blank_chars=keep_blank_chars,
2135
- apply_exclusions=apply_exclusions,
2136
- **kwargs,
2137
- )
2138
- texts.append(text)
2139
-
2140
- combined = "\n".join(texts)
2141
-
2142
- # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
2143
- use_layout = kwargs.get("layout", False)
2144
- strip_final = strip if strip is not None else (not use_layout)
2145
-
2146
- if strip_final:
2147
- combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
2148
-
2149
- return combined
2150
-
2151
- def apply_ocr(
2152
- self,
2153
- engine: Optional[str] = None,
2154
- # --- Common OCR Parameters (Direct Arguments) ---
2155
- languages: Optional[List[str]] = None,
2156
- min_confidence: Optional[float] = None, # Min confidence threshold
2157
- device: Optional[str] = None,
2158
- resolution: Optional[int] = None, # DPI for rendering
2159
- apply_exclusions: bool = True, # New parameter
2160
- replace: bool = True, # Whether to replace existing OCR elements
2161
- # --- Engine-Specific Options ---
2162
- options: Optional[Any] = None, # e.g., EasyOCROptions(...)
2163
- ) -> "PageCollection[P]":
2164
- """
2165
- Applies OCR to all pages within this collection using batch processing.
2166
-
2167
- This delegates the work to the parent PDF object's `apply_ocr` method.
2168
-
2169
- Args:
2170
- engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
2171
- languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
2172
- **Must be codes understood by the specific selected engine.**
2173
- No mapping is performed.
2174
- min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
2175
- device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
2176
- resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
2177
- apply_exclusions: If True (default), render page images for OCR with
2178
- excluded areas masked (whited out). If False, OCR
2179
- the raw page images without masking exclusions.
2180
- replace: If True (default), remove any existing OCR elements before
2181
- adding new ones. If False, add new OCR elements to existing ones.
2182
- options: An engine-specific options object (e.g., EasyOCROptions) or dict.
2183
-
2184
- Returns:
2185
- Self for method chaining.
2186
-
2187
- Raises:
2188
- RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
2189
- (Propagates exceptions from PDF.apply_ocr)
2190
- """
2191
- if not self.pages:
2192
- logger.warning("Cannot apply OCR to an empty PageCollection.")
2193
- return self
2194
-
2195
- # Assume all pages share the same parent PDF object
2196
- first_page = self.pages[0]
2197
- if not hasattr(first_page, "_parent") or not first_page._parent:
2198
- raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
2199
-
2200
- parent_pdf = first_page._parent
2201
-
2202
- if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
2203
- raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
2204
-
2205
- # Get the 0-based indices of the pages in this collection
2206
- page_indices = self._get_page_indices()
2207
-
2208
- logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
2209
-
2210
- # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
2211
- parent_pdf.apply_ocr(
2212
- pages=page_indices,
2213
- engine=engine,
2214
- languages=languages,
2215
- min_confidence=min_confidence, # Pass the renamed parameter
2216
- device=device,
2217
- resolution=resolution,
2218
- apply_exclusions=apply_exclusions, # Pass down
2219
- replace=replace, # Pass the replace parameter
2220
- options=options,
2221
- )
2222
- # The PDF method modifies the Page objects directly by adding elements.
2223
-
2224
- return self # Return self for chaining
2225
-
2226
- @overload
2227
- def find(
2228
- self,
2229
- *,
2230
- text: str,
2231
- contains: str = "all",
2232
- apply_exclusions: bool = True,
2233
- regex: bool = False,
2234
- case: bool = True,
2235
- **kwargs,
2236
- ) -> Optional[T]: ...
2237
-
2238
- @overload
2239
- def find(
2240
- self,
2241
- selector: str,
2242
- *,
2243
- contains: str = "all",
2244
- apply_exclusions: bool = True,
2245
- regex: bool = False,
2246
- case: bool = True,
2247
- **kwargs,
2248
- ) -> Optional[T]: ...
2249
-
2250
- def find(
2251
- self,
2252
- selector: Optional[str] = None,
2253
- *,
2254
- text: Optional[str] = None,
2255
- contains: str = "all",
2256
- apply_exclusions: bool = True,
2257
- regex: bool = False,
2258
- case: bool = True,
2259
- **kwargs,
2260
- ) -> Optional[T]:
2261
- """
2262
- Find the first element matching the selector OR text across all pages in the collection.
2263
-
2264
- Provide EITHER `selector` OR `text`, but not both.
2265
-
2266
- Args:
2267
- selector: CSS-like selector string.
2268
- text: Text content to search for (equivalent to 'text:contains(...)').
2269
- contains: How to determine if elements are inside: 'all' (fully inside),
2270
- 'any' (any overlap), or 'center' (center point inside).
2271
- (default: "all")
2272
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2273
- regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2274
- case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
2275
- **kwargs: Additional filter parameters.
2276
-
2277
- Returns:
2278
- First matching element or None.
2279
- """
2280
- # Input validation happens within page.find
2281
- for page in self.pages:
2282
- element = page.find(
2283
- selector=selector,
2284
- text=text,
2285
- contains=contains,
2286
- apply_exclusions=apply_exclusions,
2287
- regex=regex,
2288
- case=case,
2289
- **kwargs,
2290
- )
2291
- if element:
2292
- return element
2293
- return None
2294
-
2295
- @overload
2296
- def find_all(
2297
- self,
2298
- *,
2299
- text: str,
2300
- contains: str = "all",
2301
- apply_exclusions: bool = True,
2302
- regex: bool = False,
2303
- case: bool = True,
2304
- **kwargs,
2305
- ) -> "ElementCollection": ...
2306
-
2307
- @overload
2308
- def find_all(
2309
- self,
2310
- selector: str,
2311
- *,
2312
- contains: str = "all",
2313
- apply_exclusions: bool = True,
2314
- regex: bool = False,
2315
- case: bool = True,
2316
- **kwargs,
2317
- ) -> "ElementCollection": ...
2318
-
2319
- def find_all(
2320
- self,
2321
- selector: Optional[str] = None,
2322
- *,
2323
- text: Optional[str] = None,
2324
- contains: str = "all",
2325
- apply_exclusions: bool = True,
2326
- regex: bool = False,
2327
- case: bool = True,
2328
- **kwargs,
2329
- ) -> "ElementCollection":
2330
- """
2331
- Find all elements matching the selector OR text across all pages in the collection.
2332
-
2333
- Provide EITHER `selector` OR `text`, but not both.
2334
-
2335
- Args:
2336
- selector: CSS-like selector string.
2337
- text: Text content to search for (equivalent to 'text:contains(...)').
2338
- contains: How to determine if elements are inside: 'all' (fully inside),
2339
- 'any' (any overlap), or 'center' (center point inside).
2340
- (default: "all")
2341
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2342
- regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2343
- case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
2344
- **kwargs: Additional filter parameters.
2345
-
2346
- Returns:
2347
- ElementCollection with matching elements from all pages.
2348
- """
2349
- all_elements = []
2350
- # Input validation happens within page.find_all
2351
- for page in self.pages:
2352
- elements = page.find_all(
2353
- selector=selector,
2354
- text=text,
2355
- contains=contains,
2356
- apply_exclusions=apply_exclusions,
2357
- regex=regex,
2358
- case=case,
2359
- **kwargs,
2360
- )
2361
- if elements:
2362
- all_elements.extend(elements.elements)
2363
-
2364
- return ElementCollection(all_elements)
2365
-
2366
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    max_workers: Optional[int] = None,
) -> "PageCollection[P]":
    """
    Run a user-supplied OCR correction callback over the pages of this collection.

    No correction happens here directly: the call is handed to the
    ``correct_ocr`` method of the parent PDF (read from the first page's
    ``_parent``), limited to the indices returned by ``self._get_page_indices()``.

    Args:
        correction_callback: Callable that receives one element object and
            returns the replacement text, or None.
        max_workers: Forwarded unchanged to the parent PDF's ``correct_ocr``.

    Returns:
        This collection, so that calls can be chained.

    Raises:
        RuntimeError: If the first page's ``_parent`` is falsy or does not
            expose a callable ``correct_ocr`` attribute.
    """
    if not self.pages:
        # Nothing to do, but still hand back self so chaining keeps working.
        logger.warning("Cannot correct OCR for an empty PageCollection.")
        return self

    # All pages are assumed to hang off the same PDF, so the first is representative.
    owner = self.pages[0]._parent
    delegate = getattr(owner, "correct_ocr", None) if owner else None
    if not callable(delegate):
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'correct_ocr' method."
        )

    page_indices = self._get_page_indices()
    logger.info(
        f"PageCollection: Delegating correct_ocr to parent PDF for page indices: {page_indices} with max_workers={max_workers}."
    )

    # The parent does the actual work for exactly these pages.
    delegate(
        correction_callback=correction_callback,
        pages=page_indices,
        max_workers=max_workers,
    )
    return self
2422
-
2423
def get_sections(
    self,
    start_elements=None,
    end_elements=None,
    new_section_on_page_break=False,
    boundary_inclusion="both",
) -> "ElementCollection[Region]":
    """
    Split the pages of this collection into sections delimited by boundary elements.

    Args:
        start_elements: Elements, or a selector string resolved with
            ``self.find_all``, that mark where a section begins.
        end_elements: Elements, or a selector string, that mark where a section
            ends. When omitted, each start element closes the previous section.
        new_section_on_page_break: If True, a synthetic end marker is placed at
            the bottom of every page but the last and a synthetic start marker
            at the top of every page but the first.
        boundary_inclusion: Passed to ``page.get_section_between`` for sections
            that sit on a single page: 'start', 'end', 'both' or 'none'
            (default: 'both').

    Returns:
        ElementCollection of the sections found. Sections that span several
        pages are FlowRegion objects. The collection is empty when there are
        no start elements.
    """
    # Selector strings are resolved across every page of the collection.
    if isinstance(start_elements, str):
        start_elements = self.find_all(start_elements).elements
    if isinstance(end_elements, str):
        end_elements = self.find_all(end_elements).elements

    # Keep the return type uniform: always an ElementCollection, never a bare list.
    if not start_elements:
        return ElementCollection([])

    # Local import to avoid top-level cycles
    from natural_pdf.elements.region import Region

    if new_section_on_page_break:
        # Work on private copies so the synthetic markers never leak into the
        # lists (or collections) the caller handed in.
        start_elements = list(start_elements)
        end_elements = [] if end_elements is None else list(end_elements)

        for i in range(len(self.pages) - 1):
            # Artificial end marker: a 1-unit strip at the bottom of this page.
            page = self.pages[i]
            bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
            bottom_region.is_page_boundary = True
            end_elements.append(bottom_region)

            # Artificial start marker: a 1-unit strip at the top of the next page.
            next_page = self.pages[i + 1]
            top_region = Region(next_page, (0, 0, next_page.width, 1))
            top_region.is_page_boundary = True
            start_elements.append(top_region)

    # Every element of every page, in document order.
    all_elements = []
    for page in self.pages:
        all_elements.extend(page.get_elements())
    all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))

    def _boundaries_for(elements, kind):
        """Describe each usable element in *elements* as a boundary record of type *kind*."""
        found = []
        for element in elements:
            try:
                idx = all_elements.index(element)
            except ValueError:
                # Not a real page element; only synthetic page-break markers are kept.
                if not getattr(element, "is_page_boundary", False):
                    continue
                idx = -1  # Special index for page boundaries
            found.append(
                {
                    "index": idx,
                    "element": element,
                    "type": kind,
                    "page_idx": element.page.index,
                }
            )
        return found

    section_boundaries = _boundaries_for(start_elements, "start")
    if end_elements:
        section_boundaries.extend(_boundaries_for(end_elements, "end"))

    # Order by page, then by document position. Synthetic markers sort first on
    # their page when they are starts and last when they are ends.
    section_boundaries.sort(
        key=lambda x: (
            x["page_idx"],
            x["index"] if x["index"] != -1 else (0 if x["type"] == "start" else float("inf")),
        )
    )

    def _build_flow_region(start_el, end_el):
        """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
        If *end_el* is None, the region continues to the bottom of the last
        page in this PageCollection."""
        # Local imports to avoid top-level cycles
        from natural_pdf.flows.element import FlowElement
        from natural_pdf.flows.flow import Flow
        from natural_pdf.flows.region import FlowRegion

        start_pg = start_el.page
        end_pg = end_el.page if end_el is not None else self.pages[-1]

        # Slice of first page
        parts = [Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height))]

        # Full middle pages. Pages are matched on their own ``index`` rather than
        # by position in ``self.pages``: the two differ when this collection
        # holds only a subset of the PDF's pages.
        for mid_pg in self.pages:
            if start_pg.index < mid_pg.index < end_pg.index:
                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))

        # Slice of last page (if distinct)
        if end_pg is not start_pg:
            bottom = end_el.bottom if end_el is not None else end_pg.height
            parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))

        flow = Flow(segments=parts, arrangement="vertical")
        src_fe = FlowElement(physical_object=start_el, flow=flow)
        return FlowRegion(
            flow=flow,
            constituent_regions=parts,
            source_flow_element=src_fe,
            boundary_element_found=end_el,
        )

    def _to_page_bottom(start_el):
        """Return a Region running from the top of *start_el* to the bottom of its page."""
        pg = start_el.page
        region = Region(pg, (0, start_el.top, pg.width, pg.height))
        region.start_element = start_el
        return region

    sections = []
    current_start = None

    for boundary in section_boundaries:
        if boundary["type"] == "start" and current_start is None:
            # Open a new section.
            current_start = boundary

        elif boundary["type"] == "end" and current_start is not None:
            # Close the open section at this end boundary.
            start_element = current_start["element"]
            end_element = boundary["element"]

            if start_element.page == end_element.page:
                sections.append(
                    start_element.page.get_section_between(
                        start_element, end_element, boundary_inclusion
                    )
                )
            else:
                sections.append(_build_flow_region(start_element, end_element))

            current_start = None

        elif boundary["type"] == "start" and current_start is not None and not end_elements:
            # Start-only mode: a new start closes the section that is still open.
            start_element = current_start["element"]

            if start_element.page == boundary["element"].page:
                page_elements = [e for e in all_elements if e.page == start_element.page]
                page_elements.sort(key=lambda e: (e.top, e.x0))

                # The section ends at the last element before the new start.
                end_idx = (
                    page_elements.index(boundary["element"]) - 1
                    if boundary["element"] in page_elements
                    else -1
                )
                end_element = page_elements[end_idx] if end_idx >= 0 else None

                sections.append(
                    start_element.page.get_section_between(
                        start_element, end_element, boundary_inclusion
                    )
                )
            else:
                # The next start is on another page: run to the end of this one.
                sections.append(_to_page_bottom(start_element))

            current_start = boundary

    # A section may still be open once all boundaries are consumed.
    if current_start is not None:
        start_element = current_start["element"]

        if end_elements:
            # No matching end was found: close at the last element of the last page.
            last_page = self.pages[-1]
            last_page_elements = [e for e in all_elements if e.page == last_page]
            last_page_elements.sort(key=lambda e: (e.top, e.x0))
            end_element = last_page_elements[-1] if last_page_elements else None

            sections.append(_build_flow_region(start_element, end_element))
        else:
            # Start-only mode: the final section runs to the end of its page.
            sections.append(_to_page_bottom(start_element))

    return ElementCollection(sections)
2677
-
2678
def _gather_analysis_data(
    self,
    analysis_keys: List[str],
    include_content: bool,
    include_images: bool,
    image_dir: Optional[Path],
    image_format: str,
    image_resolution: int,
) -> List[Dict[str, Any]]:
    """
    Build one flat record per page, combining page metadata with analysis results.

    Args:
        analysis_keys: Keys to read from each page's ``analyses`` mapping.
        include_content: When True, add the page text under ``"content"``.
        include_images: When True, save a page image and record its path.
        image_dir: Directory the page images are written to.
        image_format: File extension used for the saved images.
        image_resolution: Resolution passed to ``page.save_image``.

    Returns:
        A list of dictionaries, one per page, in collection order. Analysis
        values are stored under ``"<analysis key>.<field>"``.

    Raises:
        KeyError: If a page has analyses but lacks one of ``analysis_keys``.
    """
    if not self.elements:
        logger.warning("No pages found in collection")
        return []

    records = []

    for page in self.elements:
        record = {
            "page_number": page.number,
            "page_index": page.index,
            "width": page.width,
            "height": page.height,
        }

        # Source-document details are optional; not every page carries a pdf.
        source_pdf = getattr(page, "pdf", None)
        if source_pdf:
            record["pdf_path"] = source_pdf.path
            record["pdf_filename"] = Path(source_pdf.path).name

        if include_content:
            # Text extraction is best effort: a failure yields empty content.
            try:
                record["content"] = page.extract_text(preserve_whitespace=True)
            except Exception as e:
                logger.error(f"Error extracting text from page {page.number}: {e}")
                record["content"] = ""

        if include_images:
            # Image export is best effort too: a failure yields a None path.
            try:
                pdf_name = Path(source_pdf.path).stem if source_pdf else "unknown"
                image_path = image_dir / f"{pdf_name}_page_{page.number}.{image_format}"

                page.save_image(
                    str(image_path), resolution=image_resolution, include_highlights=True
                )

                # Stored relative to the parent of the image directory.
                record["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
            except Exception as e:
                logger.error(f"Error saving image for page {page.number}: {e}")
                record["image_path"] = None

        analyses = getattr(page, "analyses", None)
        if analyses:
            for key in analysis_keys:
                if key not in analyses:
                    raise KeyError(f"Analysis key '{key}' not found in page {page.number}")

                outcome = analyses[key]

                # Prefer the result's own to_dict(), then a dict() conversion,
                # and fall back to the string form as a last resort.
                if hasattr(outcome, "to_dict"):
                    flattened = outcome.to_dict()
                else:
                    try:
                        flattened = dict(outcome)
                    except (TypeError, ValueError):
                        flattened = {"raw_result": str(outcome)}

                # Prefix each field with its analysis key to keep records flat.
                record.update({f"{key}.{k}": v for k, v in flattened.items()})

        records.append(record)

    return records
2778
-
2779
- # --- Deskew Method --- #
2780
-
2781
def deskew(
    self,
    resolution: int = 300,
    detection_resolution: int = 72,
    force_overwrite: bool = False,
    **deskew_kwargs,
) -> "PDF":
    """
    Return the result of deskewing this collection's pages via the parent PDF.

    The call is forwarded to the ``deskew`` method of the parent PDF (read from
    the first page's ``_parent``) for the indices returned by
    ``self._get_page_indices()``; whatever that method returns is returned here.

    Args:
        resolution: Forwarded to the parent's ``deskew`` as ``resolution``.
        detection_resolution: Forwarded as ``detection_resolution``.
        force_overwrite: Forwarded as ``force_overwrite``.
        **deskew_kwargs: Extra keyword arguments forwarded unchanged.

    Returns:
        The value returned by the parent PDF's ``deskew`` method.

    Raises:
        ValueError: If the collection is empty.
        RuntimeError: If the first page's ``_parent`` is falsy or does not
            expose a callable ``deskew`` attribute.
    """
    if not self.pages:
        logger.warning("Cannot deskew an empty PageCollection.")
        raise ValueError("Cannot deskew an empty PageCollection.")

    # All pages are assumed to share one parent PDF. The annotation is only a
    # hint for type checkers; local annotations are never evaluated at runtime.
    parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent

    can_delegate = bool(parent_pdf) and callable(getattr(parent_pdf, "deskew", None))
    if not can_delegate:
        raise RuntimeError(
            "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
        )

    page_indices = self._get_page_indices()
    logger.info(
        f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
    )

    return parent_pdf.deskew(
        pages=page_indices,
        resolution=resolution,
        detection_resolution=detection_resolution,
        force_overwrite=force_overwrite,
        **deskew_kwargs,
    )
2848
-
2849
- # --- End Deskew Method --- #
2850
-
2851
def to_image(
    self,
    page_width: Optional[int] = None,
    cols: Optional[int] = 4,
    rows: Optional[int] = None,
    max_pages: Optional[int] = None,
    spacing: int = 10,
    add_labels: bool = True,
    show_category: bool = False,
) -> Optional["Image.Image"]:
    """
    Generate a grid of page images for this collection.

    Args:
        page_width: Width in pixels for rendering individual pages. When None,
            ``natural_pdf.options.image.width`` is used, falling back to 300.
        cols: Number of columns in grid (default: 4)
        rows: Number of rows in grid (calculated automatically if None)
        max_pages: Maximum number of pages to include (default: all)
        spacing: Spacing between page thumbnails in pixels
        add_labels: Whether to add page number labels
        show_category: Whether to add category and confidence labels, read from
            ``page.analyses["classification"]`` when present

    Returns:
        PIL Image of the page grid, or None if Pillow is missing, the
        collection is empty, or no page could be rendered.
    """

    def _result_field(result, name):
        """Read *name* from a classification result given as a dict or as an object."""
        if isinstance(result, dict):
            return result.get(name)
        return getattr(result, name, None)

    # Determine default page width from global options if not explicitly provided
    if page_width is None:
        try:
            import natural_pdf

            page_width = natural_pdf.options.image.width or 300
        except Exception:
            # Fallback if natural_pdf import fails in some edge context
            page_width = 300

    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError:
        logger.error(
            "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
        )
        return None

    if not self.pages:
        logger.warning("Cannot generate image for empty PageCollection")
        return None

    pages_to_render = self.pages[:max_pages] if max_pages else self.pages

    # Load the label font once, outside the page loop.
    font = None
    if add_labels:
        font_loaders = (
            lambda: ImageFont.truetype("DejaVuSans.ttf", 16),
            lambda: ImageFont.load_default(16),
            # Pillow releases whose load_default() has no size parameter raise
            # TypeError on the call above, so retry without a size.
            ImageFont.load_default,
        )
        for load_font in font_loaders:
            try:
                font = load_font()
                break
            except (OSError, TypeError):
                continue
        if font is None:
            logger.warning("Default font not found. Labels cannot be added.")
            add_labels = False  # Disable if no font

    # Render individual page images
    page_images = []
    for page in pages_to_render:
        try:
            # Render with highlights for visual context
            img = page.to_image(width=page_width, include_highlights=True)
            if img is None:
                logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
                continue
        except Exception as img_err:
            logger.error(
                f"Error generating image for page {page.number}: {img_err}", exc_info=True
            )
            continue

        if add_labels and font:
            draw = ImageDraw.Draw(img)
            pdf_name = (
                Path(page.pdf.path).stem
                if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
                else ""
            )
            label_text = f"p{page.number}"
            if pdf_name:
                label_text += f" - {pdf_name}"

            if show_category:
                category = None
                confidence = None
                analyses = getattr(page, "analyses", None)
                if analyses and "classification" in analyses:
                    result = analyses["classification"]
                    category = _result_field(result, "label")
                    confidence = _result_field(result, "score")

                if category is not None and confidence is not None:
                    try:
                        # A real newline, so multiline_text puts it on its own line.
                        label_text += f"\n{category} ({confidence:.2f})"
                    except (TypeError, ValueError):
                        pass  # Ignore formatting errors

            # Draw a translucent background box, then the (possibly multi-line) label.
            try:
                bbox = draw.textbbox((5, 5), label_text, font=font, spacing=2)
                bg_rect = (
                    max(0, bbox[0] - 2),
                    max(0, bbox[1] - 2),
                    min(img.width, bbox[2] + 2),
                    min(img.height, bbox[3] + 2),
                )

                overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
                draw_overlay = ImageDraw.Draw(overlay)
                draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180))  # White with alpha
                img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
                draw = ImageDraw.Draw(img)  # The image object changed; rebind the drawer

                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except AttributeError:  # Fallback for older Pillow without textbbox
                # Fixed-size background; alignment is only approximate.
                draw.rectangle((2, 2, 150, 40), fill=(255, 255, 255, 180))
                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
            except Exception as draw_err:
                logger.error(
                    f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
                )

        page_images.append(img)

    if not page_images:
        logger.warning("No page images were successfully rendered for the grid.")
        return None

    # Calculate grid dimensions if not provided
    num_images = len(page_images)
    if not rows and not cols:
        cols = min(4, int(num_images**0.5) + 1)
        rows = (num_images + cols - 1) // cols
    elif rows and not cols:
        cols = (num_images + rows - 1) // rows
    elif cols and not rows:
        rows = (num_images + cols - 1) // cols
    cols = max(1, cols if cols else 1)  # Ensure at least 1
    rows = max(1, rows if rows else 1)

    # Every cell is sized to the largest rendered page.
    max_width = max(img.width for img in page_images)
    max_height = max(img.height for img in page_images)

    grid_width = cols * max_width + (cols + 1) * spacing
    grid_height = rows * max_height + (rows + 1) * spacing
    grid_img = Image.new("RGB", (grid_width, grid_height), (220, 220, 220))

    # Place images row by row; anything beyond the grid capacity is dropped.
    for i, img in enumerate(page_images):
        if i >= rows * cols:
            break

        row, col = divmod(i, cols)
        x = col * max_width + (col + 1) * spacing
        y = row * max_height + (row + 1) * spacing
        grid_img.paste(img, (x, y))

    return grid_img
3052
-
3053
def save_pdf(
    self,
    output_path: Union[str, Path],
    ocr: bool = False,
    original: bool = False,
    dpi: int = 300,
):
    """
    Write the pages of this collection to a new PDF file.

    Exactly one mode must be selected:
    - ``ocr=True``: hand the collection to ``create_searchable_pdf`` to produce
      an image-based PDF with a searchable OCR text layer.
    - ``original=True``: hand the collection to ``create_original_pdf`` to
      write the original pages; OCR text from this session is not included.

    Args:
        output_path: Path of the PDF file to write.
        ocr: Select the searchable, image-based mode.
        original: Select the original-pages mode.
        dpi: Passed to ``create_searchable_pdf``; used only when ``ocr=True``.

    Raises:
        ValueError: If the collection is empty, or if ``ocr`` and ``original``
            are not set to exactly one True value.
        ImportError: If the exporter for the chosen mode is unavailable.
        RuntimeError: If ``create_searchable_pdf`` fails (``ocr=True``).
            Errors from ``create_original_pdf`` propagate unchanged.
    """

    def _has_vector_content(page):
        """True if *page* holds rects, lines, curves, or any non-OCR char/word."""
        if any(getattr(page, attr, None) for attr in ("rects", "lines", "curves")):
            return True
        return any(
            getattr(el, "source", None) != "ocr"
            for attr in ("chars", "words")
            for el in getattr(page, attr, ())
        )

    def _has_ocr_text(page):
        """True if *page* holds OCR-sourced text, via find_all or a words scan."""
        if hasattr(page, "find_all"):
            return bool(page.find_all("text[source=ocr]"))
        if hasattr(page, "words"):
            return any(getattr(el, "source", None) == "ocr" for el in page.words)
        return False

    if not self.pages:
        raise ValueError("Cannot save an empty PageCollection.")

    if not (ocr ^ original):  # XOR: exactly one must be true
        raise ValueError("Exactly one of 'ocr' or 'original' must be True.")

    output_path_str = str(Path(output_path))

    if ocr:
        if create_searchable_pdf is None:
            raise ImportError(
                "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
                'Install with: pip install \\"natural-pdf[ocr-export]\\"'
            )

        # Warn, but do not fail, when vector content is about to be flattened.
        if any(_has_vector_content(page) for page in self.pages):
            logger.warning(
                "Warning: Saving with ocr=True creates an image-based PDF. "
                "Original vector elements (rects, lines, non-OCR text/chars) "
                "on selected pages will not be preserved in the output file."
            )

        logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
        try:
            # The collection itself is the source handed to the exporter.
            create_searchable_pdf(self, output_path_str, dpi=dpi)
        except Exception as e:
            logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
            raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
    else:
        # The XOR check above guarantees that ``original`` is set here.
        if create_original_pdf is None:
            raise ImportError(
                "Saving with original=True requires 'pikepdf'. "
                'Install with: pip install \\"natural-pdf[ocr-export]\\"'
            )

        # Warn, but do not fail, when session OCR text is going to be left out.
        if any(_has_ocr_text(page) for page in self.pages):
            logger.warning(
                "Warning: Saving with original=True preserves original page content. "
                "OCR text generated in this session will not be included in the saved file."
            )

        logger.info(f"Saving original pages PDF to: {output_path_str}")
        # Exporter errors propagate to the caller with their original type.
        create_original_pdf(self, output_path_str)
3183
-
3184
- # Alias .to_image() to .show() for convenience
3185
def show(
    self,
    *args,
    **kwargs,
) -> Optional["Image.Image"]:
    """Render the collection exactly as ``to_image`` does.

    A convenience alias: every positional and keyword argument is forwarded
    to :py:meth:`to_image` and its return value is handed back unchanged.
    """
    rendered = self.to_image(*args, **kwargs)
    return rendered