natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
  50. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
  51. optimization/memory_comparison.py +1 -1
  52. optimization/pdf_analyzer.py +2 -2
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
@@ -33,6 +33,9 @@ from natural_pdf.classification.manager import ClassificationManager
33
33
  from natural_pdf.classification.mixin import ClassificationMixin
34
34
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
35
35
  from natural_pdf.core.pdf import PDF
36
+
37
+ # Add Visualizable import
38
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
36
39
  from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
37
40
  from natural_pdf.elements.base import Element
38
41
  from natural_pdf.elements.region import Region
@@ -82,6 +85,7 @@ class ElementCollection(
82
85
  DirectionalCollectionMixin,
83
86
  DescribeMixin,
84
87
  InspectMixin,
88
+ Visualizable,
85
89
  MutableSequence,
86
90
  ):
87
91
  """Collection of PDF elements with batch operations.
@@ -171,13 +175,234 @@ class ElementCollection(
171
175
  """
172
176
  self._elements = elements or []
173
177
 
178
+ def _get_render_specs(
179
+ self,
180
+ mode: Literal["show", "render"] = "show",
181
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
182
+ highlights: Optional[List[Dict[str, Any]]] = None,
183
+ crop: Union[bool, Literal["content"]] = False,
184
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
185
+ group_by: Optional[str] = None,
186
+ bins: Optional[Union[int, List[float]]] = None,
187
+ annotate: Optional[List[str]] = None,
188
+ **kwargs,
189
+ ) -> List[RenderSpec]:
190
+ """Get render specifications for this element collection.
191
+
192
+ Args:
193
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
194
+ color: Default color for highlights in show mode (or colormap name when using group_by)
195
+ highlights: Additional highlight groups to show
196
+ crop: Whether to crop to element bounds
197
+ crop_bbox: Explicit crop bounds
198
+ group_by: Attribute to group elements by for color mapping
199
+ bins: Binning specification for quantitative data (int for equal-width bins, list for custom bins)
200
+ annotate: List of attribute names to display on highlights
201
+ **kwargs: Additional parameters
202
+
203
+ Returns:
204
+ List of RenderSpec objects, one per page with elements
205
+ """
206
+ if not self._elements:
207
+ return []
208
+
209
+ # Group elements by page
210
+ elements_by_page = {}
211
+ for elem in self._elements:
212
+ if hasattr(elem, "page"):
213
+ page = elem.page
214
+ if page not in elements_by_page:
215
+ elements_by_page[page] = []
216
+ elements_by_page[page].append(elem)
217
+
218
+ if not elements_by_page:
219
+ return []
220
+
221
+ # Create RenderSpec for each page
222
+ specs = []
223
+ for page, page_elements in elements_by_page.items():
224
+ spec = RenderSpec(page=page)
225
+
226
+ # Handle cropping
227
+ if crop_bbox:
228
+ spec.crop_bbox = crop_bbox
229
+ elif crop == "content" or crop is True:
230
+ # Calculate bounds of elements on this page
231
+ x_coords = []
232
+ y_coords = []
233
+ for elem in page_elements:
234
+ if hasattr(elem, "bbox") and elem.bbox:
235
+ x0, y0, x1, y1 = elem.bbox
236
+ x_coords.extend([x0, x1])
237
+ y_coords.extend([y0, y1])
238
+
239
+ if x_coords and y_coords:
240
+ spec.crop_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
241
+
242
+ # Add highlights in show mode
243
+ if mode == "show":
244
+ # Handle group_by parameter for quantitative/categorical grouping
245
+ if group_by is not None:
246
+ # Use the improved highlighting logic from _prepare_highlight_data
247
+ prepared_highlights = self._prepare_highlight_data(
248
+ group_by=group_by, color=color, bins=bins, annotate=annotate, **kwargs
249
+ )
250
+
251
+ # Check if we have quantitative metadata to preserve
252
+ quantitative_metadata = None
253
+ for highlight_data in prepared_highlights:
254
+ if (
255
+ "quantitative_metadata" in highlight_data
256
+ and highlight_data["quantitative_metadata"]
257
+ ):
258
+ quantitative_metadata = highlight_data["quantitative_metadata"]
259
+ break
260
+
261
+ # Add highlights from prepared data
262
+ for highlight_data in prepared_highlights:
263
+ # Only add elements from this page
264
+ elem = highlight_data.get("element")
265
+ if elem and hasattr(elem, "page") and elem.page == page:
266
+ # Create the highlight dict manually to preserve quantitative metadata
267
+ highlight_dict = {
268
+ "element": elem,
269
+ "color": highlight_data.get("color"),
270
+ "label": highlight_data.get("label"),
271
+ }
272
+
273
+ # Add quantitative metadata to the first highlight
274
+ if quantitative_metadata and not any(
275
+ h.get("quantitative_metadata") for h in spec.highlights
276
+ ):
277
+ highlight_dict["quantitative_metadata"] = quantitative_metadata
278
+
279
+ # Add annotate if provided in the prepared data
280
+ if "annotate" in highlight_data:
281
+ highlight_dict["annotate"] = highlight_data["annotate"]
282
+ if "attributes_to_draw" in highlight_data:
283
+ highlight_dict["attributes_to_draw"] = highlight_data[
284
+ "attributes_to_draw"
285
+ ]
286
+
287
+ # Extract geometry from element
288
+ if (
289
+ hasattr(elem, "polygon")
290
+ and hasattr(elem, "has_polygon")
291
+ and elem.has_polygon
292
+ ):
293
+ highlight_dict["polygon"] = elem.polygon
294
+ elif hasattr(elem, "bbox"):
295
+ highlight_dict["bbox"] = elem.bbox
296
+
297
+ spec.highlights.append(highlight_dict)
298
+ else:
299
+ # Default behavior when no group_by is specified
300
+ # Determine if all elements are of the same type
301
+ element_types = set(type(elem).__name__ for elem in page_elements)
302
+
303
+ if len(element_types) == 1:
304
+ # All elements are the same type - use a single label
305
+ type_name = element_types.pop()
306
+ # Generate a clean label from the type name
307
+ base_name = (
308
+ type_name.replace("Element", "").replace("Region", "")
309
+ if type_name != "Region"
310
+ else "Region"
311
+ )
312
+ # Handle special cases for common types
313
+ if base_name == "Text":
314
+ shared_label = "Text Elements"
315
+ elif base_name == "table_cell" or (
316
+ hasattr(page_elements[0], "region_type")
317
+ and page_elements[0].region_type == "table_cell"
318
+ ):
319
+ shared_label = "Table Cells"
320
+ elif base_name == "table":
321
+ shared_label = "Tables"
322
+ else:
323
+ shared_label = f"{base_name} Elements" if base_name else "Elements"
324
+
325
+ # Add all elements with the same label (no color cycling)
326
+ for elem in page_elements:
327
+ # Get element highlight params with annotate
328
+ element_data = self._get_element_highlight_params(elem, annotate)
329
+ if element_data:
330
+ # Use add_highlight with basic params
331
+ spec.add_highlight(
332
+ element=elem,
333
+ color=color, # Use provided color or None
334
+ label=shared_label,
335
+ )
336
+ # Update last highlight with attributes if present
337
+ if element_data.get("attributes_to_draw") and spec.highlights:
338
+ spec.highlights[-1]["attributes_to_draw"] = element_data[
339
+ "attributes_to_draw"
340
+ ]
341
+ else:
342
+ # Mixed types - use individual labels (existing behavior)
343
+ for elem in page_elements:
344
+ # Get element highlight params with annotate
345
+ element_data = self._get_element_highlight_params(elem, annotate)
346
+ if element_data:
347
+ spec.add_highlight(
348
+ element=elem,
349
+ color=color,
350
+ label=getattr(elem, "text", None) or str(elem),
351
+ )
352
+ # Update last highlight with attributes if present
353
+ if element_data.get("attributes_to_draw") and spec.highlights:
354
+ spec.highlights[-1]["attributes_to_draw"] = element_data[
355
+ "attributes_to_draw"
356
+ ]
357
+
358
+ # Add additional highlight groups if provided
359
+ if highlights:
360
+ for group in highlights:
361
+ group_elements = group.get("elements", [])
362
+ group_color = group.get("color", color)
363
+ group_label = group.get("label")
364
+
365
+ # Only add elements from this page
366
+ for elem in group_elements:
367
+ if hasattr(elem, "page") and elem.page == page:
368
+ spec.add_highlight(
369
+ element=elem, color=group_color, label=group_label
370
+ )
371
+
372
+ specs.append(spec)
373
+
374
+ return specs
375
+
376
+ def _get_highlighter(self):
377
+ """Get the highlighting service for rendering.
378
+
379
+ For ElementCollection, we get it from the first element's page.
380
+ """
381
+ if not self._elements:
382
+ raise RuntimeError("Cannot get highlighter from empty ElementCollection")
383
+
384
+ # Try to get highlighter from first element's page
385
+ for elem in self._elements:
386
+ if hasattr(elem, "page") and hasattr(elem.page, "_highlighter"):
387
+ return elem.page._highlighter
388
+
389
+ # If no elements have pages, we can't render
390
+ raise RuntimeError(
391
+ "Cannot find HighlightingService. ElementCollection elements don't have page access."
392
+ )
393
+
174
394
  def __len__(self) -> int:
175
395
  """Get the number of elements in the collection."""
176
396
  return len(self._elements)
177
397
 
178
- def __getitem__(self, index: int) -> "Element":
179
- """Get an element by index."""
180
- return self._elements[index]
398
+ def __getitem__(self, index: Union[int, slice]) -> Union["Element", "ElementCollection"]:
399
+ """Get an element by index or a collection by slice."""
400
+ if isinstance(index, slice):
401
+ # Return a new ElementCollection for slices
402
+ return ElementCollection(self._elements[index])
403
+ else:
404
+ # Return the element for integer indices
405
+ return self._elements[index]
181
406
 
182
407
  def __repr__(self) -> str:
183
408
  """Return a string representation showing the element count."""
@@ -423,6 +648,7 @@ class ElementCollection(
423
648
  # Apply content filtering if provided
424
649
  if content_filter is not None:
425
650
  from natural_pdf.utils.text_extraction import _apply_content_filter
651
+
426
652
  all_char_dicts = _apply_content_filter(all_char_dicts, content_filter)
427
653
 
428
654
  # Check if layout is requested
@@ -534,8 +760,9 @@ class ElementCollection(
534
760
  group_by: Optional[str] = None,
535
761
  label_format: Optional[str] = None,
536
762
  distinct: bool = False,
537
- include_attrs: Optional[List[str]] = None,
763
+ annotate: Optional[List[str]] = None,
538
764
  replace: bool = False,
765
+ bins: Optional[Union[int, List[float]]] = None,
539
766
  ) -> "ElementCollection":
540
767
  """
541
768
  Adds persistent highlights for all elements in the collection to the page
@@ -553,12 +780,15 @@ class ElementCollection(
553
780
  label: Optional explicit label for the entire collection. If provided,
554
781
  all elements are highlighted as a single group with this label,
555
782
  ignoring 'group_by' and the default type-based grouping.
556
- color: Optional explicit color for the highlight (tuple/string). Applied
557
- consistently if 'label' is provided or if grouping occurs.
783
+ color: Optional explicit color for the highlight (tuple/string), or
784
+ matplotlib colormap name for quantitative group_by (e.g., 'viridis', 'plasma',
785
+ 'inferno', 'coolwarm', 'RdBu'). Applied consistently if 'label' is provided
786
+ or if grouping occurs.
558
787
  group_by: Optional attribute name present on the elements. If provided
559
788
  (and 'label' is None), elements will be grouped based on the
560
789
  value of this attribute, and each group will be highlighted
561
- with a distinct label and color.
790
+ with a distinct label and color. Automatically detects quantitative
791
+ data and uses gradient colormaps when appropriate.
562
792
  label_format: Optional Python f-string to format the group label when
563
793
  'group_by' is used. Can reference element attributes
564
794
  (e.g., "Type: {region_type}, Conf: {confidence:.2f}").
@@ -566,11 +796,14 @@ class ElementCollection(
566
796
  distinct: If True, bypasses all grouping and highlights each element
567
797
  individually with cycling colors (the previous default behavior).
568
798
  (default: False)
569
- include_attrs: List of attribute names from the element to display directly
570
- on the highlight itself (distinct from group label).
799
+ annotate: List of attribute names from the element to display directly
800
+ on the highlight itself (distinct from group label).
571
801
  replace: If True, existing highlights on the affected page(s)
572
802
  are cleared before adding these highlights.
573
803
  If False (default), highlights are appended to existing ones.
804
+ bins: Optional binning specification for quantitative data when using group_by.
805
+ Can be an integer (number of equal-width bins) or a list of bin edges.
806
+ Only used when group_by contains quantitative data.
574
807
 
575
808
  Returns:
576
809
  Self for method chaining
@@ -592,7 +825,8 @@ class ElementCollection(
592
825
  color=color,
593
826
  group_by=group_by,
594
827
  label_format=label_format,
595
- include_attrs=include_attrs,
828
+ annotate=annotate,
829
+ bins=bins,
596
830
  # 'replace' flag is handled during the add call below
597
831
  )
598
832
 
@@ -633,7 +867,7 @@ class ElementCollection(
633
867
  "use_color_cycling", False
634
868
  ), # Set by _prepare if distinct
635
869
  "element": data["element"],
636
- "include_attrs": data["include_attrs"],
870
+ "annotate": data["annotate"],
637
871
  # Internal call to service always appends, as clearing was handled above
638
872
  "existing": "append",
639
873
  }
@@ -655,7 +889,8 @@ class ElementCollection(
655
889
  color: Optional[Union[Tuple, str]] = None,
656
890
  group_by: Optional[str] = None,
657
891
  label_format: Optional[str] = None,
658
- include_attrs: Optional[List[str]] = None,
892
+ annotate: Optional[List[str]] = None,
893
+ bins: Optional[Union[int, List[float]]] = None,
659
894
  ) -> List[Dict]:
660
895
  """
661
896
  Determines the parameters for highlighting each element based on the strategy.
@@ -664,7 +899,7 @@ class ElementCollection(
664
899
 
665
900
  Returns:
666
901
  List of dictionaries, each containing parameters for a single highlight
667
- (e.g., page_index, bbox/polygon, color, label, element, include_attrs, attributes_to_draw).
902
+ (e.g., page_index, bbox/polygon, color, label, element, annotate, attributes_to_draw).
668
903
  Color and label determination happens here.
669
904
  """
670
905
  prepared_data = []
@@ -672,11 +907,25 @@ class ElementCollection(
672
907
  return prepared_data
673
908
 
674
909
  # Need access to the HighlightingService to determine colors correctly.
910
+ # Use highlighting protocol to find a valid service from any element
675
911
  highlighter = None
676
- first_element = self._elements[0]
677
- if hasattr(first_element, "page") and hasattr(first_element.page, "_highlighter"):
678
- highlighter = first_element.page._highlighter
679
- else:
912
+
913
+ for element in self._elements:
914
+ # Try direct page access first (for regular elements)
915
+ if hasattr(element, "page") and hasattr(element.page, "_highlighter"):
916
+ highlighter = element.page._highlighter
917
+ break
918
+ # Try highlighting protocol for FlowRegions and other complex elements
919
+ elif hasattr(element, "get_highlight_specs"):
920
+ specs = element.get_highlight_specs()
921
+ for spec in specs:
922
+ if "page" in spec and hasattr(spec["page"], "_highlighter"):
923
+ highlighter = spec["page"]._highlighter
924
+ break
925
+ if highlighter:
926
+ break
927
+
928
+ if not highlighter:
680
929
  logger.warning(
681
930
  "Cannot determine highlight colors: HighlightingService not accessible from elements."
682
931
  )
@@ -689,7 +938,7 @@ class ElementCollection(
689
938
  final_color = highlighter._determine_highlight_color(
690
939
  label=None, color_input=None, use_color_cycling=True
691
940
  )
692
- element_data = self._get_element_highlight_params(element, include_attrs)
941
+ element_data = self._get_element_highlight_params(element, annotate)
693
942
  if element_data:
694
943
  element_data.update(
695
944
  {"color": final_color, "label": None, "use_color_cycling": True}
@@ -702,7 +951,7 @@ class ElementCollection(
702
951
  label=label, color_input=color, use_color_cycling=False
703
952
  )
704
953
  for element in self._elements:
705
- element_data = self._get_element_highlight_params(element, include_attrs)
954
+ element_data = self._get_element_highlight_params(element, annotate)
706
955
  if element_data:
707
956
  element_data.update({"color": final_color, "label": label})
708
957
  prepared_data.append(element_data)
@@ -710,23 +959,84 @@ class ElementCollection(
710
959
  elif group_by is not None:
711
960
  logger.debug("_prepare: Grouping by attribute strategy.")
712
961
  grouped_elements = self._group_elements_by_attr(group_by)
962
+
963
+ # Collect all values for quantitative detection
964
+ all_values = []
713
965
  for group_key, group_elements in grouped_elements.items():
714
- if not group_elements:
715
- continue
716
- group_label = self._format_group_label(
717
- group_key, label_format, group_elements[0], group_by
718
- )
719
- final_color = highlighter._determine_highlight_color(
720
- label=group_label, color_input=None, use_color_cycling=False
721
- )
722
- logger.debug(
723
- f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
966
+ if group_elements:
967
+ all_values.append(group_key)
968
+
969
+ # Import the quantitative detection function
970
+ from natural_pdf.utils.visualization import (
971
+ create_quantitative_color_mapping,
972
+ detect_quantitative_data,
973
+ )
974
+
975
+ # Determine if we should use quantitative color mapping
976
+ use_quantitative = detect_quantitative_data(all_values)
977
+
978
+ if use_quantitative:
979
+ logger.debug(" _prepare: Using quantitative color mapping.")
980
+ # Use quantitative color mapping with specified colormap
981
+ colormap_name = color if isinstance(color, str) else "viridis"
982
+ value_to_color = create_quantitative_color_mapping(
983
+ all_values, colormap=colormap_name, bins=bins
724
984
  )
725
- for element in group_elements:
726
- element_data = self._get_element_highlight_params(element, include_attrs)
727
- if element_data:
728
- element_data.update({"color": final_color, "label": group_label})
729
- prepared_data.append(element_data)
985
+
986
+ # Store quantitative metadata for colorbar creation
987
+ quantitative_metadata = {
988
+ "values": all_values,
989
+ "colormap": colormap_name,
990
+ "bins": bins,
991
+ "attribute": group_by,
992
+ }
993
+
994
+ for group_key, group_elements in grouped_elements.items():
995
+ if not group_elements:
996
+ continue
997
+ group_label = self._format_group_label(
998
+ group_key, label_format, group_elements[0], group_by
999
+ )
1000
+
1001
+ # Get quantitative color for this value
1002
+ final_color = value_to_color.get(group_key)
1003
+ if final_color is None:
1004
+ # Fallback to traditional color assignment
1005
+ final_color = highlighter._determine_highlight_color(
1006
+ label=group_label, color_input=None, use_color_cycling=False
1007
+ )
1008
+
1009
+ logger.debug(
1010
+ f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
1011
+ )
1012
+ for element in group_elements:
1013
+ element_data = self._get_element_highlight_params(element, annotate)
1014
+ if element_data:
1015
+ element_data.update({"color": final_color, "label": group_label})
1016
+ # Add quantitative metadata to the first element in each group
1017
+ if not any("quantitative_metadata" in pd for pd in prepared_data):
1018
+ element_data["quantitative_metadata"] = quantitative_metadata
1019
+ prepared_data.append(element_data)
1020
+ else:
1021
+ logger.debug(" _prepare: Using categorical color mapping.")
1022
+ # Use traditional categorical color mapping
1023
+ for group_key, group_elements in grouped_elements.items():
1024
+ if not group_elements:
1025
+ continue
1026
+ group_label = self._format_group_label(
1027
+ group_key, label_format, group_elements[0], group_by
1028
+ )
1029
+ final_color = highlighter._determine_highlight_color(
1030
+ label=group_label, color_input=None, use_color_cycling=False
1031
+ )
1032
+ logger.debug(
1033
+ f" _prepare group '{group_label}' ({len(group_elements)} elements) -> color {final_color}"
1034
+ )
1035
+ for element in group_elements:
1036
+ element_data = self._get_element_highlight_params(element, annotate)
1037
+ if element_data:
1038
+ element_data.update({"color": final_color, "label": group_label})
1039
+ prepared_data.append(element_data)
730
1040
  else:
731
1041
  logger.debug("_prepare: Default grouping strategy.")
732
1042
  element_types = set(type(el).__name__ for el in self._elements)
@@ -745,7 +1055,7 @@ class ElementCollection(
745
1055
  )
746
1056
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
747
1057
  for element in self._elements:
748
- element_data = self._get_element_highlight_params(element, include_attrs)
1058
+ element_data = self._get_element_highlight_params(element, annotate)
749
1059
  if element_data:
750
1060
  element_data.update({"color": final_color, "label": auto_label})
751
1061
  prepared_data.append(element_data)
@@ -764,7 +1074,7 @@ class ElementCollection(
764
1074
  # Determine color *before* logging or using it (already done above for this branch)
765
1075
  logger.debug(f" _prepare default group '{auto_label}' -> color {final_color}")
766
1076
  for element in self._elements:
767
- element_data = self._get_element_highlight_params(element, include_attrs)
1077
+ element_data = self._get_element_highlight_params(element, annotate)
768
1078
  if element_data:
769
1079
  element_data.update({"color": final_color, "label": auto_label})
770
1080
  prepared_data.append(element_data)
@@ -777,7 +1087,7 @@ class ElementCollection(
777
1087
  color: Optional[Union[Tuple, str]],
778
1088
  label: Optional[str],
779
1089
  use_color_cycling: bool,
780
- include_attrs: Optional[List[str]],
1090
+ annotate: Optional[List[str]],
781
1091
  existing: str,
782
1092
  ):
783
1093
  """Low-level helper to call the appropriate HighlightingService method for an element."""
@@ -793,7 +1103,7 @@ class ElementCollection(
793
1103
  "color": color,
794
1104
  "label": label,
795
1105
  "use_color_cycling": use_color_cycling,
796
- "include_attrs": include_attrs,
1106
+ "annotate": annotate,
797
1107
  "existing": existing,
798
1108
  "element": element,
799
1109
  }
@@ -828,7 +1138,7 @@ class ElementCollection(
828
1138
  self,
829
1139
  label: str,
830
1140
  color: Optional[Union[Tuple, str]],
831
- include_attrs: Optional[List[str]],
1141
+ annotate: Optional[List[str]],
832
1142
  existing: str,
833
1143
  ):
834
1144
  """Highlights all elements with the same explicit label and color."""
@@ -838,7 +1148,7 @@ class ElementCollection(
838
1148
  color=color, # Use explicit color if provided
839
1149
  label=label, # Use the explicit group label
840
1150
  use_color_cycling=False, # Use consistent color for the label
841
- include_attrs=include_attrs,
1151
+ annotate=annotate,
842
1152
  existing=existing,
843
1153
  )
844
1154
 
@@ -846,7 +1156,7 @@ class ElementCollection(
846
1156
  self,
847
1157
  group_by: str,
848
1158
  label_format: Optional[str],
849
- include_attrs: Optional[List[str]],
1159
+ annotate: Optional[List[str]],
850
1160
  existing: str,
851
1161
  ):
852
1162
  """Groups elements by attribute and highlights each group distinctly."""
@@ -918,11 +1228,11 @@ class ElementCollection(
918
1228
  color=None, # Let ColorManager choose based on label
919
1229
  label=group_label, # Use the derived group label
920
1230
  use_color_cycling=False, # Use consistent color for the label
921
- include_attrs=include_attrs,
1231
+ annotate=annotate,
922
1232
  existing=existing,
923
1233
  )
924
1234
 
925
- def _highlight_distinctly(self, include_attrs: Optional[List[str]], existing: str):
1235
+ def _highlight_distinctly(self, annotate: Optional[List[str]], existing: str):
926
1236
  """DEPRECATED: Logic moved to _prepare_highlight_data. Kept for reference/potential reuse."""
927
1237
  # This method is no longer called directly by the main highlight path.
928
1238
  # The distinct logic is handled within _prepare_highlight_data.
@@ -932,152 +1242,191 @@ class ElementCollection(
932
1242
  color=None, # Let ColorManager cycle
933
1243
  label=None, # No label for distinct elements
934
1244
  use_color_cycling=True, # Force cycling
935
- include_attrs=include_attrs,
1245
+ annotate=annotate,
936
1246
  existing=existing,
937
1247
  )
938
1248
 
939
- def show(
1249
+ def _render_multipage_highlights(
940
1250
  self,
941
- # --- Visualization Parameters ---
942
- group_by: Optional[str] = None,
943
- label: Optional[str] = None,
944
- color: Optional[Union[Tuple, str]] = None,
945
- label_format: Optional[str] = None,
946
- distinct: bool = False,
947
- include_attrs: Optional[List[str]] = None,
948
- # --- Rendering Parameters ---
949
- resolution: Optional[float] = None,
950
- labels: bool = True, # Use 'labels' consistent with service
951
- legend_position: str = "right",
952
- render_ocr: bool = False,
953
- width: Optional[int] = None, # Add width parameter
954
- page: Optional[Any] = None, # NEW: Optional page parameter for empty collections
955
- crop: bool = False, # NEW: If True, crop output to element bounds
956
- ) -> Optional["Image.Image"]:
957
- """
958
- Generates a temporary preview image highlighting elements in this collection
959
- on their page, ignoring any persistent highlights.
960
-
961
- Currently only supports collections where all elements are on the same page
962
- of the same PDF.
963
-
964
- Allows grouping and coloring elements based on attributes, similar to the
965
- persistent `highlight()` method, but only for this temporary view.
966
-
967
- Args:
968
- group_by: Attribute name to group elements by for distinct colors/labels.
969
- label: Explicit label for all elements (overrides group_by).
970
- color: Explicit color for all elements (if label used) or base color.
971
- label_format: F-string to format group labels if group_by is used.
972
- distinct: Highlight each element distinctly (overrides group_by/label).
973
- include_attrs: Attributes to display on individual highlights.
974
- resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
975
- labels: Whether to include a legend for the temporary highlights.
976
- legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
977
- render_ocr: Whether to render OCR text.
978
- width: Optional width for the output image in pixels.
979
- crop: If True, crop the resulting image to the tight bounding box
980
- containing all elements in the collection. The elements are
981
- still highlighted first, then the image is cropped.
982
-
983
- Returns:
984
- PIL Image object of the temporary preview, or None if rendering fails or
985
- elements span multiple pages/PDFs.
1251
+ specs_by_page,
1252
+ resolution,
1253
+ width,
1254
+ labels,
1255
+ legend_position,
1256
+ group_by,
1257
+ label,
1258
+ color,
1259
+ label_format,
1260
+ distinct,
1261
+ annotate,
1262
+ render_ocr,
1263
+ crop,
1264
+ stack_direction="vertical",
1265
+ stack_gap=5,
1266
+ stack_background_color=(255, 255, 255),
1267
+ ):
1268
+ """Render highlights across multiple pages and stack them."""
1269
+ from PIL import Image
986
1270
 
987
- Raises:
988
- ValueError: If the collection is empty or elements are on different pages/PDFs.
989
- """
990
- # Apply global options as defaults, but allow explicit parameters to override
991
- import natural_pdf
1271
+ # Sort pages by index for consistent output
1272
+ sorted_pages = sorted(
1273
+ specs_by_page.keys(), key=lambda p: p.index if hasattr(p, "index") else 0
1274
+ )
992
1275
 
993
- # Use global options if parameters are not explicitly set
994
- if width is None:
995
- width = natural_pdf.options.image.width
996
- if resolution is None:
997
- if natural_pdf.options.image.resolution is not None:
998
- resolution = natural_pdf.options.image.resolution
999
- else:
1000
- resolution = 144 # Default resolution when none specified
1276
+ page_images = []
1001
1277
 
1002
- if not self._elements:
1003
- raise ValueError("Cannot show an empty collection.")
1278
+ for page in sorted_pages:
1279
+ element_specs = specs_by_page[page]
1004
1280
 
1005
- # Check if elements are on multiple PDFs
1006
- if self._are_on_multiple_pdfs():
1007
- raise ValueError(
1008
- "show() currently only supports collections where all elements are from the same PDF."
1009
- )
1281
+ # Get highlighter service from the page
1282
+ if not hasattr(page, "_highlighter"):
1283
+ logger.warning(
1284
+ f"Page {getattr(page, 'number', '?')} has no highlighter service, skipping"
1285
+ )
1286
+ continue
1010
1287
 
1011
- # Check if elements are on multiple pages
1012
- if self._are_on_multiple_pages():
1013
- raise ValueError(
1014
- "show() currently only supports collections where all elements are on the same page."
1015
- )
1288
+ service = page._highlighter
1016
1289
 
1017
- # Get the page and highlighting service from the first element
1018
- first_element = self._elements[0]
1019
- if not hasattr(first_element, "page") or not first_element.page:
1020
- logger.warning("Cannot show collection: First element has no associated page.")
1021
- return None
1022
- page = first_element.page
1023
- if not hasattr(page, "pdf") or not page.pdf:
1024
- logger.warning("Cannot show collection: Page has no associated PDF object.")
1025
- return None
1290
+ # Prepare highlight data for this page
1291
+ highlight_data_list = []
1026
1292
 
1027
- service = page._highlighter
1028
- if not service:
1029
- logger.warning("Cannot show collection: PDF object has no highlighting service.")
1030
- return None
1293
+ for element_idx, spec in element_specs:
1294
+ # Use the element index to generate consistent colors/labels across pages
1295
+ element = spec.get(
1296
+ "element",
1297
+ self._elements[element_idx] if element_idx < len(self._elements) else None,
1298
+ )
1031
1299
 
1032
- # 1. Prepare temporary highlight data based on grouping parameters
1033
- # This returns a list of dicts, suitable for render_preview
1034
- highlight_data_list = self._prepare_highlight_data(
1035
- distinct=distinct,
1036
- label=label,
1037
- color=color,
1038
- group_by=group_by,
1039
- label_format=label_format,
1040
- include_attrs=include_attrs,
1041
- )
1300
+ # Prepare highlight data based on grouping parameters
1301
+ if distinct:
1302
+ # Use cycling colors for distinct mode
1303
+ element_color = None # Let the highlighter service pick from palette
1304
+ use_color_cycling = True
1305
+ element_label = (
1306
+ f"Element_{element_idx + 1}"
1307
+ if label is None
1308
+ else f"{label}_{element_idx + 1}"
1309
+ )
1310
+ elif label:
1311
+ # Explicit label for all elements
1312
+ element_color = color
1313
+ use_color_cycling = color is None
1314
+ element_label = label
1315
+ elif group_by and element:
1316
+ # Group by attribute
1317
+ try:
1318
+ group_key = getattr(element, group_by, None)
1319
+ element_label = self._format_group_label(
1320
+ group_key, label_format, element, group_by
1321
+ )
1322
+ element_color = None # Let service assign color by group
1323
+ use_color_cycling = True
1324
+ except:
1325
+ element_label = f"Element_{element_idx + 1}"
1326
+ element_color = color
1327
+ use_color_cycling = color is None
1328
+ else:
1329
+ # Default behavior
1330
+ element_color = color
1331
+ use_color_cycling = color is None
1332
+ element_label = f"Element_{element_idx + 1}"
1333
+
1334
+ # Build highlight data
1335
+ highlight_item = {
1336
+ "page_index": spec["page_index"],
1337
+ "bbox": spec["bbox"],
1338
+ "polygon": spec.get("polygon"),
1339
+ "color": element_color,
1340
+ "label": element_label if labels else None,
1341
+ "use_color_cycling": use_color_cycling,
1342
+ }
1343
+
1344
+ # Add attributes if requested
1345
+ if annotate and element:
1346
+ highlight_item["attributes_to_draw"] = {}
1347
+ for attr_name in annotate:
1348
+ try:
1349
+ attr_value = getattr(element, attr_name, None)
1350
+ if attr_value is not None:
1351
+ highlight_item["attributes_to_draw"][attr_name] = attr_value
1352
+ except:
1353
+ pass
1042
1354
 
1043
- if not highlight_data_list:
1044
- logger.warning("No highlight data generated for show(). Rendering clean page.")
1045
- # Render the page without any temporary highlights
1046
- highlight_data_list = []
1355
+ highlight_data_list.append(highlight_item)
1047
1356
 
1048
- # 2. Call render_preview on the HighlightingService
1049
- try:
1050
- # Calculate crop bounding box in PDF coordinates if crop is requested
1357
+ # Calculate crop bbox if requested
1051
1358
  crop_bbox = None
1052
1359
  if crop:
1053
1360
  try:
1054
- crop_bbox = (
1055
- min(el.x0 for el in self._elements),
1056
- min(el.top for el in self._elements),
1057
- max(el.x1 for el in self._elements),
1058
- max(el.bottom for el in self._elements),
1059
- )
1361
+ # Get bboxes from all specs on this page
1362
+ bboxes = [spec["bbox"] for _, spec in element_specs if spec.get("bbox")]
1363
+ if bboxes:
1364
+ crop_bbox = (
1365
+ min(bbox[0] for bbox in bboxes),
1366
+ min(bbox[1] for bbox in bboxes),
1367
+ max(bbox[2] for bbox in bboxes),
1368
+ max(bbox[3] for bbox in bboxes),
1369
+ )
1060
1370
  except Exception as bbox_err:
1061
- logger.error(
1062
- f"Error determining crop bbox for collection show: {bbox_err}",
1063
- exc_info=True,
1064
- )
1371
+ logger.error(f"Error determining crop bbox: {bbox_err}")
1065
1372
 
1066
- img = service.render_preview(
1067
- page_index=page.index,
1068
- temporary_highlights=highlight_data_list,
1069
- resolution=resolution,
1070
- width=width, # Pass the width parameter
1071
- labels=labels, # Use 'labels'
1072
- legend_position=legend_position,
1073
- render_ocr=render_ocr,
1074
- crop_bbox=crop_bbox,
1075
- )
1076
- return img
1077
- except Exception as e:
1078
- logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
1373
+ # Render this page
1374
+ try:
1375
+ img = service.render_preview(
1376
+ page_index=page.index,
1377
+ temporary_highlights=highlight_data_list,
1378
+ resolution=resolution,
1379
+ width=width,
1380
+ labels=labels,
1381
+ legend_position=legend_position,
1382
+ render_ocr=render_ocr,
1383
+ crop_bbox=crop_bbox,
1384
+ )
1385
+
1386
+ if img:
1387
+ page_images.append(img)
1388
+ except Exception as e:
1389
+ logger.error(
1390
+ f"Error rendering page {getattr(page, 'number', '?')}: {e}", exc_info=True
1391
+ )
1392
+
1393
+ if not page_images:
1394
+ logger.warning("Failed to render any pages")
1079
1395
  return None
1080
1396
 
1397
+ if len(page_images) == 1:
1398
+ return page_images[0]
1399
+
1400
+ # Stack the images
1401
+ if stack_direction == "vertical":
1402
+ final_width = max(img.width for img in page_images)
1403
+ final_height = (
1404
+ sum(img.height for img in page_images) + (len(page_images) - 1) * stack_gap
1405
+ )
1406
+
1407
+ stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
1408
+
1409
+ current_y = 0
1410
+ for img in page_images:
1411
+ # Center horizontally
1412
+ x_offset = (final_width - img.width) // 2
1413
+ stacked_image.paste(img, (x_offset, current_y))
1414
+ current_y += img.height + stack_gap
1415
+ else: # horizontal
1416
+ final_width = sum(img.width for img in page_images) + (len(page_images) - 1) * stack_gap
1417
+ final_height = max(img.height for img in page_images)
1418
+
1419
+ stacked_image = Image.new("RGB", (final_width, final_height), stack_background_color)
1420
+
1421
+ current_x = 0
1422
+ for img in page_images:
1423
+ # Center vertically
1424
+ y_offset = (final_height - img.height) // 2
1425
+ stacked_image.paste(img, (current_x, y_offset))
1426
+ current_x += img.width + stack_gap
1427
+
1428
+ return stacked_image
1429
+
1081
1430
  def save(
1082
1431
  self,
1083
1432
  filename: str,
@@ -1113,8 +1462,8 @@ class ElementCollection(
1113
1462
  else:
1114
1463
  resolution = 144 # Default resolution when none specified
1115
1464
 
1116
- # Use to_image to generate and save the image
1117
- self.to_image(
1465
+ # Use export() to save the image
1466
+ self.export(
1118
1467
  path=filename,
1119
1468
  resolution=resolution,
1120
1469
  width=width,
@@ -1124,42 +1473,6 @@ class ElementCollection(
1124
1473
  )
1125
1474
  return self
1126
1475
 
1127
- def to_image(
1128
- self,
1129
- path: Optional[str] = None,
1130
- resolution: Optional[float] = None,
1131
- width: Optional[int] = None,
1132
- labels: bool = True,
1133
- legend_position: str = "right",
1134
- render_ocr: bool = False,
1135
- ) -> Optional["Image.Image"]:
1136
- """
1137
- Generate an image of the page with this collection's elements highlighted,
1138
- optionally saving it to a file.
1139
-
1140
- Args:
1141
- path: Optional path to save the image to
1142
- resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
1143
- width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
1144
- labels: Whether to include a legend for labels
1145
- legend_position: Position of the legend
1146
- render_ocr: Whether to render OCR text with white background boxes
1147
-
1148
- Returns:
1149
- PIL Image of the page with elements highlighted, or None if no valid page
1150
- """
1151
- # Get the page from the first element (if available)
1152
- if self._elements and hasattr(self._elements[0], "page"):
1153
- page = self._elements[0].page
1154
- # Generate the image using to_image
1155
- return page.to_image(
1156
- path=path,
1157
- resolution=resolution,
1158
- width=width,
1159
- labels=labels,
1160
- legend_position=legend_position,
1161
- render_ocr=render_ocr,
1162
- )
1163
1476
  return None
1164
1477
 
1165
1478
  def _group_elements_by_attr(self, group_by: str) -> Dict[Any, List[T]]:
@@ -1219,17 +1532,57 @@ class ElementCollection(
1219
1532
  return str(group_key)
1220
1533
 
1221
1534
  def _get_element_highlight_params(
1222
- self, element: T, include_attrs: Optional[List[str]]
1535
+ self, element: T, annotate: Optional[List[str]]
1223
1536
  ) -> Optional[Dict]:
1224
1537
  """Extracts common parameters needed for highlighting a single element."""
1538
+ # For FlowRegions and other complex elements, use highlighting protocol
1539
+ if hasattr(element, "get_highlight_specs"):
1540
+ specs = element.get_highlight_specs()
1541
+ if not specs:
1542
+ logger.warning(f"Element {element} returned no highlight specs")
1543
+ return None
1544
+
1545
+ # For now, we'll use the first spec for the prepared data
1546
+ # The actual rendering will use all specs
1547
+ first_spec = specs[0]
1548
+ page = first_spec["page"]
1549
+
1550
+ base_data = {
1551
+ "page_index": first_spec["page_index"],
1552
+ "element": element,
1553
+ "annotate": annotate,
1554
+ "attributes_to_draw": {},
1555
+ "bbox": first_spec.get("bbox"),
1556
+ "polygon": first_spec.get("polygon"),
1557
+ "multi_spec": len(specs) > 1, # Flag to indicate multiple specs
1558
+ "all_specs": specs, # Store all specs for rendering
1559
+ }
1560
+
1561
+ # Extract attributes if requested
1562
+ if annotate:
1563
+ for attr_name in annotate:
1564
+ try:
1565
+ attr_value = getattr(element, attr_name, None)
1566
+ if attr_value is not None:
1567
+ base_data["attributes_to_draw"][attr_name] = attr_value
1568
+ except AttributeError:
1569
+ logger.warning(
1570
+ f"Attribute '{attr_name}' not found on element {element} for annotate"
1571
+ )
1572
+
1573
+ return base_data
1574
+
1575
+ # Fallback for regular elements with direct page access
1225
1576
  if not hasattr(element, "page"):
1577
+ logger.warning(f"Element {element} has no page attribute and no highlighting protocol")
1226
1578
  return None
1579
+
1227
1580
  page = element.page
1228
1581
 
1229
1582
  base_data = {
1230
1583
  "page_index": page.index,
1231
1584
  "element": element,
1232
- "include_attrs": include_attrs,
1585
+ "annotate": annotate,
1233
1586
  "attributes_to_draw": {},
1234
1587
  "bbox": None,
1235
1588
  "polygon": None,
@@ -1254,15 +1607,15 @@ class ElementCollection(
1254
1607
  return None
1255
1608
 
1256
1609
  # Extract attributes if requested
1257
- if include_attrs:
1258
- for attr_name in include_attrs:
1610
+ if annotate:
1611
+ for attr_name in annotate:
1259
1612
  try:
1260
1613
  attr_value = getattr(element, attr_name, None)
1261
1614
  if attr_value is not None:
1262
1615
  base_data["attributes_to_draw"][attr_name] = attr_value
1263
1616
  except AttributeError:
1264
1617
  logger.warning(
1265
- f"Attribute '{attr_name}' not found on element {element} for include_attrs"
1618
+ f"Attribute '{attr_name}' not found on element {element} for annotate"
1266
1619
  )
1267
1620
 
1268
1621
  return base_data
@@ -1699,9 +2052,7 @@ class ElementCollection(
1699
2052
  image_path = image_dir / image_filename
1700
2053
 
1701
2054
  # Save image
1702
- element.to_image(
1703
- path=str(image_path), resolution=image_resolution, include_highlights=True
1704
- )
2055
+ element.show(path=str(image_path), resolution=image_resolution)
1705
2056
 
1706
2057
  # Add relative path to data
1707
2058
  element_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
@@ -1989,8 +2340,8 @@ class ElementCollection(
1989
2340
  # ------------------------------------------------------------------
1990
2341
  def apply_ocr(
1991
2342
  self,
1992
- *,
1993
2343
  function: Optional[Callable[["Region"], Optional[str]]] = None,
2344
+ *,
1994
2345
  show_progress: bool = True,
1995
2346
  **kwargs,
1996
2347
  ) -> "ElementCollection":
@@ -2046,1275 +2397,3 @@ class ElementCollection(
2046
2397
  return self
2047
2398
 
2048
2399
  # ------------------------------------------------------------------
2049
-
2050
-
2051
- class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin):
2052
- """
2053
- Represents a collection of Page objects, often from a single PDF document.
2054
- Provides methods for batch operations on these pages.
2055
- """
2056
-
2057
- def __init__(self, pages: Union[List[P], Sequence[P]]):
2058
- """
2059
- Initialize a page collection.
2060
-
2061
- Args:
2062
- pages: List or sequence of Page objects (can be lazy)
2063
- """
2064
- # Store the sequence as-is to preserve lazy behavior
2065
- # Only convert to list if we need list-specific operations
2066
- if hasattr(pages, '__iter__') and hasattr(pages, '__len__'):
2067
- self.pages = pages
2068
- else:
2069
- # Fallback for non-sequence types
2070
- self.pages = list(pages)
2071
-
2072
- def __len__(self) -> int:
2073
- """Return the number of pages in the collection."""
2074
- return len(self.pages)
2075
-
2076
- def __getitem__(self, idx) -> Union[P, "PageCollection[P]"]:
2077
- """Support indexing and slicing."""
2078
- if isinstance(idx, slice):
2079
- return PageCollection(self.pages[idx])
2080
- return self.pages[idx]
2081
-
2082
- def __iter__(self) -> Iterator[P]:
2083
- """Support iteration."""
2084
- return iter(self.pages)
2085
-
2086
- def __repr__(self) -> str:
2087
- """Return a string representation showing the page count."""
2088
- return f"<PageCollection(count={len(self)})>"
2089
-
2090
- def _get_items_for_apply(self) -> Iterator[P]:
2091
- """
2092
- Override ApplyMixin's _get_items_for_apply to preserve lazy behavior.
2093
-
2094
- Returns an iterator that yields pages on-demand rather than materializing
2095
- all pages at once, maintaining the lazy loading behavior.
2096
- """
2097
- return iter(self.pages)
2098
-
2099
- def _get_page_indices(self) -> List[int]:
2100
- """
2101
- Get page indices without forcing materialization of pages.
2102
-
2103
- Returns:
2104
- List of page indices for the pages in this collection.
2105
- """
2106
- # Handle different types of page sequences efficiently
2107
- if hasattr(self.pages, '_indices'):
2108
- # If it's a _LazyPageList (or slice), get indices directly
2109
- return list(self.pages._indices)
2110
- else:
2111
- # Fallback: if pages are already materialized, get indices normally
2112
- # This will force materialization but only if pages aren't lazy
2113
- return [p.index for p in self.pages]
2114
-
2115
- def extract_text(
2116
- self,
2117
- keep_blank_chars: bool = True,
2118
- apply_exclusions: bool = True,
2119
- strip: Optional[bool] = None,
2120
- **kwargs,
2121
- ) -> str:
2122
- """
2123
- Extract text from all pages in the collection.
2124
-
2125
- Args:
2126
- keep_blank_chars: Whether to keep blank characters (default: True)
2127
- apply_exclusions: Whether to apply exclusion regions (default: True)
2128
- strip: Whether to strip whitespace from the extracted text.
2129
- **kwargs: Additional extraction parameters
2130
-
2131
- Returns:
2132
- Combined text from all pages
2133
- """
2134
- texts = []
2135
- for page in self.pages:
2136
- text = page.extract_text(
2137
- keep_blank_chars=keep_blank_chars,
2138
- apply_exclusions=apply_exclusions,
2139
- **kwargs,
2140
- )
2141
- texts.append(text)
2142
-
2143
- combined = "\n".join(texts)
2144
-
2145
- # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
2146
- use_layout = kwargs.get("layout", False)
2147
- strip_final = strip if strip is not None else (not use_layout)
2148
-
2149
- if strip_final:
2150
- combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
2151
-
2152
- return combined
2153
-
2154
- def apply_ocr(
2155
- self,
2156
- engine: Optional[str] = None,
2157
- # --- Common OCR Parameters (Direct Arguments) ---
2158
- languages: Optional[List[str]] = None,
2159
- min_confidence: Optional[float] = None, # Min confidence threshold
2160
- device: Optional[str] = None,
2161
- resolution: Optional[int] = None, # DPI for rendering
2162
- apply_exclusions: bool = True, # New parameter
2163
- replace: bool = True, # Whether to replace existing OCR elements
2164
- # --- Engine-Specific Options ---
2165
- options: Optional[Any] = None, # e.g., EasyOCROptions(...)
2166
- ) -> "PageCollection[P]":
2167
- """
2168
- Applies OCR to all pages within this collection using batch processing.
2169
-
2170
- This delegates the work to the parent PDF object's `apply_ocr` method.
2171
-
2172
- Args:
2173
- engine: Name of the OCR engine (e.g., 'easyocr', 'paddleocr').
2174
- languages: List of language codes (e.g., ['en', 'fr'], ['en', 'ch']).
2175
- **Must be codes understood by the specific selected engine.**
2176
- No mapping is performed.
2177
- min_confidence: Minimum confidence threshold for detected text (0.0 to 1.0).
2178
- device: Device to run OCR on (e.g., 'cpu', 'cuda', 'mps').
2179
- resolution: DPI resolution to render page images before OCR (e.g., 150, 300).
2180
- apply_exclusions: If True (default), render page images for OCR with
2181
- excluded areas masked (whited out). If False, OCR
2182
- the raw page images without masking exclusions.
2183
- replace: If True (default), remove any existing OCR elements before
2184
- adding new ones. If False, add new OCR elements to existing ones.
2185
- options: An engine-specific options object (e.g., EasyOCROptions) or dict.
2186
-
2187
- Returns:
2188
- Self for method chaining.
2189
-
2190
- Raises:
2191
- RuntimeError: If pages lack a parent PDF or parent lacks `apply_ocr`.
2192
- (Propagates exceptions from PDF.apply_ocr)
2193
- """
2194
- if not self.pages:
2195
- logger.warning("Cannot apply OCR to an empty PageCollection.")
2196
- return self
2197
-
2198
- # Assume all pages share the same parent PDF object
2199
- first_page = self.pages[0]
2200
- if not hasattr(first_page, "_parent") or not first_page._parent:
2201
- raise RuntimeError("Pages in this collection do not have a parent PDF reference.")
2202
-
2203
- parent_pdf = first_page._parent
2204
-
2205
- if not hasattr(parent_pdf, "apply_ocr") or not callable(parent_pdf.apply_ocr):
2206
- raise RuntimeError("Parent PDF object does not have the required 'apply_ocr' method.")
2207
-
2208
- # Get the 0-based indices of the pages in this collection
2209
- page_indices = self._get_page_indices()
2210
-
2211
- logger.info(f"Applying OCR via parent PDF to page indices: {page_indices} in collection.")
2212
-
2213
- # Delegate the batch call to the parent PDF object, passing direct args and apply_exclusions
2214
- parent_pdf.apply_ocr(
2215
- pages=page_indices,
2216
- engine=engine,
2217
- languages=languages,
2218
- min_confidence=min_confidence, # Pass the renamed parameter
2219
- device=device,
2220
- resolution=resolution,
2221
- apply_exclusions=apply_exclusions, # Pass down
2222
- replace=replace, # Pass the replace parameter
2223
- options=options,
2224
- )
2225
- # The PDF method modifies the Page objects directly by adding elements.
2226
-
2227
- return self # Return self for chaining
2228
-
2229
- @overload
2230
- def find(
2231
- self,
2232
- *,
2233
- text: str,
2234
- contains: str = "all",
2235
- apply_exclusions: bool = True,
2236
- regex: bool = False,
2237
- case: bool = True,
2238
- **kwargs,
2239
- ) -> Optional[T]: ...
2240
-
2241
- @overload
2242
- def find(
2243
- self,
2244
- selector: str,
2245
- *,
2246
- contains: str = "all",
2247
- apply_exclusions: bool = True,
2248
- regex: bool = False,
2249
- case: bool = True,
2250
- **kwargs,
2251
- ) -> Optional[T]: ...
2252
-
2253
- def find(
2254
- self,
2255
- selector: Optional[str] = None,
2256
- *,
2257
- text: Optional[str] = None,
2258
- contains: str = "all",
2259
- apply_exclusions: bool = True,
2260
- regex: bool = False,
2261
- case: bool = True,
2262
- **kwargs,
2263
- ) -> Optional[T]:
2264
- """
2265
- Find the first element matching the selector OR text across all pages in the collection.
2266
-
2267
- Provide EITHER `selector` OR `text`, but not both.
2268
-
2269
- Args:
2270
- selector: CSS-like selector string.
2271
- text: Text content to search for (equivalent to 'text:contains(...)').
2272
- contains: How to determine if elements are inside: 'all' (fully inside),
2273
- 'any' (any overlap), or 'center' (center point inside).
2274
- (default: "all")
2275
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2276
- regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2277
- case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
2278
- **kwargs: Additional filter parameters.
2279
-
2280
- Returns:
2281
- First matching element or None.
2282
- """
2283
- # Input validation happens within page.find
2284
- for page in self.pages:
2285
- element = page.find(
2286
- selector=selector,
2287
- text=text,
2288
- contains=contains,
2289
- apply_exclusions=apply_exclusions,
2290
- regex=regex,
2291
- case=case,
2292
- **kwargs,
2293
- )
2294
- if element:
2295
- return element
2296
- return None
2297
-
2298
- @overload
2299
- def find_all(
2300
- self,
2301
- *,
2302
- text: str,
2303
- contains: str = "all",
2304
- apply_exclusions: bool = True,
2305
- regex: bool = False,
2306
- case: bool = True,
2307
- **kwargs,
2308
- ) -> "ElementCollection": ...
2309
-
2310
- @overload
2311
- def find_all(
2312
- self,
2313
- selector: str,
2314
- *,
2315
- contains: str = "all",
2316
- apply_exclusions: bool = True,
2317
- regex: bool = False,
2318
- case: bool = True,
2319
- **kwargs,
2320
- ) -> "ElementCollection": ...
2321
-
2322
- def find_all(
2323
- self,
2324
- selector: Optional[str] = None,
2325
- *,
2326
- text: Optional[str] = None,
2327
- contains: str = "all",
2328
- apply_exclusions: bool = True,
2329
- regex: bool = False,
2330
- case: bool = True,
2331
- **kwargs,
2332
- ) -> "ElementCollection":
2333
- """
2334
- Find all elements matching the selector OR text across all pages in the collection.
2335
-
2336
- Provide EITHER `selector` OR `text`, but not both.
2337
-
2338
- Args:
2339
- selector: CSS-like selector string.
2340
- text: Text content to search for (equivalent to 'text:contains(...)').
2341
- contains: How to determine if elements are inside: 'all' (fully inside),
2342
- 'any' (any overlap), or 'center' (center point inside).
2343
- (default: "all")
2344
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
2345
- regex: Whether to use regex for text search (`selector` or `text`) (default: False).
2346
- case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
2347
- **kwargs: Additional filter parameters.
2348
-
2349
- Returns:
2350
- ElementCollection with matching elements from all pages.
2351
- """
2352
- all_elements = []
2353
- # Input validation happens within page.find_all
2354
- for page in self.pages:
2355
- elements = page.find_all(
2356
- selector=selector,
2357
- text=text,
2358
- contains=contains,
2359
- apply_exclusions=apply_exclusions,
2360
- regex=regex,
2361
- case=case,
2362
- **kwargs,
2363
- )
2364
- if elements:
2365
- all_elements.extend(elements.elements)
2366
-
2367
- return ElementCollection(all_elements)
2368
-
2369
- def update_text(
2370
- self,
2371
- transform: Callable[[Any], Optional[str]],
2372
- selector: str = "text",
2373
- max_workers: Optional[int] = None,
2374
- ) -> "PageCollection[P]":
2375
- """
2376
- Applies corrections to text elements across all pages
2377
- in this collection using a user-provided callback function, executed
2378
- in parallel if `max_workers` is specified.
2379
-
2380
- This method delegates to the parent PDF's `update_text` method,
2381
- targeting all pages within this collection.
2382
-
2383
- Args:
2384
- transform: A function that accepts a single argument (an element
2385
- object) and returns `Optional[str]` (new text or None).
2386
- selector: The attribute name to update. Default is 'text'.
2387
- max_workers: The maximum number of worker threads to use for parallel
2388
- correction on each page. If None, defaults are used.
2389
-
2390
- Returns:
2391
- Self for method chaining.
2392
-
2393
- Raises:
2394
- RuntimeError: If the collection is empty, pages lack a parent PDF reference,
2395
- or the parent PDF lacks the `update_text` method.
2396
- """
2397
- if not self.pages:
2398
- logger.warning("Cannot update text for an empty PageCollection.")
2399
- # Return self even if empty to maintain chaining consistency
2400
- return self
2401
-
2402
- # Assume all pages share the same parent PDF object
2403
- parent_pdf = self.pages[0]._parent
2404
- if (
2405
- not parent_pdf
2406
- or not hasattr(parent_pdf, "update_text")
2407
- or not callable(parent_pdf.update_text)
2408
- ):
2409
- raise RuntimeError(
2410
- "Parent PDF reference not found or parent PDF lacks the required 'update_text' method."
2411
- )
2412
-
2413
- page_indices = self._get_page_indices()
2414
- logger.info(
2415
- f"PageCollection: Delegating text update to parent PDF for page indices: {page_indices} with max_workers={max_workers} and selector='{selector}'."
2416
- )
2417
-
2418
- # Delegate the call to the parent PDF object for the relevant pages
2419
- # Pass the max_workers parameter down
2420
- parent_pdf.update_text(
2421
- transform=transform,
2422
- pages=page_indices,
2423
- selector=selector,
2424
- max_workers=max_workers,
2425
- )
2426
-
2427
- return self
2428
-
2429
- def get_sections(
2430
- self,
2431
- start_elements=None,
2432
- end_elements=None,
2433
- new_section_on_page_break=False,
2434
- boundary_inclusion="both",
2435
- ) -> "ElementCollection[Region]":
2436
- """
2437
- Extract sections from a page collection based on start/end elements.
2438
-
2439
- Args:
2440
- start_elements: Elements or selector string that mark the start of sections (optional)
2441
- end_elements: Elements or selector string that mark the end of sections (optional)
2442
- new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
2443
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
2444
-
2445
- Returns:
2446
- List of Region objects representing the extracted sections
2447
-
2448
- Note:
2449
- You can provide only start_elements, only end_elements, or both.
2450
- - With only start_elements: sections go from each start to the next start (or end of page)
2451
- - With only end_elements: sections go from beginning of document/page to each end
2452
- - With both: sections go from each start to the corresponding end
2453
- """
2454
- # Find start and end elements across all pages
2455
- if isinstance(start_elements, str):
2456
- start_elements = self.find_all(start_elements).elements
2457
-
2458
- if isinstance(end_elements, str):
2459
- end_elements = self.find_all(end_elements).elements
2460
-
2461
- # If no start elements and no end elements, return empty list
2462
- if not start_elements and not end_elements:
2463
- return []
2464
-
2465
- # If there are page break boundaries, we'll need to add them
2466
- if new_section_on_page_break:
2467
- # For each page boundary, create virtual "end" and "start" elements
2468
- for i in range(len(self.pages) - 1):
2469
- # Add a virtual "end" element at the bottom of the current page
2470
- page = self.pages[i]
2471
- # If end_elements is None, initialize it as an empty list
2472
- if end_elements is None:
2473
- end_elements = []
2474
-
2475
- # Create a region at the bottom of the page as an artificial end marker
2476
- from natural_pdf.elements.region import Region
2477
-
2478
- bottom_region = Region(page, (0, page.height - 1, page.width, page.height))
2479
- bottom_region.is_page_boundary = True # Mark it as a special boundary
2480
- end_elements.append(bottom_region)
2481
-
2482
- # Add a virtual "start" element at the top of the next page
2483
- next_page = self.pages[i + 1]
2484
- top_region = Region(next_page, (0, 0, next_page.width, 1))
2485
- top_region.is_page_boundary = True # Mark it as a special boundary
2486
- start_elements.append(top_region)
2487
-
2488
- # Get all elements from all pages and sort them in document order
2489
- all_elements = []
2490
- for page in self.pages:
2491
- elements = page.get_elements()
2492
- all_elements.extend(elements)
2493
-
2494
- # Sort by page index, then vertical position, then horizontal position
2495
- all_elements.sort(key=lambda e: (e.page.index, e.top, e.x0))
2496
-
2497
- # If we only have end_elements (no start_elements), create implicit start elements
2498
- if not start_elements and end_elements:
2499
- from natural_pdf.elements.region import Region
2500
-
2501
- start_elements = []
2502
-
2503
- # Add implicit start at the beginning of the first page
2504
- first_page = self.pages[0]
2505
- first_start = Region(first_page, (0, 0, first_page.width, 1))
2506
- first_start.is_implicit_start = True
2507
- start_elements.append(first_start)
2508
-
2509
- # For each end element (except the last), add an implicit start after it
2510
- sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
2511
- for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
2512
- # Create implicit start element right after this end element
2513
- implicit_start = Region(end_elem.page, (0, end_elem.bottom, end_elem.page.width, end_elem.bottom + 1))
2514
- implicit_start.is_implicit_start = True
2515
- start_elements.append(implicit_start)
2516
-
2517
- # Mark section boundaries
2518
- section_boundaries = []
2519
-
2520
- # Add start element boundaries
2521
- for element in start_elements:
2522
- if element in all_elements:
2523
- idx = all_elements.index(element)
2524
- section_boundaries.append(
2525
- {
2526
- "index": idx,
2527
- "element": element,
2528
- "type": "start",
2529
- "page_idx": element.page.index,
2530
- }
2531
- )
2532
- elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
2533
- # This is a virtual page boundary element
2534
- section_boundaries.append(
2535
- {
2536
- "index": -1, # Special index for page boundaries
2537
- "element": element,
2538
- "type": "start",
2539
- "page_idx": element.page.index,
2540
- }
2541
- )
2542
- elif hasattr(element, "is_implicit_start") and element.is_implicit_start:
2543
- # This is an implicit start element
2544
- section_boundaries.append(
2545
- {
2546
- "index": -2, # Special index for implicit starts
2547
- "element": element,
2548
- "type": "start",
2549
- "page_idx": element.page.index,
2550
- }
2551
- )
2552
-
2553
- # Add end element boundaries if provided
2554
- if end_elements:
2555
- for element in end_elements:
2556
- if element in all_elements:
2557
- idx = all_elements.index(element)
2558
- section_boundaries.append(
2559
- {
2560
- "index": idx,
2561
- "element": element,
2562
- "type": "end",
2563
- "page_idx": element.page.index,
2564
- }
2565
- )
2566
- elif hasattr(element, "is_page_boundary") and element.is_page_boundary:
2567
- # This is a virtual page boundary element
2568
- section_boundaries.append(
2569
- {
2570
- "index": -1, # Special index for page boundaries
2571
- "element": element,
2572
- "type": "end",
2573
- "page_idx": element.page.index,
2574
- }
2575
- )
2576
-
2577
- # Sort boundaries by page index, then by actual document position
2578
- def _sort_key(boundary):
2579
- """Sort boundaries by (page_idx, vertical_top, priority)."""
2580
- page_idx = boundary["page_idx"]
2581
- element = boundary["element"]
2582
-
2583
- # Vertical position on the page
2584
- y_pos = getattr(element, "top", 0.0)
2585
-
2586
- # Ensure starts come before ends at the same coordinate
2587
- priority = 0 if boundary["type"] == "start" else 1
2588
-
2589
- return (page_idx, y_pos, priority)
2590
-
2591
- section_boundaries.sort(key=_sort_key)
2592
-
2593
- # Generate sections
2594
- sections = []
2595
-
2596
- # --- Helper: build a FlowRegion spanning multiple pages ---
2597
- def _build_flow_region(start_el, end_el):
2598
- """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
2599
- If *end_el* is None, the region continues to the bottom of the last
2600
- page in this PageCollection."""
2601
- # Local imports to avoid top-level cycles
2602
- from natural_pdf.elements.region import Region
2603
- from natural_pdf.flows.element import FlowElement
2604
- from natural_pdf.flows.flow import Flow
2605
- from natural_pdf.flows.region import FlowRegion
2606
-
2607
- start_pg = start_el.page
2608
- end_pg = end_el.page if end_el is not None else self.pages[-1]
2609
-
2610
- parts: list[Region] = []
2611
-
2612
- # Use the actual top of the start element (for implicit starts this is
2613
- # the bottom of the previous end element) instead of forcing to 0.
2614
- start_top = start_el.top
2615
-
2616
- # Slice of first page beginning at *start_top*
2617
- parts.append(Region(start_pg, (0, start_top, start_pg.width, start_pg.height)))
2618
-
2619
- # Full middle pages
2620
- for pg_idx in range(start_pg.index + 1, end_pg.index):
2621
- mid_pg = self.pages[pg_idx]
2622
- parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
2623
-
2624
- # Slice of last page (if distinct)
2625
- if end_pg is not start_pg:
2626
- bottom = end_el.bottom if end_el is not None else end_pg.height
2627
- parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
2628
-
2629
- flow = Flow(segments=parts, arrangement="vertical")
2630
- src_fe = FlowElement(physical_object=start_el, flow=flow)
2631
- return FlowRegion(
2632
- flow=flow,
2633
- constituent_regions=parts,
2634
- source_flow_element=src_fe,
2635
- boundary_element_found=end_el,
2636
- )
2637
-
2638
- # ------------------------------------------------------------------
2639
-
2640
- current_start = None
2641
-
2642
- for i, boundary in enumerate(section_boundaries):
2643
- # If it's a start boundary and we don't have a current start
2644
- if boundary["type"] == "start" and current_start is None:
2645
- current_start = boundary
2646
-
2647
- # If it's an end boundary and we have a current start
2648
- elif boundary["type"] == "end" and current_start is not None:
2649
- # Create a section from current_start to this boundary
2650
- start_element = current_start["element"]
2651
- end_element = boundary["element"]
2652
-
2653
- # If both elements are on the same page, use the page's get_section_between
2654
- if start_element.page == end_element.page:
2655
- # For implicit start elements, create a region from the top of the page
2656
- if hasattr(start_element, "is_implicit_start"):
2657
- from natural_pdf.elements.region import Region
2658
- section = Region(
2659
- start_element.page,
2660
- (0, start_element.top, start_element.page.width, end_element.bottom)
2661
- )
2662
- section.start_element = start_element
2663
- section.boundary_element_found = end_element
2664
- else:
2665
- section = start_element.page.get_section_between(
2666
- start_element, end_element, boundary_inclusion
2667
- )
2668
- sections.append(section)
2669
- else:
2670
- # Create FlowRegion spanning pages
2671
- flow_region = _build_flow_region(start_element, end_element)
2672
- sections.append(flow_region)
2673
-
2674
- current_start = None
2675
-
2676
- # If it's another start boundary and we have a current start (for splitting by starts only)
2677
- elif boundary["type"] == "start" and current_start is not None and not end_elements:
2678
- # Create a section from current_start to just before this boundary
2679
- start_element = current_start["element"]
2680
-
2681
- # Find the last element before this boundary on the same page
2682
- if start_element.page == boundary["element"].page:
2683
- # Find elements on this page
2684
- page_elements = [e for e in all_elements if e.page == start_element.page]
2685
- # Sort by position
2686
- page_elements.sort(key=lambda e: (e.top, e.x0))
2687
-
2688
- # Find the last element before the boundary
2689
- end_idx = (
2690
- page_elements.index(boundary["element"]) - 1
2691
- if boundary["element"] in page_elements
2692
- else -1
2693
- )
2694
- end_element = page_elements[end_idx] if end_idx >= 0 else None
2695
-
2696
- # Create the section
2697
- section = start_element.page.get_section_between(
2698
- start_element, end_element, boundary_inclusion
2699
- )
2700
- sections.append(section)
2701
- else:
2702
- # Cross-page section - create from current_start to the end of its page
2703
- from natural_pdf.elements.region import Region
2704
-
2705
- start_page = start_element.page
2706
-
2707
- # Handle implicit start elements
2708
- start_top = start_element.top
2709
- region = Region(
2710
- start_page, (0, start_top, start_page.width, start_page.height)
2711
- )
2712
- region.start_element = start_element
2713
- sections.append(region)
2714
-
2715
- current_start = boundary
2716
-
2717
- # Handle the last section if we have a current start
2718
- if current_start is not None:
2719
- start_element = current_start["element"]
2720
- start_page = start_element.page
2721
-
2722
- if end_elements:
2723
- # With end_elements, we need an explicit end - use the last element
2724
- # on the last page of the collection
2725
- last_page = self.pages[-1]
2726
- last_page_elements = [e for e in all_elements if e.page == last_page]
2727
- last_page_elements.sort(key=lambda e: (e.top, e.x0))
2728
- end_element = last_page_elements[-1] if last_page_elements else None
2729
-
2730
- # Create FlowRegion spanning multiple pages using helper
2731
- flow_region = _build_flow_region(start_element, end_element)
2732
- sections.append(flow_region)
2733
- else:
2734
- # With start_elements only, create a section to the end of the current page
2735
- from natural_pdf.elements.region import Region
2736
-
2737
- # Handle implicit start elements
2738
- start_top = start_element.top
2739
- region = Region(
2740
- start_page, (0, start_top, start_page.width, start_page.height)
2741
- )
2742
- region.start_element = start_element
2743
- sections.append(region)
2744
-
2745
- return ElementCollection(sections)
2746
-
2747
- def _gather_analysis_data(
2748
- self,
2749
- analysis_keys: List[str],
2750
- include_content: bool,
2751
- include_images: bool,
2752
- image_dir: Optional[Path],
2753
- image_format: str,
2754
- image_resolution: int,
2755
- ) -> List[Dict[str, Any]]:
2756
- """
2757
- Gather analysis data from all pages in the collection.
2758
-
2759
- Args:
2760
- analysis_keys: Keys in the analyses dictionary to export
2761
- include_content: Whether to include extracted text
2762
- include_images: Whether to export images
2763
- image_dir: Directory to save images
2764
- image_format: Format to save images
2765
- image_resolution: Resolution for exported images
2766
-
2767
- Returns:
2768
- List of dictionaries containing analysis data
2769
- """
2770
- if not self.elements:
2771
- logger.warning("No pages found in collection")
2772
- return []
2773
-
2774
- all_data = []
2775
-
2776
- for page in self.elements:
2777
- # Basic page information
2778
- page_data = {
2779
- "page_number": page.number,
2780
- "page_index": page.index,
2781
- "width": page.width,
2782
- "height": page.height,
2783
- }
2784
-
2785
- # Add PDF information if available
2786
- if hasattr(page, "pdf") and page.pdf:
2787
- page_data["pdf_path"] = page.pdf.path
2788
- page_data["pdf_filename"] = Path(page.pdf.path).name
2789
-
2790
- # Include extracted text if requested
2791
- if include_content:
2792
- try:
2793
- page_data["content"] = page.extract_text(preserve_whitespace=True)
2794
- except Exception as e:
2795
- logger.error(f"Error extracting text from page {page.number}: {e}")
2796
- page_data["content"] = ""
2797
-
2798
- # Save image if requested
2799
- if include_images:
2800
- try:
2801
- # Create image filename
2802
- pdf_name = "unknown"
2803
- if hasattr(page, "pdf") and page.pdf:
2804
- pdf_name = Path(page.pdf.path).stem
2805
-
2806
- image_filename = f"{pdf_name}_page_{page.number}.{image_format}"
2807
- image_path = image_dir / image_filename
2808
-
2809
- # Save image
2810
- page.save_image(
2811
- str(image_path), resolution=image_resolution, include_highlights=True
2812
- )
2813
-
2814
- # Add relative path to data
2815
- page_data["image_path"] = str(Path(image_path).relative_to(image_dir.parent))
2816
- except Exception as e:
2817
- logger.error(f"Error saving image for page {page.number}: {e}")
2818
- page_data["image_path"] = None
2819
-
2820
- # Add analyses data
2821
- if hasattr(page, "analyses") and page.analyses:
2822
- for key in analysis_keys:
2823
- if key not in page.analyses:
2824
- raise KeyError(f"Analysis key '{key}' not found in page {page.number}")
2825
-
2826
- # Get the analysis result
2827
- analysis_result = page.analyses[key]
2828
-
2829
- # If the result has a to_dict method, use it
2830
- if hasattr(analysis_result, "to_dict"):
2831
- analysis_data = analysis_result.to_dict()
2832
- else:
2833
- # Otherwise, use the result directly if it's dict-like
2834
- try:
2835
- analysis_data = dict(analysis_result)
2836
- except (TypeError, ValueError):
2837
- # Last resort: convert to string
2838
- analysis_data = {"raw_result": str(analysis_result)}
2839
-
2840
- # Add analysis data to page data with the key as prefix
2841
- for k, v in analysis_data.items():
2842
- page_data[f"{key}.{k}"] = v
2843
-
2844
- all_data.append(page_data)
2845
-
2846
- return all_data
2847
-
2848
- # --- Deskew Method --- #
2849
-
2850
- def deskew(
2851
- self,
2852
- resolution: int = 300,
2853
- detection_resolution: int = 72,
2854
- force_overwrite: bool = False,
2855
- **deskew_kwargs,
2856
- ) -> "PDF": # Changed return type
2857
- """
2858
- Creates a new, in-memory PDF object containing deskewed versions of the pages
2859
- in this collection.
2860
-
2861
- This method delegates the actual processing to the parent PDF object's
2862
- `deskew` method.
2863
-
2864
- Important: The returned PDF is image-based. Any existing text, OCR results,
2865
- annotations, or other elements from the original pages will *not* be carried over.
2866
-
2867
- Args:
2868
- resolution: DPI resolution for rendering the output deskewed pages.
2869
- detection_resolution: DPI resolution used for skew detection if angles are not
2870
- already cached on the page objects.
2871
- force_overwrite: If False (default), raises a ValueError if any target page
2872
- already contains processed elements (text, OCR, regions) to
2873
- prevent accidental data loss. Set to True to proceed anyway.
2874
- **deskew_kwargs: Additional keyword arguments passed to `deskew.determine_skew`
2875
- during automatic detection (e.g., `max_angle`, `num_peaks`).
2876
-
2877
- Returns:
2878
- A new PDF object representing the deskewed document.
2879
-
2880
- Raises:
2881
- ImportError: If 'deskew' or 'img2pdf' libraries are not installed (raised by PDF.deskew).
2882
- ValueError: If `force_overwrite` is False and target pages contain elements (raised by PDF.deskew),
2883
- or if the collection is empty.
2884
- RuntimeError: If pages lack a parent PDF reference, or the parent PDF lacks the `deskew` method.
2885
- """
2886
- if not self.pages:
2887
- logger.warning("Cannot deskew an empty PageCollection.")
2888
- raise ValueError("Cannot deskew an empty PageCollection.")
2889
-
2890
- # Assume all pages share the same parent PDF object
2891
- # Need to hint the type of _parent for type checkers
2892
- if TYPE_CHECKING:
2893
- parent_pdf: "natural_pdf.core.pdf.PDF" = self.pages[0]._parent
2894
- else:
2895
- parent_pdf = self.pages[0]._parent
2896
-
2897
- if not parent_pdf or not hasattr(parent_pdf, "deskew") or not callable(parent_pdf.deskew):
2898
- raise RuntimeError(
2899
- "Parent PDF reference not found or parent PDF lacks the required 'deskew' method."
2900
- )
2901
-
2902
- # Get the 0-based indices of the pages in this collection
2903
- page_indices = self._get_page_indices()
2904
- logger.info(
2905
- f"PageCollection: Delegating deskew to parent PDF for page indices: {page_indices}"
2906
- )
2907
-
2908
- # Delegate the call to the parent PDF object for the relevant pages
2909
- # Pass all relevant arguments through (no output_path anymore)
2910
- return parent_pdf.deskew(
2911
- pages=page_indices,
2912
- resolution=resolution,
2913
- detection_resolution=detection_resolution,
2914
- force_overwrite=force_overwrite,
2915
- **deskew_kwargs,
2916
- )
2917
-
2918
- # --- End Deskew Method --- #
2919
-
2920
- def to_image(
2921
- self,
2922
- page_width: Optional[int] = None,
2923
- cols: Optional[int] = 4,
2924
- rows: Optional[int] = None,
2925
- max_pages: Optional[int] = None,
2926
- spacing: int = 10,
2927
- add_labels: bool = True, # Add new flag
2928
- show_category: bool = False,
2929
- ) -> Optional["Image.Image"]:
2930
- """
2931
- Generate a grid of page images for this collection.
2932
-
2933
- Args:
2934
- page_width: Width in pixels for rendering individual pages
2935
- cols: Number of columns in grid (default: 4)
2936
- rows: Number of rows in grid (calculated automatically if None)
2937
- max_pages: Maximum number of pages to include (default: all)
2938
- spacing: Spacing between page thumbnails in pixels
2939
- add_labels: Whether to add page number labels
2940
- show_category: Whether to add category and confidence labels (if available)
2941
-
2942
- Returns:
2943
- PIL Image of the page grid or None if no pages
2944
- """
2945
- # Determine default page width from global options if not explicitly provided
2946
- if page_width is None:
2947
- try:
2948
- import natural_pdf
2949
-
2950
- page_width = natural_pdf.options.image.width or 300
2951
- except Exception:
2952
- # Fallback if natural_pdf import fails in some edge context
2953
- page_width = 300
2954
-
2955
- # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
2956
- try:
2957
- from PIL import Image, ImageDraw, ImageFont
2958
- except ImportError:
2959
- logger.error(
2960
- "Pillow library not found, required for to_image(). Install with 'pip install Pillow'"
2961
- )
2962
- return None
2963
-
2964
- if not self.pages:
2965
- logger.warning("Cannot generate image for empty PageCollection")
2966
- return None
2967
-
2968
- # Limit pages if max_pages is specified
2969
- pages_to_render = self.pages[:max_pages] if max_pages else self.pages
2970
-
2971
- # Load font once outside the loop
2972
- font = None
2973
- if add_labels:
2974
- try:
2975
- # Try loading a commonly available font first
2976
- font = ImageFont.truetype("DejaVuSans.ttf", 16)
2977
- except IOError:
2978
- try:
2979
- font = ImageFont.load_default(16)
2980
- except IOError:
2981
- logger.warning("Default font not found. Labels cannot be added.")
2982
- add_labels = False # Disable if no font
2983
-
2984
- # Render individual page images
2985
- page_images = []
2986
- for page in pages_to_render:
2987
- try:
2988
- # Assume page.to_image returns a PIL Image or None
2989
- img = page.to_image(
2990
- width=page_width, include_highlights=True
2991
- ) # Render with highlights for visual context
2992
- if img is None:
2993
- logger.warning(f"Failed to generate image for page {page.number}. Skipping.")
2994
- continue
2995
- except Exception as img_err:
2996
- logger.error(
2997
- f"Error generating image for page {page.number}: {img_err}", exc_info=True
2998
- )
2999
- continue
3000
-
3001
- # Add page number label
3002
- if add_labels and font:
3003
- draw = ImageDraw.Draw(img)
3004
- pdf_name = (
3005
- Path(page.pdf.path).stem
3006
- if hasattr(page, "pdf") and page.pdf and hasattr(page.pdf, "path")
3007
- else ""
3008
- )
3009
- label_text = f"p{page.number}"
3010
- if pdf_name:
3011
- label_text += f" - {pdf_name}"
3012
-
3013
- # Add category if requested and available
3014
- if show_category:
3015
- # Placeholder logic - adjust based on how classification results are stored
3016
- category = None
3017
- confidence = None
3018
- if (
3019
- hasattr(page, "analyses")
3020
- and page.analyses
3021
- and "classification" in page.analyses
3022
- ):
3023
- result = page.analyses["classification"]
3024
- # Adapt based on actual structure of classification result
3025
- category = (
3026
- getattr(result, "label", None) or result.get("label", None)
3027
- if isinstance(result, dict)
3028
- else None
3029
- )
3030
- confidence = (
3031
- getattr(result, "score", None) or result.get("score", None)
3032
- if isinstance(result, dict)
3033
- else None
3034
- )
3035
-
3036
- if category is not None and confidence is not None:
3037
- try:
3038
- category_str = f"{category} ({confidence:.2f})" # Format confidence
3039
- label_text += f"\\n{category_str}"
3040
- except (TypeError, ValueError):
3041
- pass # Ignore formatting errors
3042
-
3043
- # Calculate bounding box for multi-line text and draw background/text
3044
- try:
3045
- # Using textbbox for potentially better accuracy with specific fonts
3046
- # Note: textbbox needs Pillow 8+
3047
- bbox = draw.textbbox(
3048
- (5, 5), label_text, font=font, spacing=2
3049
- ) # Use textbbox if available
3050
- bg_rect = (
3051
- max(0, bbox[0] - 2),
3052
- max(0, bbox[1] - 2),
3053
- min(img.width, bbox[2] + 2),
3054
- min(img.height, bbox[3] + 2),
3055
- )
3056
-
3057
- # Draw semi-transparent background
3058
- overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
3059
- draw_overlay = ImageDraw.Draw(overlay)
3060
- draw_overlay.rectangle(bg_rect, fill=(255, 255, 255, 180)) # White with alpha
3061
- img = Image.alpha_composite(img.convert("RGBA"), overlay).convert("RGB")
3062
- draw = ImageDraw.Draw(img) # Recreate draw object
3063
-
3064
- # Draw the potentially multi-line text
3065
- draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
3066
- except AttributeError: # Fallback for older Pillow without textbbox
3067
- # Approximate size and draw
3068
- # This might not be perfectly aligned
3069
- draw.rectangle(
3070
- (2, 2, 150, 40), fill=(255, 255, 255, 180)
3071
- ) # Simple fixed background
3072
- draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font, spacing=2)
3073
- except Exception as draw_err:
3074
- logger.error(
3075
- f"Error drawing label on page {page.number}: {draw_err}", exc_info=True
3076
- )
3077
-
3078
- page_images.append(img)
3079
-
3080
- if not page_images:
3081
- logger.warning("No page images were successfully rendered for the grid.")
3082
- return None
3083
-
3084
- # Calculate grid dimensions if not provided
3085
- num_images = len(page_images)
3086
- if not rows and not cols:
3087
- cols = min(4, int(num_images**0.5) + 1)
3088
- rows = (num_images + cols - 1) // cols
3089
- elif rows and not cols:
3090
- cols = (num_images + rows - 1) // rows
3091
- elif cols and not rows:
3092
- rows = (num_images + cols - 1) // cols
3093
- cols = max(1, cols if cols else 1) # Ensure at least 1
3094
- rows = max(1, rows if rows else 1)
3095
-
3096
- # Get maximum dimensions for consistent grid cells
3097
- max_width = max(img.width for img in page_images) if page_images else 1
3098
- max_height = max(img.height for img in page_images) if page_images else 1
3099
-
3100
- # Create grid image
3101
- grid_width = cols * max_width + (cols + 1) * spacing
3102
- grid_height = rows * max_height + (rows + 1) * spacing
3103
- grid_img = Image.new(
3104
- "RGB", (grid_width, grid_height), (220, 220, 220)
3105
- ) # Lighter gray background
3106
-
3107
- # Place images in grid
3108
- for i, img in enumerate(page_images):
3109
- if i >= rows * cols: # Ensure we don't exceed grid capacity
3110
- break
3111
-
3112
- row = i // cols
3113
- col = i % cols
3114
-
3115
- x = col * max_width + (col + 1) * spacing
3116
- y = row * max_height + (row + 1) * spacing
3117
-
3118
- grid_img.paste(img, (x, y))
3119
-
3120
- return grid_img
3121
-
3122
- def save_pdf(
3123
- self,
3124
- output_path: Union[str, Path],
3125
- ocr: bool = False,
3126
- original: bool = False,
3127
- dpi: int = 300,
3128
- ):
3129
- """
3130
- Saves the pages in this collection to a new PDF file.
3131
-
3132
- Choose one saving mode:
3133
- - `ocr=True`: Creates a new, image-based PDF using OCR results. This
3134
- makes the text generated during the natural-pdf session searchable,
3135
- but loses original vector content. Requires 'ocr-export' extras.
3136
- - `original=True`: Extracts the original pages from the source PDF,
3137
- preserving all vector content, fonts, and annotations. OCR results
3138
- from the natural-pdf session are NOT included. Requires 'ocr-export' extras.
3139
-
3140
- Args:
3141
- output_path: Path to save the new PDF file.
3142
- ocr: If True, save as a searchable, image-based PDF using OCR data.
3143
- original: If True, save the original, vector-based pages.
3144
- dpi: Resolution (dots per inch) used only when ocr=True for
3145
- rendering page images and aligning the text layer.
3146
-
3147
- Raises:
3148
- ValueError: If the collection is empty, if neither or both 'ocr'
3149
- and 'original' are True, or if 'original=True' and
3150
- pages originate from different PDFs.
3151
- ImportError: If required libraries ('pikepdf', 'Pillow')
3152
- are not installed for the chosen mode.
3153
- RuntimeError: If an unexpected error occurs during saving.
3154
- """
3155
- if not self.pages:
3156
- raise ValueError("Cannot save an empty PageCollection.")
3157
-
3158
- if not (ocr ^ original): # XOR: exactly one must be true
3159
- raise ValueError("Exactly one of 'ocr' or 'original' must be True.")
3160
-
3161
- output_path_obj = Path(output_path)
3162
- output_path_str = str(output_path_obj)
3163
-
3164
- if ocr:
3165
- if create_searchable_pdf is None:
3166
- raise ImportError(
3167
- "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
3168
- 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
3169
- )
3170
-
3171
- # Check for non-OCR vector elements (provide a warning)
3172
- has_vector_elements = False
3173
- for page in self.pages:
3174
- # Simplified check for common vector types or non-OCR chars/words
3175
- if (
3176
- hasattr(page, "rects")
3177
- and page.rects
3178
- or hasattr(page, "lines")
3179
- and page.lines
3180
- or hasattr(page, "curves")
3181
- and page.curves
3182
- or (
3183
- hasattr(page, "chars")
3184
- and any(getattr(el, "source", None) != "ocr" for el in page.chars)
3185
- )
3186
- or (
3187
- hasattr(page, "words")
3188
- and any(getattr(el, "source", None) != "ocr" for el in page.words)
3189
- )
3190
- ):
3191
- has_vector_elements = True
3192
- break
3193
- if has_vector_elements:
3194
- logger.warning(
3195
- "Warning: Saving with ocr=True creates an image-based PDF. "
3196
- "Original vector elements (rects, lines, non-OCR text/chars) "
3197
- "on selected pages will not be preserved in the output file."
3198
- )
3199
-
3200
- logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
3201
- try:
3202
- # Delegate to the searchable PDF exporter function
3203
- # Pass `self` (the PageCollection instance) as the source
3204
- create_searchable_pdf(self, output_path_str, dpi=dpi)
3205
- # Success log is now inside create_searchable_pdf if needed, or keep here
3206
- # logger.info(f"Successfully saved searchable PDF to: {output_path_str}")
3207
- except Exception as e:
3208
- logger.error(f"Failed to create searchable PDF: {e}", exc_info=True)
3209
- # Re-raise as RuntimeError for consistency, potentially handled in exporter too
3210
- raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
3211
-
3212
- elif original:
3213
- # ---> MODIFIED: Call the new exporter
3214
- if create_original_pdf is None:
3215
- raise ImportError(
3216
- "Saving with original=True requires 'pikepdf'. "
3217
- 'Install with: pip install \\"natural-pdf[ocr-export]\\"' # Escaped quotes
3218
- )
3219
-
3220
- # Check for OCR elements (provide a warning) - keep this check here
3221
- has_ocr_elements = False
3222
- for page in self.pages:
3223
- # Use find_all which returns a collection; check if it's non-empty
3224
- if hasattr(page, "find_all"):
3225
- ocr_text_elements = page.find_all("text[source=ocr]")
3226
- if ocr_text_elements: # Check truthiness of collection
3227
- has_ocr_elements = True
3228
- break
3229
- elif hasattr(page, "words"): # Fallback check if find_all isn't present?
3230
- if any(getattr(el, "source", None) == "ocr" for el in page.words):
3231
- has_ocr_elements = True
3232
- break
3233
-
3234
- if has_ocr_elements:
3235
- logger.warning(
3236
- "Warning: Saving with original=True preserves original page content. "
3237
- "OCR text generated in this session will not be included in the saved file."
3238
- )
3239
-
3240
- logger.info(f"Saving original pages PDF to: {output_path_str}")
3241
- try:
3242
- # Delegate to the original PDF exporter function
3243
- # Pass `self` (the PageCollection instance) as the source
3244
- create_original_pdf(self, output_path_str)
3245
- # Success log is now inside create_original_pdf
3246
- # logger.info(f"Successfully saved original pages PDF to: {output_path_str}")
3247
- except Exception as e:
3248
- # Error logging is handled within create_original_pdf
3249
- # Re-raise the exception caught from the exporter
3250
- raise e # Keep the original exception type (ValueError, RuntimeError, etc.)
3251
- # <--- END MODIFIED
3252
-
3253
- def to_flow(
3254
- self,
3255
- arrangement: Literal["vertical", "horizontal"] = "vertical",
3256
- alignment: Literal["start", "center", "end", "top", "left", "bottom", "right"] = "start",
3257
- segment_gap: float = 0.0,
3258
- ) -> "Flow":
3259
- """
3260
- Convert this PageCollection to a Flow for cross-page operations.
3261
-
3262
- This enables treating multiple pages as a continuous logical document
3263
- structure, useful for multi-page tables, articles spanning columns,
3264
- or any content requiring reading order across page boundaries.
3265
-
3266
- Args:
3267
- arrangement: Primary flow direction ('vertical' or 'horizontal').
3268
- 'vertical' stacks pages top-to-bottom (most common).
3269
- 'horizontal' arranges pages left-to-right.
3270
- alignment: Cross-axis alignment for pages of different sizes:
3271
- For vertical: 'left'/'start', 'center', 'right'/'end'
3272
- For horizontal: 'top'/'start', 'center', 'bottom'/'end'
3273
- segment_gap: Virtual gap between pages in PDF points (default: 0.0).
3274
-
3275
- Returns:
3276
- Flow object that can perform operations across all pages in sequence.
3277
-
3278
- Example:
3279
- Multi-page table extraction:
3280
- ```python
3281
- pdf = npdf.PDF("multi_page_report.pdf")
3282
-
3283
- # Create flow for pages 2-4 containing a table
3284
- table_flow = pdf.pages[1:4].to_flow()
3285
-
3286
- # Extract table as if it were continuous
3287
- table_data = table_flow.extract_table()
3288
- df = table_data.df
3289
- ```
3290
-
3291
- Cross-page element search:
3292
- ```python
3293
- # Find all headers across multiple pages
3294
- headers = pdf.pages[5:10].to_flow().find_all('text[size>12]:bold')
3295
-
3296
- # Analyze layout across pages
3297
- regions = pdf.pages.to_flow().analyze_layout(engine='yolo')
3298
- ```
3299
- """
3300
- from natural_pdf.flows.flow import Flow
3301
- return Flow(
3302
- segments=self, # Flow constructor now handles PageCollection
3303
- arrangement=arrangement,
3304
- alignment=alignment,
3305
- segment_gap=segment_gap,
3306
- )
3307
-
3308
- # Alias .to_image() to .show() for convenience
3309
- def show(
3310
- self,
3311
- *args,
3312
- **kwargs,
3313
- ) -> Optional["Image.Image"]:
3314
- """Display pages similarly to ``to_image``.
3315
-
3316
- This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
3317
- ElementCollection, where ``show()`` already exists. It forwards all
3318
- arguments and returns the resulting ``PIL.Image`` instance.
3319
- """
3320
- return self.to_image(*args, **kwargs)