natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -2,11 +2,12 @@
2
2
  Base Element class for natural-pdf.
3
3
  """
4
4
 
5
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overload
5
+ from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union, overload
6
6
 
7
7
  from PIL import Image
8
8
 
9
9
  from natural_pdf.classification.mixin import ClassificationMixin
10
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
10
11
  from natural_pdf.describe.mixin import DescribeMixin
11
12
 
12
13
  # Import selector parsing functions
@@ -15,7 +16,7 @@ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
15
16
  if TYPE_CHECKING:
16
17
  from natural_pdf.classification.manager import ClassificationManager # noqa: F401
17
18
  from natural_pdf.core.page import Page
18
- from natural_pdf.elements.collections import ElementCollection
19
+ from natural_pdf.elements.element_collection import ElementCollection
19
20
  from natural_pdf.elements.region import Region
20
21
 
21
22
 
@@ -563,7 +564,56 @@ class DirectionalMixin:
563
564
  return matches[0]
564
565
 
565
566
 
566
- class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
567
+ class HighlightableMixin:
568
+ """
569
+ Mixin that provides the highlighting protocol for elements.
570
+
571
+ This protocol enables ElementCollection.show() to work with mixed content
572
+ including FlowRegions and elements from multiple pages by providing a
573
+ standard way to get highlight specifications.
574
+ """
575
+
576
+ def get_highlight_specs(self) -> List[Dict[str, Any]]:
577
+ """
578
+ Get highlight specifications for this element.
579
+
580
+ Returns a list of dictionaries, each containing:
581
+ - page: The Page object to highlight on
582
+ - page_index: The 0-based index of the page
583
+ - bbox: The bounding box (x0, y0, x1, y1) to highlight
584
+ - polygon: Optional polygon coordinates for non-rectangular highlights
585
+ - element: Reference to the element being highlighted
586
+
587
+ For regular elements, this returns a single spec.
588
+ For FlowRegions, this returns specs for all constituent regions.
589
+
590
+ Returns:
591
+ List of highlight specification dictionaries
592
+ """
593
+ # Default implementation for regular elements
594
+ if not hasattr(self, "page") or self.page is None:
595
+ return []
596
+
597
+ if not hasattr(self, "bbox") or self.bbox is None:
598
+ return []
599
+
600
+ spec = {
601
+ "page": self.page,
602
+ "page_index": self.page.index if hasattr(self.page, "index") else 0,
603
+ "bbox": self.bbox,
604
+ "element": self,
605
+ }
606
+
607
+ # Add polygon if available
608
+ if hasattr(self, "polygon") and hasattr(self, "has_polygon") and self.has_polygon:
609
+ spec["polygon"] = self.polygon
610
+
611
+ return [spec]
612
+
613
+
614
+ class Element(
615
+ DirectionalMixin, ClassificationMixin, DescribeMixin, HighlightableMixin, Visualizable
616
+ ):
567
617
  """Base class for all PDF elements.
568
618
 
569
619
  This class provides common properties and methods for all PDF elements,
@@ -1024,7 +1074,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1024
1074
  label: str = "",
1025
1075
  color: Optional[Tuple[float, float, float]] = None,
1026
1076
  use_color_cycling: bool = True,
1027
- include_attrs: Optional[List[str]] = None,
1077
+ annotate: Optional[List[str]] = None,
1028
1078
  existing: str = "append",
1029
1079
  ) -> "Element":
1030
1080
  """Highlight the element with the specified colour.
@@ -1042,7 +1092,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1042
1092
  "label": label,
1043
1093
  "use_color_cycling": use_color_cycling,
1044
1094
  "element": self, # Pass the element itself so attributes can be accessed
1045
- "include_attrs": include_attrs,
1095
+ "annotate": annotate,
1046
1096
  "existing": existing,
1047
1097
  }
1048
1098
 
@@ -1056,84 +1106,67 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1056
1106
 
1057
1107
  return self
1058
1108
 
1059
- def show(
1109
+ def _get_render_specs(
1060
1110
  self,
1061
- resolution: Optional[float] = None,
1062
- labels: bool = True,
1063
- legend_position: str = "right",
1064
- color: Optional[Union[Tuple, str]] = "red", # Default color for single element
1111
+ mode: Literal["show", "render"] = "show",
1112
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
1113
+ highlights: Optional[List[Dict[str, Any]]] = None,
1114
+ crop: Union[bool, Literal["content"]] = False,
1115
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
1065
1116
  label: Optional[str] = None,
1066
- width: Optional[int] = None, # Add width parameter
1067
- crop: bool = False, # NEW: Crop to element bounds before legend
1068
- ) -> Optional["Image.Image"]:
1069
- """
1070
- Show the page with only this element highlighted temporarily.
1117
+ **kwargs,
1118
+ ) -> List[RenderSpec]:
1119
+ """Get render specifications for this element.
1071
1120
 
1072
1121
  Args:
1073
- resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
1074
- labels: Whether to include a legend for the highlight
1075
- legend_position: Position of the legend
1076
- color: Color to highlight this element (default: red)
1077
- label: Optional label for this element in the legend
1078
- width: Optional width for the output image in pixels
1079
- crop: If True, crop the rendered image to this element's
1080
- bounding box before legends/overlays are added.
1122
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
1123
+ color: Color for highlighting this element in show mode
1124
+ highlights: Additional highlight groups to show
1125
+ crop: Whether to crop to element bounds
1126
+ crop_bbox: Explicit crop bounds
1127
+ label: Optional label for this element
1128
+ **kwargs: Additional parameters
1081
1129
 
1082
1130
  Returns:
1083
- PIL Image of the page with only this element highlighted, or None if error.
1131
+ List with single RenderSpec for this element's page
1084
1132
  """
1085
- # Apply global options as defaults
1086
- import natural_pdf
1087
-
1088
- if resolution is None:
1089
- if natural_pdf.options.image.resolution is not None:
1090
- resolution = natural_pdf.options.image.resolution
1091
- else:
1092
- resolution = 144 # Default resolution when none specified
1093
- if not hasattr(self, "page") or not self.page:
1094
- logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
1095
- return None
1096
- if not hasattr(self.page, "_highlighter") or not self.page._highlighter:
1097
- logger.warning(f"Cannot show element, page lacks highlighter service: {self}")
1098
- return None
1099
-
1100
- service = self.page._highlighter
1101
-
1102
- # Determine the label if not provided
1103
- display_label = label if label is not None else f"{self.__class__.__name__}"
1104
-
1105
- # Prepare temporary highlight data for just this element
1106
- temp_highlight_data = {
1107
- "page_index": self.page.index,
1108
- "bbox": self.bbox if not self.has_polygon else None,
1109
- "polygon": self.polygon if self.has_polygon else None,
1110
- "color": color, # Use provided or default color
1111
- "label": display_label,
1112
- "use_color_cycling": False, # Explicitly false for single preview
1113
- }
1133
+ if not hasattr(self, "page") or self.page is None:
1134
+ return []
1135
+
1136
+ spec = RenderSpec(page=self.page)
1137
+
1138
+ # Handle cropping
1139
+ if crop_bbox:
1140
+ spec.crop_bbox = crop_bbox
1141
+ elif crop == "content" or crop is True:
1142
+ # Crop to element bounds
1143
+ if hasattr(self, "bbox") and self.bbox:
1144
+ spec.crop_bbox = self.bbox
1145
+
1146
+ # Add highlight in show mode
1147
+ if mode == "show":
1148
+ # Use provided label or generate one
1149
+ element_label = label if label is not None else self.__class__.__name__
1150
+
1151
+ spec.add_highlight(
1152
+ element=self,
1153
+ color=color or "red", # Default red for single element
1154
+ label=element_label,
1155
+ )
1114
1156
 
1115
- # Determine crop bbox
1116
- crop_bbox = self.bbox if crop else None
1157
+ # Add additional highlight groups if provided
1158
+ if highlights:
1159
+ for group in highlights:
1160
+ group_elements = group.get("elements", [])
1161
+ group_color = group.get("color", color)
1162
+ group_label = group.get("label")
1117
1163
 
1118
- # Check if we actually got geometry data
1119
- if temp_highlight_data["bbox"] is None and temp_highlight_data["polygon"] is None:
1120
- logger.warning(f"Cannot show element, failed to get bbox or polygon: {self}")
1121
- return None
1164
+ for elem in group_elements:
1165
+ # Only add if element is on same page
1166
+ if hasattr(elem, "page") and elem.page == self.page:
1167
+ spec.add_highlight(element=elem, color=group_color, label=group_label)
1122
1168
 
1123
- # Use render_preview to show only this highlight
1124
- try:
1125
- return service.render_preview(
1126
- page_index=self.page.index,
1127
- temporary_highlights=[temp_highlight_data],
1128
- resolution=resolution,
1129
- width=width, # Pass the width parameter
1130
- labels=labels,
1131
- legend_position=legend_position,
1132
- crop_bbox=crop_bbox,
1133
- )
1134
- except Exception as e:
1135
- logger.error(f"Error calling render_preview for element {self}: {e}", exc_info=True)
1136
- return None
1169
+ return [spec]
1137
1170
 
1138
1171
  def save(
1139
1172
  self,
@@ -1346,22 +1379,14 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1346
1379
  resolution = kwargs.get("resolution", 150)
1347
1380
  from natural_pdf.elements.region import Region # Local import to avoid cycles
1348
1381
 
1349
- return self.expand().to_image(
1382
+ # Use render() for clean image without highlights
1383
+ return self.expand().render(
1350
1384
  resolution=resolution,
1351
- include_highlights=False,
1352
1385
  crop=True,
1353
1386
  )
1354
1387
  else:
1355
1388
  raise ValueError(f"Unsupported model_type for classification: {model_type}")
1356
1389
 
1357
- # ------------------------------------------------------------------
1358
- # Lightweight to_image proxy (vision models, previews, etc.)
1359
- # ------------------------------------------------------------------
1360
-
1361
- def to_image(self, *args, **kwargs): # type: ignore[override]
1362
- """Generate an image of this element by delegating to a temporary Region."""
1363
- return self.expand().to_image(*args, **kwargs)
1364
-
1365
1390
  # ------------------------------------------------------------------
1366
1391
  # Unified analysis storage (maps to metadata["analysis"])
1367
1392
  # ------------------------------------------------------------------