natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,16 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Callable,
6
+ Dict,
7
+ List,
8
+ Literal,
9
+ Optional,
10
+ Tuple,
11
+ Union,
12
+ overload,
13
+ )
3
14
 
4
15
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
16
 
@@ -15,6 +26,9 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
15
26
 
16
27
  # --- Classification Imports --- #
17
28
  from natural_pdf.classification.mixin import ClassificationMixin
29
+
30
+ # Add Visualizable import
31
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
18
32
  from natural_pdf.describe.mixin import DescribeMixin
19
33
  from natural_pdf.elements.base import DirectionalMixin
20
34
  from natural_pdf.elements.text import TextElement # ADDED IMPORT
@@ -26,11 +40,15 @@ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
26
40
  # Table utilities
27
41
  # ------------------------------------------------------------------
28
42
  from natural_pdf.tables import TableResult
43
+ from natural_pdf.text_mixin import TextMixin
29
44
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
30
45
 
31
46
  # Import new utils
32
47
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
33
48
 
49
+ # Import viewer widget support
50
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
51
+
34
52
  # --- End Classification Imports --- #
35
53
 
36
54
 
@@ -42,7 +60,7 @@ if TYPE_CHECKING:
42
60
 
43
61
  from natural_pdf.core.page import Page
44
62
  from natural_pdf.elements.base import Element # Added for type hint
45
- from natural_pdf.elements.collections import ElementCollection
63
+ from natural_pdf.elements.element_collection import ElementCollection
46
64
  from natural_pdf.elements.text import TextElement
47
65
 
48
66
  # Import OCRManager conditionally to avoid circular imports
@@ -56,7 +74,13 @@ logger = logging.getLogger(__name__)
56
74
 
57
75
 
58
76
  class Region(
59
- DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin
77
+ TextMixin,
78
+ DirectionalMixin,
79
+ ClassificationMixin,
80
+ ExtractionMixin,
81
+ ShapeDetectionMixin,
82
+ DescribeMixin,
83
+ Visualizable,
60
84
  ):
61
85
  """Represents a rectangular region on a page.
62
86
 
@@ -193,6 +217,62 @@ class Region(
193
217
  self.text_content = None # Direct text content (e.g., from Docling)
194
218
  self.associated_text_elements = [] # Native text elements that overlap with this region
195
219
 
220
+ def _get_render_specs(
221
+ self,
222
+ mode: Literal["show", "render"] = "show",
223
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
224
+ highlights: Optional[List[Dict[str, Any]]] = None,
225
+ crop: Union[bool, Literal["content"]] = True, # Default to True for regions
226
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
227
+ **kwargs,
228
+ ) -> List[RenderSpec]:
229
+ """Get render specifications for this region.
230
+
231
+ Args:
232
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
233
+ color: Color for highlighting this region in show mode
234
+ highlights: Additional highlight groups to show
235
+ crop: Whether to crop to this region
236
+ crop_bbox: Explicit crop bounds (overrides region bounds)
237
+ **kwargs: Additional parameters
238
+
239
+ Returns:
240
+ List containing a single RenderSpec for this region's page
241
+ """
242
+ from typing import Literal
243
+
244
+ spec = RenderSpec(page=self.page)
245
+
246
+ # Handle cropping
247
+ if crop_bbox:
248
+ spec.crop_bbox = crop_bbox
249
+ elif crop:
250
+ # Crop to this region's bounds
251
+ spec.crop_bbox = self.bbox
252
+
253
+ # Add highlights in show mode
254
+ if mode == "show":
255
+ # Highlight this region
256
+ if color or mode == "show": # Always highlight in show mode
257
+ spec.add_highlight(
258
+ bbox=self.bbox,
259
+ polygon=self.polygon if self.has_polygon else None,
260
+ color=color or "blue",
261
+ label=self.label or self.name or "Region",
262
+ )
263
+
264
+ # Add additional highlight groups if provided
265
+ if highlights:
266
+ for group in highlights:
267
+ elements = group.get("elements", [])
268
+ group_color = group.get("color", color)
269
+ group_label = group.get("label")
270
+
271
+ for elem in elements:
272
+ spec.add_highlight(element=elem, color=group_color, label=group_label)
273
+
274
+ return [spec]
275
+
196
276
  def _direction(
197
277
  self,
198
278
  direction: str,
@@ -633,7 +713,7 @@ class Region(
633
713
  label: Optional[str] = None,
634
714
  color: Optional[Union[Tuple, str]] = None,
635
715
  use_color_cycling: bool = False,
636
- include_attrs: Optional[List[str]] = None,
716
+ annotate: Optional[List[str]] = None,
637
717
  existing: str = "append",
638
718
  ) -> "Region":
639
719
  """
@@ -643,7 +723,7 @@ class Region(
643
723
  label: Optional label for the highlight
644
724
  color: Color tuple/string for the highlight, or None to use automatic color
645
725
  use_color_cycling: Force color cycling even with no label (default: False)
646
- include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
726
+ annotate: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
647
727
  existing: How to handle existing highlights ('append' or 'replace').
648
728
 
649
729
  Returns:
@@ -659,7 +739,7 @@ class Region(
659
739
  "label": label,
660
740
  "use_color_cycling": use_color_cycling,
661
741
  "element": self, # Pass the region itself so attributes can be accessed
662
- "include_attrs": include_attrs,
742
+ "annotate": annotate,
663
743
  "existing": existing,
664
744
  }
665
745
 
@@ -673,178 +753,6 @@ class Region(
673
753
 
674
754
  return self
675
755
 
676
- def to_image(
677
- self,
678
- resolution: Optional[float] = None,
679
- crop: bool = False,
680
- include_highlights: bool = True,
681
- **kwargs,
682
- ) -> "Image.Image":
683
- """
684
- Generate an image of just this region.
685
-
686
- Args:
687
- resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
688
- crop: If True, only crop the region without highlighting its boundaries
689
- include_highlights: Whether to include existing highlights (default: True)
690
- **kwargs: Additional parameters for page.to_image()
691
-
692
- Returns:
693
- PIL Image of just this region
694
- """
695
- # Apply global options as defaults
696
- import natural_pdf
697
-
698
- if resolution is None:
699
- if natural_pdf.options.image.resolution is not None:
700
- resolution = natural_pdf.options.image.resolution
701
- else:
702
- resolution = 144 # Default resolution when none specified
703
-
704
- # Handle the case where user wants the cropped region to have a specific width
705
- page_kwargs = kwargs.copy()
706
- effective_resolution = resolution # Start with the provided resolution
707
-
708
- if crop and "width" in kwargs:
709
- target_width = kwargs["width"]
710
- # Calculate what resolution is needed to make the region crop have target_width
711
- region_width_points = self.width # Region width in PDF points
712
-
713
- if region_width_points > 0:
714
- # Calculate scale needed: target_width / region_width_points
715
- required_scale = target_width / region_width_points
716
- # Convert scale to resolution: scale * 72 DPI
717
- effective_resolution = required_scale * 72.0
718
- page_kwargs.pop("width") # Remove width parameter to avoid conflicts
719
- logger.debug(
720
- f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
721
- )
722
- else:
723
- logger.warning(
724
- f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
725
- )
726
-
727
- # First get the full page image with highlights if requested
728
- page_image = self._page.to_image(
729
- resolution=effective_resolution,
730
- include_highlights=include_highlights,
731
- **page_kwargs,
732
- )
733
-
734
- # Calculate the actual scale factor used by the page image
735
- if page_image.width > 0 and self._page.width > 0:
736
- scale_factor = page_image.width / self._page.width
737
- else:
738
- # Fallback to resolution-based calculation if dimensions are invalid
739
- scale_factor = resolution / 72.0
740
-
741
- # Apply scaling to the coordinates
742
- x0 = int(self.x0 * scale_factor)
743
- top = int(self.top * scale_factor)
744
- x1 = int(self.x1 * scale_factor)
745
- bottom = int(self.bottom * scale_factor)
746
-
747
- # Ensure coords are valid for cropping (left < right, top < bottom)
748
- if x0 >= x1:
749
- logger.warning(
750
- f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
751
- )
752
- return None
753
- if top >= bottom:
754
- logger.warning(
755
- f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
756
- )
757
- return None
758
-
759
- # Crop the image to just this region
760
- region_image = page_image.crop((x0, top, x1, bottom))
761
-
762
- # If not crop, add a border to highlight the region boundaries
763
- if not crop:
764
- from PIL import ImageDraw
765
-
766
- # Create a 1px border around the region
767
- draw = ImageDraw.Draw(region_image)
768
- draw.rectangle(
769
- (0, 0, region_image.width - 1, region_image.height - 1),
770
- outline=(255, 0, 0),
771
- width=1,
772
- )
773
-
774
- return region_image
775
-
776
- def show(
777
- self,
778
- resolution: Optional[float] = None,
779
- labels: bool = True,
780
- legend_position: str = "right",
781
- # Add a default color for standalone show
782
- color: Optional[Union[Tuple, str]] = "blue",
783
- label: Optional[str] = None,
784
- width: Optional[int] = None, # Add width parameter
785
- crop: bool = False, # NEW: Crop output to region bounds before legend
786
- ) -> "Image.Image":
787
- """
788
- Show the page with just this region highlighted temporarily.
789
-
790
- Args:
791
- resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
792
- labels: Whether to include a legend for labels
793
- legend_position: Position of the legend
794
- color: Color to highlight this region (default: blue)
795
- label: Optional label for this region in the legend
796
- width: Optional width for the output image in pixels
797
- crop: If True, crop the rendered image to this region's
798
- bounding box (with a small margin handled inside
799
- HighlightingService) before legends/overlays are added.
800
-
801
- Returns:
802
- PIL Image of the page with only this region highlighted
803
- """
804
- # Apply global options as defaults
805
- import natural_pdf
806
-
807
- if resolution is None:
808
- if natural_pdf.options.image.resolution is not None:
809
- resolution = natural_pdf.options.image.resolution
810
- else:
811
- resolution = 144 # Default resolution when none specified
812
-
813
- if not self._page:
814
- raise ValueError("Region must be associated with a page to show.")
815
-
816
- # Use the highlighting service via the page's property
817
- service = self._page._highlighter
818
-
819
- # Determine the label if not provided
820
- display_label = (
821
- label if label is not None else f"Region ({self.type})" if self.type else "Region"
822
- )
823
-
824
- # Prepare temporary highlight data for just this region
825
- temp_highlight_data = {
826
- "page_index": self._page.index,
827
- "bbox": self.bbox,
828
- "polygon": self.polygon if self.has_polygon else None,
829
- "color": color, # Use provided or default color
830
- "label": display_label,
831
- "use_color_cycling": False, # Explicitly false for single preview
832
- }
833
-
834
- # Determine crop bbox if requested
835
- crop_bbox = self.bbox if crop else None
836
-
837
- # Use render_preview to show only this highlight
838
- return service.render_preview(
839
- page_index=self._page.index,
840
- temporary_highlights=[temp_highlight_data],
841
- resolution=resolution,
842
- width=width, # Pass the width parameter
843
- labels=labels,
844
- legend_position=legend_position,
845
- crop_bbox=crop_bbox,
846
- )
847
-
848
756
  def save(
849
757
  self,
850
758
  filename: str,
@@ -898,7 +806,7 @@ class Region(
898
806
  resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
899
807
  crop: If True, only crop the region without highlighting its boundaries
900
808
  include_highlights: Whether to include existing highlights (default: True)
901
- **kwargs: Additional parameters for page.to_image()
809
+ **kwargs: Additional parameters for rendering
902
810
 
903
811
  Returns:
904
812
  Self for method chaining
@@ -912,16 +820,23 @@ class Region(
912
820
  else:
913
821
  resolution = 144 # Default resolution when none specified
914
822
 
915
- # Get the region image
916
- image = self.to_image(
917
- resolution=resolution,
918
- crop=crop,
919
- include_highlights=include_highlights,
920
- **kwargs,
921
- )
823
+ # Use export() to save the image
824
+ if include_highlights:
825
+ # With highlights, use export() which includes them
826
+ self.export(
827
+ path=filename,
828
+ resolution=resolution,
829
+ crop=crop,
830
+ **kwargs,
831
+ )
832
+ else:
833
+ # Without highlights, use render() and save manually
834
+ image = self.render(resolution=resolution, crop=crop, **kwargs)
835
+ if image:
836
+ image.save(filename)
837
+ else:
838
+ logger.error(f"Failed to render region image for saving to {filename}")
922
839
 
923
- # Save the image
924
- image.save(filename)
925
840
  return self
926
841
 
927
842
  def trim(
@@ -982,7 +897,8 @@ class Region(
982
897
  )
983
898
 
984
899
  # Get the region image
985
- image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
900
+ # Use render() for clean image without highlights, with cropping
901
+ image = work_region.render(resolution=resolution, crop=True)
986
902
 
987
903
  if image is None:
988
904
  logger.warning(
@@ -1221,7 +1137,9 @@ class Region(
1221
1137
  # Filter to elements in this region
1222
1138
  return [e for e in page_elements if self._is_element_in_region(e)]
1223
1139
 
1224
- def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
1140
+ def extract_text(
1141
+ self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
1142
+ ) -> str:
1225
1143
  """
1226
1144
  Extract text from this region, respecting page exclusions and using pdfplumber's
1227
1145
  layout engine (chars_to_textmap).
@@ -1293,7 +1211,7 @@ class Region(
1293
1211
  final_kwargs = kwargs.copy()
1294
1212
  if content_filter is not None:
1295
1213
  final_kwargs["content_filter"] = content_filter
1296
-
1214
+
1297
1215
  result = generate_text_layout(
1298
1216
  char_dicts=filtered_chars,
1299
1217
  layout_context_bbox=self.bbox, # Use region's bbox for context
@@ -1313,7 +1231,9 @@ class Region(
1313
1231
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1314
1232
  # --- NEW: Add tqdm control option --- #
1315
1233
  show_progress: bool = False, # Controls progress bar for text method
1316
- content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None, # NEW: Content filtering
1234
+ content_filter: Optional[
1235
+ Union[str, Callable[[str], bool], List[str]]
1236
+ ] = None, # NEW: Content filtering
1317
1237
  ) -> TableResult: # Return type allows Optional[str] for cells
1318
1238
  """
1319
1239
  Extract a table from this region.
@@ -1373,7 +1293,11 @@ class Region(
1373
1293
  logger.debug(
1374
1294
  f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1375
1295
  )
1376
- return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
1296
+ return TableResult(
1297
+ self._extract_table_from_cells(
1298
+ cell_regions_in_table, content_filter=content_filter
1299
+ )
1300
+ )
1377
1301
 
1378
1302
  # --------------------------------------------------------------- #
1379
1303
 
@@ -1454,7 +1378,9 @@ class Region(
1454
1378
 
1455
1379
  # Use the selected method
1456
1380
  if effective_method == "tatr":
1457
- table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
1381
+ table_rows = self._extract_table_tatr(
1382
+ use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
1383
+ )
1458
1384
  elif effective_method == "text":
1459
1385
  current_text_options = text_options.copy()
1460
1386
  current_text_options["cell_extraction_func"] = cell_extraction_func
@@ -1610,8 +1536,47 @@ class Region(
1610
1536
  table_settings.setdefault("join_x_tolerance", join)
1611
1537
  table_settings.setdefault("join_y_tolerance", join)
1612
1538
 
1613
- # Create a crop of the page for this region
1614
- cropped = self.page._page.crop(self.bbox)
1539
+ # -------------------------------------------------------------
1540
+ # Apply char-level exclusion filtering, if any exclusions are
1541
+ # defined on the parent Page. We create a lightweight
1542
+ # pdfplumber.Page copy whose .chars list omits characters that
1543
+ # fall inside any exclusion Region. Other object types are
1544
+ # left untouched for now ("chars-only" strategy).
1545
+ # -------------------------------------------------------------
1546
+ base_plumber_page = self.page._page
1547
+
1548
+ if getattr(self.page, "_exclusions", None):
1549
+ # Resolve exclusion Regions (callables already evaluated)
1550
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
1551
+
1552
+ def _keep_char(obj):
1553
+ """Return True if pdfplumber obj should be kept."""
1554
+ if obj.get("object_type") != "char":
1555
+ # Keep non-char objects unchanged – lattice grids etc.
1556
+ return True
1557
+
1558
+ # Compute character centre point
1559
+ cx = (obj["x0"] + obj["x1"]) / 2.0
1560
+ cy = (obj["top"] + obj["bottom"]) / 2.0
1561
+
1562
+ # Reject if the centre lies inside ANY exclusion Region
1563
+ for reg in exclusion_regions:
1564
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
1565
+ return False
1566
+ return True
1567
+
1568
+ try:
1569
+ filtered_page = base_plumber_page.filter(_keep_char)
1570
+ except Exception as _filter_err:
1571
+ # Fallback – if filtering fails, log and proceed unfiltered
1572
+ logger.warning(
1573
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions: {_filter_err}"
1574
+ )
1575
+ filtered_page = base_plumber_page
1576
+ else:
1577
+ filtered_page = base_plumber_page
1578
+
1579
+ cropped = filtered_page.crop(self.bbox)
1615
1580
 
1616
1581
  # Extract all tables from the cropped area
1617
1582
  tables = cropped.extract_tables(table_settings)
@@ -1672,8 +1637,38 @@ class Region(
1672
1637
  if y_tol is not None:
1673
1638
  table_settings.setdefault("text_y_tolerance", y_tol)
1674
1639
 
1675
- # Create a crop of the page for this region
1676
- cropped = self.page._page.crop(self.bbox)
1640
+ # -------------------------------------------------------------
1641
+ # Apply char-level exclusion filtering (chars only) just like in
1642
+ # _extract_tables_plumber so header/footer text does not appear
1643
+ # in extracted tables.
1644
+ # -------------------------------------------------------------
1645
+ base_plumber_page = self.page._page
1646
+
1647
+ if getattr(self.page, "_exclusions", None):
1648
+ exclusion_regions = self.page._get_exclusion_regions(include_callable=True)
1649
+
1650
+ def _keep_char(obj):
1651
+ if obj.get("object_type") != "char":
1652
+ return True
1653
+ cx = (obj["x0"] + obj["x1"]) / 2.0
1654
+ cy = (obj["top"] + obj["bottom"]) / 2.0
1655
+ for reg in exclusion_regions:
1656
+ if reg.x0 <= cx <= reg.x1 and reg.top <= cy <= reg.bottom:
1657
+ return False
1658
+ return True
1659
+
1660
+ try:
1661
+ filtered_page = base_plumber_page.filter(_keep_char)
1662
+ except Exception as _filter_err:
1663
+ logger.warning(
1664
+ f"Region {self.bbox}: Failed to filter pdfplumber chars for exclusions (single table): {_filter_err}"
1665
+ )
1666
+ filtered_page = base_plumber_page
1667
+ else:
1668
+ filtered_page = base_plumber_page
1669
+
1670
+ # Now crop the (possibly filtered) page to the region bbox
1671
+ cropped = filtered_page.crop(self.bbox)
1677
1672
 
1678
1673
  # Extract the single largest table from the cropped area
1679
1674
  table = cropped.extract_table(table_settings)
@@ -1688,10 +1683,12 @@ class Region(
1688
1683
  if cell is not None:
1689
1684
  # Apply RTL text processing first
1690
1685
  rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1691
-
1686
+
1692
1687
  # Then apply content filter if provided
1693
1688
  if content_filter is not None:
1694
- filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
1689
+ filtered_cell = self._apply_content_filter_to_text(
1690
+ rtl_processed_cell, content_filter
1691
+ )
1695
1692
  processed_row.append(filtered_cell)
1696
1693
  else:
1697
1694
  processed_row.append(rtl_processed_cell)
@@ -1701,7 +1698,9 @@ class Region(
1701
1698
  return processed_table
1702
1699
  return []
1703
1700
 
1704
- def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
1701
+ def _extract_table_tatr(
1702
+ self, use_ocr=False, ocr_config=None, content_filter=None
1703
+ ) -> List[List[str]]:
1705
1704
  """
1706
1705
  Extract table using TATR structure detection.
1707
1706
 
@@ -2098,7 +2097,7 @@ class Region(
2098
2097
  Returns:
2099
2098
  ElementCollection with matching elements.
2100
2099
  """
2101
- from natural_pdf.elements.collections import ElementCollection
2100
+ from natural_pdf.elements.element_collection import ElementCollection
2102
2101
 
2103
2102
  if selector is not None and text is not None:
2104
2103
  raise ValueError("Provide either 'selector' or 'text', not both.")
@@ -2183,7 +2182,7 @@ class Region(
2183
2182
  ---------
2184
2183
  ```python
2185
2184
  def llm_ocr(region):
2186
- image = region.to_image(resolution=300, crop=True)
2185
+ image = region.render(resolution=300, crop=True)
2187
2186
  return my_llm_client.ocr(image)
2188
2187
  region.apply_ocr(function=llm_ocr)
2189
2188
  ```
@@ -2293,9 +2292,8 @@ class Region(
2293
2292
 
2294
2293
  # Render the page region to an image using the determined resolution
2295
2294
  try:
2296
- region_image = self.to_image(
2297
- resolution=final_resolution, include_highlights=False, crop=True
2298
- )
2295
+ # Use render() for clean image without highlights, with cropping
2296
+ region_image = self.render(resolution=final_resolution, crop=True)
2299
2297
  if not region_image:
2300
2298
  logger.error("Failed to render region to image for OCR.")
2301
2299
  return self
@@ -2417,7 +2415,7 @@ class Region(
2417
2415
  Example:
2418
2416
  # Using with an LLM
2419
2417
  def ocr_with_llm(region):
2420
- image = region.to_image(resolution=300, crop=True)
2418
+ image = region.render(resolution=300, crop=True)
2421
2419
  # Call your LLM API here
2422
2420
  return llm_client.ocr(image)
2423
2421
 
@@ -2425,7 +2423,7 @@ class Region(
2425
2423
 
2426
2424
  # Using with a custom OCR service
2427
2425
  def ocr_with_service(region):
2428
- img_bytes = region.to_image(crop=True).tobytes()
2426
+ img_bytes = region.render(crop=True).tobytes()
2429
2427
  response = ocr_service.process(img_bytes)
2430
2428
  return response.text
2431
2429
 
@@ -2530,14 +2528,14 @@ class Region(
2530
2528
 
2531
2529
  return self
2532
2530
 
2533
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
2531
+ def get_section_between(self, start_element=None, end_element=None, include_boundaries="both"):
2534
2532
  """
2535
2533
  Get a section between two elements within this region.
2536
2534
 
2537
2535
  Args:
2538
2536
  start_element: Element marking the start of the section
2539
2537
  end_element: Element marking the end of the section
2540
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
2538
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2541
2539
 
2542
2540
  Returns:
2543
2541
  Region representing the section
@@ -2586,15 +2584,15 @@ class Region(
2586
2584
  start_element_for_bbox = start_element
2587
2585
  end_element_for_bbox = end_element
2588
2586
 
2589
- if boundary_inclusion == "none":
2587
+ if include_boundaries == "none":
2590
2588
  start_idx += 1
2591
2589
  end_idx -= 1
2592
2590
  start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
2593
2591
  end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
2594
- elif boundary_inclusion == "start":
2592
+ elif include_boundaries == "start":
2595
2593
  end_idx -= 1
2596
2594
  end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
2597
- elif boundary_inclusion == "end":
2595
+ elif include_boundaries == "end":
2598
2596
  start_idx += 1
2599
2597
  start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
2600
2598
 
@@ -2627,7 +2625,7 @@ class Region(
2627
2625
  return section
2628
2626
 
2629
2627
  def get_sections(
2630
- self, start_elements=None, end_elements=None, boundary_inclusion="both"
2628
+ self, start_elements=None, end_elements=None, include_boundaries="both"
2631
2629
  ) -> "ElementCollection[Region]":
2632
2630
  """
2633
2631
  Get sections within this region based on start/end elements.
@@ -2635,12 +2633,12 @@ class Region(
2635
2633
  Args:
2636
2634
  start_elements: Elements or selector string that mark the start of sections
2637
2635
  end_elements: Elements or selector string that mark the end of sections
2638
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
2636
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2639
2637
 
2640
2638
  Returns:
2641
2639
  List of Region objects representing the extracted sections
2642
2640
  """
2643
- from natural_pdf.elements.collections import ElementCollection
2641
+ from natural_pdf.elements.element_collection import ElementCollection
2644
2642
 
2645
2643
  # Process string selectors to find elements WITHIN THIS REGION
2646
2644
  if isinstance(start_elements, str):
@@ -2714,7 +2712,7 @@ class Region(
2714
2712
  start_element = current_start_boundary["element"]
2715
2713
  end_element = boundary["element"]
2716
2714
  # Use the helper, ensuring elements are from within the region
2717
- section = self.get_section_between(start_element, end_element, boundary_inclusion)
2715
+ section = self.get_section_between(start_element, end_element, include_boundaries)
2718
2716
  sections.append(section)
2719
2717
  current_start_boundary = None # Reset
2720
2718
 
@@ -2731,7 +2729,7 @@ class Region(
2731
2729
  if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
2732
2730
  end_element = all_elements_in_region[end_idx]
2733
2731
  section = self.get_section_between(
2734
- start_element, end_element, boundary_inclusion
2732
+ start_element, end_element, include_boundaries
2735
2733
  )
2736
2734
  sections.append(section)
2737
2735
  # Else: Section started and ended by consecutive start elements? Create empty?
@@ -2745,7 +2743,7 @@ class Region(
2745
2743
  start_element = current_start_boundary["element"]
2746
2744
  # End at the last element within the region
2747
2745
  end_element = all_elements_in_region[-1]
2748
- section = self.get_section_between(start_element, end_element, boundary_inclusion)
2746
+ section = self.get_section_between(start_element, end_element, include_boundaries)
2749
2747
  sections.append(section)
2750
2748
 
2751
2749
  return ElementCollection(sections)
@@ -3007,46 +3005,23 @@ class Region(
3007
3005
  source_info = f" source='{self.source}'" if self.source else ""
3008
3006
  return f"<Region{name_info}{type_info}{source_info} bbox={self.bbox}{poly_info}>"
3009
3007
 
3010
- def correct_ocr(
3008
+ def update_text(
3011
3009
  self,
3012
- correction_callback: Callable[[Any], Optional[str]],
3013
- ) -> "Region": # Return self for chaining
3014
- """
3015
- Applies corrections to OCR-generated text elements within this region
3016
- using a user-provided callback function.
3017
-
3018
- Finds text elements within this region whose 'source' attribute starts
3019
- with 'ocr' and calls the `correction_callback` for each, passing the
3020
- element itself.
3021
-
3022
- The `correction_callback` should contain the logic to:
3023
- 1. Determine if the element needs correction.
3024
- 2. Perform the correction (e.g., call an LLM).
3025
- 3. Return the new text (`str`) or `None`.
3026
-
3027
- If the callback returns a string, the element's `.text` is updated.
3028
- Metadata updates (source, confidence, etc.) should happen within the callback.
3029
-
3030
- Args:
3031
- correction_callback: A function accepting an element and returning
3032
- `Optional[str]` (new text or None).
3010
+ transform: Callable[[Any], Optional[str]],
3011
+ *,
3012
+ selector: str = "text",
3013
+ apply_exclusions: bool = False,
3014
+ ) -> "Region":
3015
+ """Apply *transform* to every text element matched by *selector* inside this region.
3033
3016
 
3034
- Returns:
3035
- Self for method chaining.
3017
+ The heavy lifting is delegated to :py:meth:`TextMixin.update_text`; this
3018
+ override simply ensures the search is scoped to the region.
3036
3019
  """
3037
- # Find OCR elements specifically within this region
3038
- # Note: We typically want to correct even if the element falls in an excluded area
3039
- target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
3040
3020
 
3041
- # Delegate to the utility function
3042
- _apply_ocr_correction_to_elements(
3043
- elements=target_elements, # Pass the ElementCollection directly
3044
- correction_callback=correction_callback,
3045
- caller_info=f"Region({self.bbox})", # Pass caller info
3021
+ return TextMixin.update_text(
3022
+ self, transform, selector=selector, apply_exclusions=apply_exclusions
3046
3023
  )
3047
3024
 
3048
- return self # Return self for chaining
3049
-
3050
3025
  # --- Classification Mixin Implementation --- #
3051
3026
  def _get_classification_manager(self) -> "ClassificationManager":
3052
3027
  if (
@@ -3086,9 +3061,8 @@ class Region(
3086
3061
  else default_resolution
3087
3062
  )
3088
3063
 
3089
- img = self.to_image(
3064
+ img = self.render(
3090
3065
  resolution=resolution,
3091
- include_highlights=False, # No highlights for classification input
3092
3066
  crop=True, # Just the region content
3093
3067
  )
3094
3068
  if img is None:
@@ -3218,7 +3192,7 @@ class Region(
3218
3192
  An ElementCollection containing temporary Region objects for each detected cell,
3219
3193
  or an empty ElementCollection if no cells are found or an error occurs.
3220
3194
  """
3221
- from natural_pdf.elements.collections import ElementCollection
3195
+ from natural_pdf.elements.element_collection import ElementCollection
3222
3196
 
3223
3197
  # 1. Perform the analysis (or use cached results)
3224
3198
  if "text_table_structure" in self.analyses:
@@ -3420,13 +3394,15 @@ class Region(
3420
3394
  # New helper: build table from pre-computed table_cell regions
3421
3395
  # ------------------------------------------------------------------
3422
3396
 
3423
- def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
3397
+ def _extract_table_from_cells(
3398
+ self, cell_regions: List["Region"], content_filter=None
3399
+ ) -> List[List[Optional[str]]]:
3424
3400
  """Construct a table (list-of-lists) from table_cell regions.
3425
3401
 
3426
3402
  This assumes each cell Region has metadata.row_index / col_index as written by
3427
3403
  detect_table_structure_from_lines(). If these keys are missing we will
3428
3404
  fall back to sorting by geometry.
3429
-
3405
+
3430
3406
  Args:
3431
3407
  cell_regions: List of table cell Region objects to extract text from
3432
3408
  content_filter: Optional content filter to apply to cell text extraction
@@ -3460,7 +3436,9 @@ class Region(
3460
3436
  try:
3461
3437
  r_idx = int(cell.metadata.get("row_index"))
3462
3438
  c_idx = int(cell.metadata.get("col_index"))
3463
- text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3439
+ text_val = cell.extract_text(
3440
+ layout=False, apply_exclusions=False, content_filter=content_filter
3441
+ ).strip()
3464
3442
  table_grid[r_idx][c_idx] = text_val if text_val else None
3465
3443
  except Exception as _err:
3466
3444
  # Skip problematic cell
@@ -3507,7 +3485,9 @@ class Region(
3507
3485
  row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
3508
3486
  col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
3509
3487
 
3510
- text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3488
+ text_val = cell.extract_text(
3489
+ layout=False, apply_exclusions=False, content_filter=content_filter
3490
+ ).strip()
3511
3491
  table_grid[row_idx][col_idx] = text_val if text_val else None
3512
3492
 
3513
3493
  return table_grid
@@ -3515,32 +3495,33 @@ class Region(
3515
3495
  def _apply_rtl_processing_to_text(self, text: str) -> str:
3516
3496
  """
3517
3497
  Apply RTL (Right-to-Left) text processing to a string.
3518
-
3498
+
3519
3499
  This converts visual order text (as stored in PDFs) to logical order
3520
3500
  for proper display of Arabic, Hebrew, and other RTL scripts.
3521
-
3501
+
3522
3502
  Args:
3523
3503
  text: Input text string in visual order
3524
-
3504
+
3525
3505
  Returns:
3526
3506
  Text string in logical order
3527
3507
  """
3528
3508
  if not text or not text.strip():
3529
3509
  return text
3530
-
3510
+
3531
3511
  # Quick check for RTL characters - if none found, return as-is
3532
3512
  import unicodedata
3533
-
3513
+
3534
3514
  def _contains_rtl(s):
3535
3515
  return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
3536
-
3516
+
3537
3517
  if not _contains_rtl(text):
3538
3518
  return text
3539
-
3519
+
3540
3520
  try:
3541
3521
  from bidi.algorithm import get_display # type: ignore
3522
+
3542
3523
  from natural_pdf.utils.bidi_mirror import mirror_brackets
3543
-
3524
+
3544
3525
  # Apply BiDi algorithm to convert from visual to logical order
3545
3526
  # Process line by line to handle mixed content properly
3546
3527
  processed_lines = []
@@ -3553,9 +3534,9 @@ class Region(
3553
3534
  processed_lines.append(mirror_brackets(logical_line))
3554
3535
  else:
3555
3536
  processed_lines.append(line)
3556
-
3537
+
3557
3538
  return "\n".join(processed_lines)
3558
-
3539
+
3559
3540
  except (ImportError, Exception):
3560
3541
  # If bidi library is not available or fails, return original text
3561
3542
  return text
@@ -3563,36 +3544,36 @@ class Region(
3563
3544
  def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
3564
3545
  """
3565
3546
  Apply content filter to a text string.
3566
-
3547
+
3567
3548
  Args:
3568
3549
  text: Input text string
3569
3550
  content_filter: Content filter (regex, callable, or list of regexes)
3570
-
3551
+
3571
3552
  Returns:
3572
3553
  Filtered text string
3573
3554
  """
3574
3555
  if not text or content_filter is None:
3575
3556
  return text
3576
-
3557
+
3577
3558
  import re
3578
-
3559
+
3579
3560
  if isinstance(content_filter, str):
3580
3561
  # Single regex pattern - remove matching parts
3581
3562
  try:
3582
- return re.sub(content_filter, '', text)
3563
+ return re.sub(content_filter, "", text)
3583
3564
  except re.error:
3584
3565
  return text # Invalid regex, return original
3585
-
3566
+
3586
3567
  elif isinstance(content_filter, list):
3587
3568
  # List of regex patterns - remove parts matching ANY pattern
3588
3569
  try:
3589
3570
  result = text
3590
3571
  for pattern in content_filter:
3591
- result = re.sub(pattern, '', result)
3572
+ result = re.sub(pattern, "", result)
3592
3573
  return result
3593
3574
  except re.error:
3594
3575
  return text # Invalid regex, return original
3595
-
3576
+
3596
3577
  elif callable(content_filter):
3597
3578
  # Callable filter - apply to individual characters
3598
3579
  try:
@@ -3600,8 +3581,152 @@ class Region(
3600
3581
  for char in text:
3601
3582
  if content_filter(char):
3602
3583
  filtered_chars.append(char)
3603
- return ''.join(filtered_chars)
3584
+ return "".join(filtered_chars)
3604
3585
  except Exception:
3605
3586
  return text # Function error, return original
3606
-
3587
+
3607
3588
  return text
3589
+
3590
+ # ------------------------------------------------------------------
3591
+ # Interactive Viewer Support
3592
+ # ------------------------------------------------------------------
3593
+
3594
+ def viewer(
3595
+ self,
3596
+ *,
3597
+ resolution: int = 150,
3598
+ include_chars: bool = False,
3599
+ include_attributes: Optional[List[str]] = None,
3600
+ ) -> Optional["InteractiveViewerWidget"]:
3601
+ """Create an interactive ipywidget viewer for **this specific region**.
3602
+
3603
+ The method renders the region to an image (cropped to the region bounds) and
3604
+ overlays all elements that intersect the region (optionally excluding noisy
3605
+ character-level elements). The resulting widget offers the same zoom / pan
3606
+ experience as :py:meth:`Page.viewer` but scoped to the region.
3607
+
3608
+ Parameters
3609
+ ----------
3610
+ resolution : int, default 150
3611
+ Rendering resolution (DPI). This should match the value used by the
3612
+ page-level viewer so element scaling is accurate.
3613
+ include_chars : bool, default False
3614
+ Whether to include individual *char* elements in the overlay. These
3615
+ are often too dense for a meaningful visualisation so are skipped by
3616
+ default.
3617
+ include_attributes : list[str], optional
3618
+ Additional element attributes to expose in the info panel (on top of
3619
+ the default set used by the page viewer).
3620
+
3621
+ Returns
3622
+ -------
3623
+ InteractiveViewerWidget | None
3624
+ The widget instance, or ``None`` if *ipywidgets* is not installed or
3625
+ an error occurred during creation.
3626
+ """
3627
+
3628
+ # ------------------------------------------------------------------
3629
+ # Dependency / environment checks
3630
+ # ------------------------------------------------------------------
3631
+ if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
3632
+ logger.error(
3633
+ "Interactive viewer requires 'ipywidgets'. "
3634
+ 'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
3635
+ )
3636
+ return None
3637
+
3638
+ try:
3639
+ # ------------------------------------------------------------------
3640
+ # Render region image (cropped) and encode as data URI
3641
+ # ------------------------------------------------------------------
3642
+ import base64
3643
+ from io import BytesIO
3644
+
3645
+ # Use unified render() with crop=True to obtain just the region
3646
+ img = self.render(resolution=resolution, crop=True)
3647
+ if img is None:
3648
+ logger.error(f"Failed to render image for region {self.bbox} viewer.")
3649
+ return None
3650
+
3651
+ buf = BytesIO()
3652
+ img.save(buf, format="PNG")
3653
+ img_str = base64.b64encode(buf.getvalue()).decode()
3654
+ image_uri = f"data:image/png;base64,{img_str}"
3655
+
3656
+ # ------------------------------------------------------------------
3657
+ # Prepare element overlay data (coordinates relative to region)
3658
+ # ------------------------------------------------------------------
3659
+ scale = resolution / 72.0 # Same convention as page viewer
3660
+
3661
+ # Gather elements intersecting the region
3662
+ region_elements = self.get_elements(apply_exclusions=False)
3663
+
3664
+ # Optionally filter out chars
3665
+ if not include_chars:
3666
+ region_elements = [
3667
+ el for el in region_elements if str(getattr(el, "type", "")).lower() != "char"
3668
+ ]
3669
+
3670
+ default_attrs = [
3671
+ "text",
3672
+ "fontname",
3673
+ "size",
3674
+ "bold",
3675
+ "italic",
3676
+ "color",
3677
+ "linewidth",
3678
+ "is_horizontal",
3679
+ "is_vertical",
3680
+ "source",
3681
+ "confidence",
3682
+ "label",
3683
+ "model",
3684
+ "upright",
3685
+ "direction",
3686
+ ]
3687
+
3688
+ if include_attributes:
3689
+ default_attrs.extend([a for a in include_attributes if a not in default_attrs])
3690
+
3691
+ elements_json: List[dict] = []
3692
+ for idx, el in enumerate(region_elements):
3693
+ try:
3694
+ # Calculate coordinates relative to region bbox and apply scale
3695
+ x0 = (el.x0 - self.x0) * scale
3696
+ y0 = (el.top - self.top) * scale
3697
+ x1 = (el.x1 - self.x0) * scale
3698
+ y1 = (el.bottom - self.top) * scale
3699
+
3700
+ elem_dict = {
3701
+ "id": idx,
3702
+ "type": getattr(el, "type", "unknown"),
3703
+ "x0": round(x0, 2),
3704
+ "y0": round(y0, 2),
3705
+ "x1": round(x1, 2),
3706
+ "y1": round(y1, 2),
3707
+ "width": round(x1 - x0, 2),
3708
+ "height": round(y1 - y0, 2),
3709
+ }
3710
+
3711
+ # Add requested / default attributes
3712
+ for attr_name in default_attrs:
3713
+ if hasattr(el, attr_name):
3714
+ val = getattr(el, attr_name)
3715
+ # Ensure JSON serialisable
3716
+ if not isinstance(val, (str, int, float, bool, list, dict, type(None))):
3717
+ val = str(val)
3718
+ elem_dict[attr_name] = val
3719
+ elements_json.append(elem_dict)
3720
+ except Exception as e:
3721
+ logger.warning(f"Error preparing element {idx} for region viewer: {e}")
3722
+
3723
+ viewer_data = {"page_image": image_uri, "elements": elements_json}
3724
+
3725
+ # ------------------------------------------------------------------
3726
+ # Instantiate the widget directly using the prepared data
3727
+ # ------------------------------------------------------------------
3728
+ return InteractiveViewerWidget(pdf_data=viewer_data)
3729
+
3730
+ except Exception as e:
3731
+ logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
3732
+ return None