natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,16 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Callable,
6
+ Dict,
7
+ List,
8
+ Literal,
9
+ Optional,
10
+ Tuple,
11
+ Union,
12
+ overload,
13
+ )
3
14
 
4
15
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
16
 
@@ -15,23 +26,29 @@ from natural_pdf.classification.manager import ClassificationManager # Keep for
15
26
 
16
27
  # --- Classification Imports --- #
17
28
  from natural_pdf.classification.mixin import ClassificationMixin
29
+
30
+ # Add Visualizable import
31
+ from natural_pdf.core.render_spec import RenderSpec, Visualizable
18
32
  from natural_pdf.describe.mixin import DescribeMixin
19
33
  from natural_pdf.elements.base import DirectionalMixin
20
34
  from natural_pdf.elements.text import TextElement # ADDED IMPORT
21
35
  from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
22
36
  from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
23
37
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
24
- from natural_pdf.text_mixin import TextMixin
25
38
 
26
39
  # ------------------------------------------------------------------
27
40
  # Table utilities
28
41
  # ------------------------------------------------------------------
29
42
  from natural_pdf.tables import TableResult
43
+ from natural_pdf.text_mixin import TextMixin
30
44
  from natural_pdf.utils.locks import pdf_render_lock # Import the lock
31
45
 
32
46
  # Import new utils
33
47
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
34
48
 
49
+ # Import viewer widget support
50
+ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
51
+
35
52
  # --- End Classification Imports --- #
36
53
 
37
54
 
@@ -43,7 +60,7 @@ if TYPE_CHECKING:
43
60
 
44
61
  from natural_pdf.core.page import Page
45
62
  from natural_pdf.elements.base import Element # Added for type hint
46
- from natural_pdf.elements.collections import ElementCollection
63
+ from natural_pdf.elements.element_collection import ElementCollection
47
64
  from natural_pdf.elements.text import TextElement
48
65
 
49
66
  # Import OCRManager conditionally to avoid circular imports
@@ -63,6 +80,7 @@ class Region(
63
80
  ExtractionMixin,
64
81
  ShapeDetectionMixin,
65
82
  DescribeMixin,
83
+ Visualizable,
66
84
  ):
67
85
  """Represents a rectangular region on a page.
68
86
 
@@ -199,6 +217,62 @@ class Region(
199
217
  self.text_content = None # Direct text content (e.g., from Docling)
200
218
  self.associated_text_elements = [] # Native text elements that overlap with this region
201
219
 
220
+ def _get_render_specs(
221
+ self,
222
+ mode: Literal["show", "render"] = "show",
223
+ color: Optional[Union[str, Tuple[int, int, int]]] = None,
224
+ highlights: Optional[List[Dict[str, Any]]] = None,
225
+ crop: Union[bool, Literal["content"]] = True, # Default to True for regions
226
+ crop_bbox: Optional[Tuple[float, float, float, float]] = None,
227
+ **kwargs,
228
+ ) -> List[RenderSpec]:
229
+ """Get render specifications for this region.
230
+
231
+ Args:
232
+ mode: Rendering mode - 'show' includes highlights, 'render' is clean
233
+ color: Color for highlighting this region in show mode
234
+ highlights: Additional highlight groups to show
235
+ crop: Whether to crop to this region
236
+ crop_bbox: Explicit crop bounds (overrides region bounds)
237
+ **kwargs: Additional parameters
238
+
239
+ Returns:
240
+ List containing a single RenderSpec for this region's page
241
+ """
242
+ from typing import Literal
243
+
244
+ spec = RenderSpec(page=self.page)
245
+
246
+ # Handle cropping
247
+ if crop_bbox:
248
+ spec.crop_bbox = crop_bbox
249
+ elif crop:
250
+ # Crop to this region's bounds
251
+ spec.crop_bbox = self.bbox
252
+
253
+ # Add highlights in show mode
254
+ if mode == "show":
255
+ # Highlight this region
256
+ if color or mode == "show": # Always highlight in show mode
257
+ spec.add_highlight(
258
+ bbox=self.bbox,
259
+ polygon=self.polygon if self.has_polygon else None,
260
+ color=color or "blue",
261
+ label=self.label or self.name or "Region",
262
+ )
263
+
264
+ # Add additional highlight groups if provided
265
+ if highlights:
266
+ for group in highlights:
267
+ elements = group.get("elements", [])
268
+ group_color = group.get("color", color)
269
+ group_label = group.get("label")
270
+
271
+ for elem in elements:
272
+ spec.add_highlight(element=elem, color=group_color, label=group_label)
273
+
274
+ return [spec]
275
+
202
276
  def _direction(
203
277
  self,
204
278
  direction: str,
@@ -639,7 +713,7 @@ class Region(
639
713
  label: Optional[str] = None,
640
714
  color: Optional[Union[Tuple, str]] = None,
641
715
  use_color_cycling: bool = False,
642
- include_attrs: Optional[List[str]] = None,
716
+ annotate: Optional[List[str]] = None,
643
717
  existing: str = "append",
644
718
  ) -> "Region":
645
719
  """
@@ -649,7 +723,7 @@ class Region(
649
723
  label: Optional label for the highlight
650
724
  color: Color tuple/string for the highlight, or None to use automatic color
651
725
  use_color_cycling: Force color cycling even with no label (default: False)
652
- include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
726
+ annotate: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
653
727
  existing: How to handle existing highlights ('append' or 'replace').
654
728
 
655
729
  Returns:
@@ -665,7 +739,7 @@ class Region(
665
739
  "label": label,
666
740
  "use_color_cycling": use_color_cycling,
667
741
  "element": self, # Pass the region itself so attributes can be accessed
668
- "include_attrs": include_attrs,
742
+ "annotate": annotate,
669
743
  "existing": existing,
670
744
  }
671
745
 
@@ -679,178 +753,6 @@ class Region(
679
753
 
680
754
  return self
681
755
 
682
- def to_image(
683
- self,
684
- resolution: Optional[float] = None,
685
- crop: bool = False,
686
- include_highlights: bool = True,
687
- **kwargs,
688
- ) -> "Image.Image":
689
- """
690
- Generate an image of just this region.
691
-
692
- Args:
693
- resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
694
- crop: If True, only crop the region without highlighting its boundaries
695
- include_highlights: Whether to include existing highlights (default: True)
696
- **kwargs: Additional parameters for page.to_image()
697
-
698
- Returns:
699
- PIL Image of just this region
700
- """
701
- # Apply global options as defaults
702
- import natural_pdf
703
-
704
- if resolution is None:
705
- if natural_pdf.options.image.resolution is not None:
706
- resolution = natural_pdf.options.image.resolution
707
- else:
708
- resolution = 144 # Default resolution when none specified
709
-
710
- # Handle the case where user wants the cropped region to have a specific width
711
- page_kwargs = kwargs.copy()
712
- effective_resolution = resolution # Start with the provided resolution
713
-
714
- if crop and "width" in kwargs:
715
- target_width = kwargs["width"]
716
- # Calculate what resolution is needed to make the region crop have target_width
717
- region_width_points = self.width # Region width in PDF points
718
-
719
- if region_width_points > 0:
720
- # Calculate scale needed: target_width / region_width_points
721
- required_scale = target_width / region_width_points
722
- # Convert scale to resolution: scale * 72 DPI
723
- effective_resolution = required_scale * 72.0
724
- page_kwargs.pop("width") # Remove width parameter to avoid conflicts
725
- logger.debug(
726
- f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}"
727
- )
728
- else:
729
- logger.warning(
730
- f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution"
731
- )
732
-
733
- # First get the full page image with highlights if requested
734
- page_image = self._page.to_image(
735
- resolution=effective_resolution,
736
- include_highlights=include_highlights,
737
- **page_kwargs,
738
- )
739
-
740
- # Calculate the actual scale factor used by the page image
741
- if page_image.width > 0 and self._page.width > 0:
742
- scale_factor = page_image.width / self._page.width
743
- else:
744
- # Fallback to resolution-based calculation if dimensions are invalid
745
- scale_factor = resolution / 72.0
746
-
747
- # Apply scaling to the coordinates
748
- x0 = int(self.x0 * scale_factor)
749
- top = int(self.top * scale_factor)
750
- x1 = int(self.x1 * scale_factor)
751
- bottom = int(self.bottom * scale_factor)
752
-
753
- # Ensure coords are valid for cropping (left < right, top < bottom)
754
- if x0 >= x1:
755
- logger.warning(
756
- f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
757
- )
758
- return None
759
- if top >= bottom:
760
- logger.warning(
761
- f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
762
- )
763
- return None
764
-
765
- # Crop the image to just this region
766
- region_image = page_image.crop((x0, top, x1, bottom))
767
-
768
- # If not crop, add a border to highlight the region boundaries
769
- if not crop:
770
- from PIL import ImageDraw
771
-
772
- # Create a 1px border around the region
773
- draw = ImageDraw.Draw(region_image)
774
- draw.rectangle(
775
- (0, 0, region_image.width - 1, region_image.height - 1),
776
- outline=(255, 0, 0),
777
- width=1,
778
- )
779
-
780
- return region_image
781
-
782
- def show(
783
- self,
784
- resolution: Optional[float] = None,
785
- labels: bool = True,
786
- legend_position: str = "right",
787
- # Add a default color for standalone show
788
- color: Optional[Union[Tuple, str]] = "blue",
789
- label: Optional[str] = None,
790
- width: Optional[int] = None, # Add width parameter
791
- crop: bool = False, # NEW: Crop output to region bounds before legend
792
- ) -> "Image.Image":
793
- """
794
- Show the page with just this region highlighted temporarily.
795
-
796
- Args:
797
- resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
798
- labels: Whether to include a legend for labels
799
- legend_position: Position of the legend
800
- color: Color to highlight this region (default: blue)
801
- label: Optional label for this region in the legend
802
- width: Optional width for the output image in pixels
803
- crop: If True, crop the rendered image to this region's
804
- bounding box (with a small margin handled inside
805
- HighlightingService) before legends/overlays are added.
806
-
807
- Returns:
808
- PIL Image of the page with only this region highlighted
809
- """
810
- # Apply global options as defaults
811
- import natural_pdf
812
-
813
- if resolution is None:
814
- if natural_pdf.options.image.resolution is not None:
815
- resolution = natural_pdf.options.image.resolution
816
- else:
817
- resolution = 144 # Default resolution when none specified
818
-
819
- if not self._page:
820
- raise ValueError("Region must be associated with a page to show.")
821
-
822
- # Use the highlighting service via the page's property
823
- service = self._page._highlighter
824
-
825
- # Determine the label if not provided
826
- display_label = (
827
- label if label is not None else f"Region ({self.type})" if self.type else "Region"
828
- )
829
-
830
- # Prepare temporary highlight data for just this region
831
- temp_highlight_data = {
832
- "page_index": self._page.index,
833
- "bbox": self.bbox,
834
- "polygon": self.polygon if self.has_polygon else None,
835
- "color": color, # Use provided or default color
836
- "label": display_label,
837
- "use_color_cycling": False, # Explicitly false for single preview
838
- }
839
-
840
- # Determine crop bbox if requested
841
- crop_bbox = self.bbox if crop else None
842
-
843
- # Use render_preview to show only this highlight
844
- return service.render_preview(
845
- page_index=self._page.index,
846
- temporary_highlights=[temp_highlight_data],
847
- resolution=resolution,
848
- width=width, # Pass the width parameter
849
- labels=labels,
850
- legend_position=legend_position,
851
- crop_bbox=crop_bbox,
852
- )
853
-
854
756
  def save(
855
757
  self,
856
758
  filename: str,
@@ -904,7 +806,7 @@ class Region(
904
806
  resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
905
807
  crop: If True, only crop the region without highlighting its boundaries
906
808
  include_highlights: Whether to include existing highlights (default: True)
907
- **kwargs: Additional parameters for page.to_image()
809
+ **kwargs: Additional parameters for rendering
908
810
 
909
811
  Returns:
910
812
  Self for method chaining
@@ -918,16 +820,23 @@ class Region(
918
820
  else:
919
821
  resolution = 144 # Default resolution when none specified
920
822
 
921
- # Get the region image
922
- image = self.to_image(
923
- resolution=resolution,
924
- crop=crop,
925
- include_highlights=include_highlights,
926
- **kwargs,
927
- )
823
+ # Use export() to save the image
824
+ if include_highlights:
825
+ # With highlights, use export() which includes them
826
+ self.export(
827
+ path=filename,
828
+ resolution=resolution,
829
+ crop=crop,
830
+ **kwargs,
831
+ )
832
+ else:
833
+ # Without highlights, use render() and save manually
834
+ image = self.render(resolution=resolution, crop=crop, **kwargs)
835
+ if image:
836
+ image.save(filename)
837
+ else:
838
+ logger.error(f"Failed to render region image for saving to {filename}")
928
839
 
929
- # Save the image
930
- image.save(filename)
931
840
  return self
932
841
 
933
842
  def trim(
@@ -988,7 +897,8 @@ class Region(
988
897
  )
989
898
 
990
899
  # Get the region image
991
- image = work_region.to_image(resolution=resolution, crop=True, include_highlights=False)
900
+ # Use render() for clean image without highlights, with cropping
901
+ image = work_region.render(resolution=resolution, crop=True)
992
902
 
993
903
  if image is None:
994
904
  logger.warning(
@@ -1227,7 +1137,9 @@ class Region(
1227
1137
  # Filter to elements in this region
1228
1138
  return [e for e in page_elements if self._is_element_in_region(e)]
1229
1139
 
1230
- def extract_text(self, apply_exclusions=True, debug=False, content_filter=None, **kwargs) -> str:
1140
+ def extract_text(
1141
+ self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
1142
+ ) -> str:
1231
1143
  """
1232
1144
  Extract text from this region, respecting page exclusions and using pdfplumber's
1233
1145
  layout engine (chars_to_textmap).
@@ -1299,7 +1211,7 @@ class Region(
1299
1211
  final_kwargs = kwargs.copy()
1300
1212
  if content_filter is not None:
1301
1213
  final_kwargs["content_filter"] = content_filter
1302
-
1214
+
1303
1215
  result = generate_text_layout(
1304
1216
  char_dicts=filtered_chars,
1305
1217
  layout_context_bbox=self.bbox, # Use region's bbox for context
@@ -1319,7 +1231,9 @@ class Region(
1319
1231
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1320
1232
  # --- NEW: Add tqdm control option --- #
1321
1233
  show_progress: bool = False, # Controls progress bar for text method
1322
- content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None, # NEW: Content filtering
1234
+ content_filter: Optional[
1235
+ Union[str, Callable[[str], bool], List[str]]
1236
+ ] = None, # NEW: Content filtering
1323
1237
  ) -> TableResult: # Return type allows Optional[str] for cells
1324
1238
  """
1325
1239
  Extract a table from this region.
@@ -1379,7 +1293,11 @@ class Region(
1379
1293
  logger.debug(
1380
1294
  f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1381
1295
  )
1382
- return TableResult(self._extract_table_from_cells(cell_regions_in_table, content_filter=content_filter))
1296
+ return TableResult(
1297
+ self._extract_table_from_cells(
1298
+ cell_regions_in_table, content_filter=content_filter
1299
+ )
1300
+ )
1383
1301
 
1384
1302
  # --------------------------------------------------------------- #
1385
1303
 
@@ -1460,7 +1378,9 @@ class Region(
1460
1378
 
1461
1379
  # Use the selected method
1462
1380
  if effective_method == "tatr":
1463
- table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter)
1381
+ table_rows = self._extract_table_tatr(
1382
+ use_ocr=use_ocr, ocr_config=ocr_config, content_filter=content_filter
1383
+ )
1464
1384
  elif effective_method == "text":
1465
1385
  current_text_options = text_options.copy()
1466
1386
  current_text_options["cell_extraction_func"] = cell_extraction_func
@@ -1763,10 +1683,12 @@ class Region(
1763
1683
  if cell is not None:
1764
1684
  # Apply RTL text processing first
1765
1685
  rtl_processed_cell = self._apply_rtl_processing_to_text(cell)
1766
-
1686
+
1767
1687
  # Then apply content filter if provided
1768
1688
  if content_filter is not None:
1769
- filtered_cell = self._apply_content_filter_to_text(rtl_processed_cell, content_filter)
1689
+ filtered_cell = self._apply_content_filter_to_text(
1690
+ rtl_processed_cell, content_filter
1691
+ )
1770
1692
  processed_row.append(filtered_cell)
1771
1693
  else:
1772
1694
  processed_row.append(rtl_processed_cell)
@@ -1776,7 +1698,9 @@ class Region(
1776
1698
  return processed_table
1777
1699
  return []
1778
1700
 
1779
- def _extract_table_tatr(self, use_ocr=False, ocr_config=None, content_filter=None) -> List[List[str]]:
1701
+ def _extract_table_tatr(
1702
+ self, use_ocr=False, ocr_config=None, content_filter=None
1703
+ ) -> List[List[str]]:
1780
1704
  """
1781
1705
  Extract table using TATR structure detection.
1782
1706
 
@@ -2173,7 +2097,7 @@ class Region(
2173
2097
  Returns:
2174
2098
  ElementCollection with matching elements.
2175
2099
  """
2176
- from natural_pdf.elements.collections import ElementCollection
2100
+ from natural_pdf.elements.element_collection import ElementCollection
2177
2101
 
2178
2102
  if selector is not None and text is not None:
2179
2103
  raise ValueError("Provide either 'selector' or 'text', not both.")
@@ -2258,7 +2182,7 @@ class Region(
2258
2182
  ---------
2259
2183
  ```python
2260
2184
  def llm_ocr(region):
2261
- image = region.to_image(resolution=300, crop=True)
2185
+ image = region.render(resolution=300, crop=True)
2262
2186
  return my_llm_client.ocr(image)
2263
2187
  region.apply_ocr(function=llm_ocr)
2264
2188
  ```
@@ -2368,9 +2292,8 @@ class Region(
2368
2292
 
2369
2293
  # Render the page region to an image using the determined resolution
2370
2294
  try:
2371
- region_image = self.to_image(
2372
- resolution=final_resolution, include_highlights=False, crop=True
2373
- )
2295
+ # Use render() for clean image without highlights, with cropping
2296
+ region_image = self.render(resolution=final_resolution, crop=True)
2374
2297
  if not region_image:
2375
2298
  logger.error("Failed to render region to image for OCR.")
2376
2299
  return self
@@ -2492,7 +2415,7 @@ class Region(
2492
2415
  Example:
2493
2416
  # Using with an LLM
2494
2417
  def ocr_with_llm(region):
2495
- image = region.to_image(resolution=300, crop=True)
2418
+ image = region.render(resolution=300, crop=True)
2496
2419
  # Call your LLM API here
2497
2420
  return llm_client.ocr(image)
2498
2421
 
@@ -2500,7 +2423,7 @@ class Region(
2500
2423
 
2501
2424
  # Using with a custom OCR service
2502
2425
  def ocr_with_service(region):
2503
- img_bytes = region.to_image(crop=True).tobytes()
2426
+ img_bytes = region.render(crop=True).tobytes()
2504
2427
  response = ocr_service.process(img_bytes)
2505
2428
  return response.text
2506
2429
 
@@ -2605,14 +2528,14 @@ class Region(
2605
2528
 
2606
2529
  return self
2607
2530
 
2608
- def get_section_between(self, start_element=None, end_element=None, boundary_inclusion="both"):
2531
+ def get_section_between(self, start_element=None, end_element=None, include_boundaries="both"):
2609
2532
  """
2610
2533
  Get a section between two elements within this region.
2611
2534
 
2612
2535
  Args:
2613
2536
  start_element: Element marking the start of the section
2614
2537
  end_element: Element marking the end of the section
2615
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
2538
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2616
2539
 
2617
2540
  Returns:
2618
2541
  Region representing the section
@@ -2661,15 +2584,15 @@ class Region(
2661
2584
  start_element_for_bbox = start_element
2662
2585
  end_element_for_bbox = end_element
2663
2586
 
2664
- if boundary_inclusion == "none":
2587
+ if include_boundaries == "none":
2665
2588
  start_idx += 1
2666
2589
  end_idx -= 1
2667
2590
  start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
2668
2591
  end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
2669
- elif boundary_inclusion == "start":
2592
+ elif include_boundaries == "start":
2670
2593
  end_idx -= 1
2671
2594
  end_element_for_bbox = elements[end_idx] if start_idx <= end_idx else None
2672
- elif boundary_inclusion == "end":
2595
+ elif include_boundaries == "end":
2673
2596
  start_idx += 1
2674
2597
  start_element_for_bbox = elements[start_idx] if start_idx <= end_idx else None
2675
2598
 
@@ -2702,7 +2625,7 @@ class Region(
2702
2625
  return section
2703
2626
 
2704
2627
  def get_sections(
2705
- self, start_elements=None, end_elements=None, boundary_inclusion="both"
2628
+ self, start_elements=None, end_elements=None, include_boundaries="both"
2706
2629
  ) -> "ElementCollection[Region]":
2707
2630
  """
2708
2631
  Get sections within this region based on start/end elements.
@@ -2710,12 +2633,12 @@ class Region(
2710
2633
  Args:
2711
2634
  start_elements: Elements or selector string that mark the start of sections
2712
2635
  end_elements: Elements or selector string that mark the end of sections
2713
- boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none'
2636
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2714
2637
 
2715
2638
  Returns:
2716
2639
  List of Region objects representing the extracted sections
2717
2640
  """
2718
- from natural_pdf.elements.collections import ElementCollection
2641
+ from natural_pdf.elements.element_collection import ElementCollection
2719
2642
 
2720
2643
  # Process string selectors to find elements WITHIN THIS REGION
2721
2644
  if isinstance(start_elements, str):
@@ -2789,7 +2712,7 @@ class Region(
2789
2712
  start_element = current_start_boundary["element"]
2790
2713
  end_element = boundary["element"]
2791
2714
  # Use the helper, ensuring elements are from within the region
2792
- section = self.get_section_between(start_element, end_element, boundary_inclusion)
2715
+ section = self.get_section_between(start_element, end_element, include_boundaries)
2793
2716
  sections.append(section)
2794
2717
  current_start_boundary = None # Reset
2795
2718
 
@@ -2806,7 +2729,7 @@ class Region(
2806
2729
  if end_idx >= 0 and end_idx >= current_start_boundary["index"]:
2807
2730
  end_element = all_elements_in_region[end_idx]
2808
2731
  section = self.get_section_between(
2809
- start_element, end_element, boundary_inclusion
2732
+ start_element, end_element, include_boundaries
2810
2733
  )
2811
2734
  sections.append(section)
2812
2735
  # Else: Section started and ended by consecutive start elements? Create empty?
@@ -2820,7 +2743,7 @@ class Region(
2820
2743
  start_element = current_start_boundary["element"]
2821
2744
  # End at the last element within the region
2822
2745
  end_element = all_elements_in_region[-1]
2823
- section = self.get_section_between(start_element, end_element, boundary_inclusion)
2746
+ section = self.get_section_between(start_element, end_element, include_boundaries)
2824
2747
  sections.append(section)
2825
2748
 
2826
2749
  return ElementCollection(sections)
@@ -3095,7 +3018,9 @@ class Region(
3095
3018
  override simply ensures the search is scoped to the region.
3096
3019
  """
3097
3020
 
3098
- return TextMixin.update_text(self, transform, selector=selector, apply_exclusions=apply_exclusions)
3021
+ return TextMixin.update_text(
3022
+ self, transform, selector=selector, apply_exclusions=apply_exclusions
3023
+ )
3099
3024
 
3100
3025
  # --- Classification Mixin Implementation --- #
3101
3026
  def _get_classification_manager(self) -> "ClassificationManager":
@@ -3136,9 +3061,8 @@ class Region(
3136
3061
  else default_resolution
3137
3062
  )
3138
3063
 
3139
- img = self.to_image(
3064
+ img = self.render(
3140
3065
  resolution=resolution,
3141
- include_highlights=False, # No highlights for classification input
3142
3066
  crop=True, # Just the region content
3143
3067
  )
3144
3068
  if img is None:
@@ -3268,7 +3192,7 @@ class Region(
3268
3192
  An ElementCollection containing temporary Region objects for each detected cell,
3269
3193
  or an empty ElementCollection if no cells are found or an error occurs.
3270
3194
  """
3271
- from natural_pdf.elements.collections import ElementCollection
3195
+ from natural_pdf.elements.element_collection import ElementCollection
3272
3196
 
3273
3197
  # 1. Perform the analysis (or use cached results)
3274
3198
  if "text_table_structure" in self.analyses:
@@ -3470,13 +3394,15 @@ class Region(
3470
3394
  # New helper: build table from pre-computed table_cell regions
3471
3395
  # ------------------------------------------------------------------
3472
3396
 
3473
- def _extract_table_from_cells(self, cell_regions: List["Region"], content_filter=None) -> List[List[Optional[str]]]:
3397
+ def _extract_table_from_cells(
3398
+ self, cell_regions: List["Region"], content_filter=None
3399
+ ) -> List[List[Optional[str]]]:
3474
3400
  """Construct a table (list-of-lists) from table_cell regions.
3475
3401
 
3476
3402
  This assumes each cell Region has metadata.row_index / col_index as written by
3477
3403
  detect_table_structure_from_lines(). If these keys are missing we will
3478
3404
  fall back to sorting by geometry.
3479
-
3405
+
3480
3406
  Args:
3481
3407
  cell_regions: List of table cell Region objects to extract text from
3482
3408
  content_filter: Optional content filter to apply to cell text extraction
@@ -3510,7 +3436,9 @@ class Region(
3510
3436
  try:
3511
3437
  r_idx = int(cell.metadata.get("row_index"))
3512
3438
  c_idx = int(cell.metadata.get("col_index"))
3513
- text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3439
+ text_val = cell.extract_text(
3440
+ layout=False, apply_exclusions=False, content_filter=content_filter
3441
+ ).strip()
3514
3442
  table_grid[r_idx][c_idx] = text_val if text_val else None
3515
3443
  except Exception as _err:
3516
3444
  # Skip problematic cell
@@ -3557,7 +3485,9 @@ class Region(
3557
3485
  row_idx = int(np.argmin([abs(cy - rc) for rc in row_centers]))
3558
3486
  col_idx = int(np.argmin([abs(cx - cc) for cc in col_centers]))
3559
3487
 
3560
- text_val = cell.extract_text(layout=False, apply_exclusions=False, content_filter=content_filter).strip()
3488
+ text_val = cell.extract_text(
3489
+ layout=False, apply_exclusions=False, content_filter=content_filter
3490
+ ).strip()
3561
3491
  table_grid[row_idx][col_idx] = text_val if text_val else None
3562
3492
 
3563
3493
  return table_grid
@@ -3565,32 +3495,33 @@ class Region(
3565
3495
  def _apply_rtl_processing_to_text(self, text: str) -> str:
3566
3496
  """
3567
3497
  Apply RTL (Right-to-Left) text processing to a string.
3568
-
3498
+
3569
3499
  This converts visual order text (as stored in PDFs) to logical order
3570
3500
  for proper display of Arabic, Hebrew, and other RTL scripts.
3571
-
3501
+
3572
3502
  Args:
3573
3503
  text: Input text string in visual order
3574
-
3504
+
3575
3505
  Returns:
3576
3506
  Text string in logical order
3577
3507
  """
3578
3508
  if not text or not text.strip():
3579
3509
  return text
3580
-
3510
+
3581
3511
  # Quick check for RTL characters - if none found, return as-is
3582
3512
  import unicodedata
3583
-
3513
+
3584
3514
  def _contains_rtl(s):
3585
3515
  return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
3586
-
3516
+
3587
3517
  if not _contains_rtl(text):
3588
3518
  return text
3589
-
3519
+
3590
3520
  try:
3591
3521
  from bidi.algorithm import get_display # type: ignore
3522
+
3592
3523
  from natural_pdf.utils.bidi_mirror import mirror_brackets
3593
-
3524
+
3594
3525
  # Apply BiDi algorithm to convert from visual to logical order
3595
3526
  # Process line by line to handle mixed content properly
3596
3527
  processed_lines = []
@@ -3603,9 +3534,9 @@ class Region(
3603
3534
  processed_lines.append(mirror_brackets(logical_line))
3604
3535
  else:
3605
3536
  processed_lines.append(line)
3606
-
3537
+
3607
3538
  return "\n".join(processed_lines)
3608
-
3539
+
3609
3540
  except (ImportError, Exception):
3610
3541
  # If bidi library is not available or fails, return original text
3611
3542
  return text
@@ -3613,36 +3544,36 @@ class Region(
3613
3544
  def _apply_content_filter_to_text(self, text: str, content_filter) -> str:
3614
3545
  """
3615
3546
  Apply content filter to a text string.
3616
-
3547
+
3617
3548
  Args:
3618
3549
  text: Input text string
3619
3550
  content_filter: Content filter (regex, callable, or list of regexes)
3620
-
3551
+
3621
3552
  Returns:
3622
3553
  Filtered text string
3623
3554
  """
3624
3555
  if not text or content_filter is None:
3625
3556
  return text
3626
-
3557
+
3627
3558
  import re
3628
-
3559
+
3629
3560
  if isinstance(content_filter, str):
3630
3561
  # Single regex pattern - remove matching parts
3631
3562
  try:
3632
- return re.sub(content_filter, '', text)
3563
+ return re.sub(content_filter, "", text)
3633
3564
  except re.error:
3634
3565
  return text # Invalid regex, return original
3635
-
3566
+
3636
3567
  elif isinstance(content_filter, list):
3637
3568
  # List of regex patterns - remove parts matching ANY pattern
3638
3569
  try:
3639
3570
  result = text
3640
3571
  for pattern in content_filter:
3641
- result = re.sub(pattern, '', result)
3572
+ result = re.sub(pattern, "", result)
3642
3573
  return result
3643
3574
  except re.error:
3644
3575
  return text # Invalid regex, return original
3645
-
3576
+
3646
3577
  elif callable(content_filter):
3647
3578
  # Callable filter - apply to individual characters
3648
3579
  try:
@@ -3650,8 +3581,152 @@ class Region(
3650
3581
  for char in text:
3651
3582
  if content_filter(char):
3652
3583
  filtered_chars.append(char)
3653
- return ''.join(filtered_chars)
3584
+ return "".join(filtered_chars)
3654
3585
  except Exception:
3655
3586
  return text # Function error, return original
3656
-
3587
+
3657
3588
  return text
3589
+
3590
+ # ------------------------------------------------------------------
3591
+ # Interactive Viewer Support
3592
+ # ------------------------------------------------------------------
3593
+
3594
def viewer(
    self,
    *,
    resolution: int = 150,
    include_chars: bool = False,
    include_attributes: Optional[List[str]] = None,
) -> Optional["InteractiveViewerWidget"]:
    """Create an interactive ipywidget viewer for **this specific region**.

    The method renders the region to an image (cropped to the region bounds)
    and overlays the elements returned by ``self.get_elements`` (optionally
    excluding noisy character-level elements). Element coordinates are
    expressed relative to the region's top-left corner and scaled to the
    rendered image.

    Parameters
    ----------
    resolution : int, default 150
        Rendering resolution (DPI). Overlay coordinates are scaled by
        ``resolution / 72`` so they line up with the rendered image.
    include_chars : bool, default False
        Whether to include individual *char* elements in the overlay. These
        are often too dense for a meaningful visualisation so are skipped by
        default.
    include_attributes : list[str], optional
        Additional element attributes to expose in the info panel (on top of
        the default set). Names already in the default set are ignored.

    Returns
    -------
    InteractiveViewerWidget | None
        The widget instance, or ``None`` if *ipywidgets* is not installed,
        the region could not be rendered, or an error occurred during
        creation (the error is logged, not raised).
    """

    # ------------------------------------------------------------------
    # Dependency / environment checks
    # ------------------------------------------------------------------
    if not _IPYWIDGETS_AVAILABLE or InteractiveViewerWidget is None:
        logger.error(
            "Interactive viewer requires 'ipywidgets'. "
            'Please install with: pip install "ipywidgets>=7.0.0,<10.0.0"'
        )
        return None

    json_scalars = (str, int, float, bool, type(None))

    def _json_safe(value):
        """Return *value* reduced to JSON-compatible types.

        Scalars pass through untouched, lists and dicts are sanitised
        recursively, and anything else is replaced by ``str(value)``.
        Recursing matters because a list/dict attribute may itself hold
        objects the widget cannot serialise; catching that here keeps the
        failure inside the per-element guard below instead of breaking the
        whole widget later.
        """
        if isinstance(value, json_scalars):
            return value
        if isinstance(value, list):
            return [_json_safe(item) for item in value]
        if isinstance(value, dict):
            return {
                (key if isinstance(key, json_scalars) else str(key)): _json_safe(item)
                for key, item in value.items()
            }
        return str(value)

    try:
        # ------------------------------------------------------------------
        # Render region image (cropped) and encode as data URI
        # ------------------------------------------------------------------
        import base64
        from io import BytesIO

        # Use unified render() with crop=True to obtain just the region
        img = self.render(resolution=resolution, crop=True)
        if img is None:
            logger.error(f"Failed to render image for region {self.bbox} viewer.")
            return None

        buf = BytesIO()
        img.save(buf, format="PNG")
        img_str = base64.b64encode(buf.getvalue()).decode()
        image_uri = f"data:image/png;base64,{img_str}"

        # ------------------------------------------------------------------
        # Prepare element overlay data (coordinates relative to region)
        # ------------------------------------------------------------------
        scale = resolution / 72.0  # PDF points -> rendered pixels

        # Gather the region's elements, ignoring exclusion zones
        region_elements = self.get_elements(apply_exclusions=False)

        # Optionally filter out chars
        if not include_chars:
            region_elements = [
                el for el in region_elements if str(getattr(el, "type", "")).lower() != "char"
            ]

        default_attrs = [
            "text",
            "fontname",
            "size",
            "bold",
            "italic",
            "color",
            "linewidth",
            "is_horizontal",
            "is_vertical",
            "source",
            "confidence",
            "label",
            "model",
            "upright",
            "direction",
        ]

        if include_attributes:
            default_attrs.extend([a for a in include_attributes if a not in default_attrs])

        elements_json: List[dict] = []
        for idx, el in enumerate(region_elements):
            # Each element is prepared in isolation so one malformed element
            # is logged and skipped rather than aborting the whole viewer.
            try:
                # Calculate coordinates relative to region bbox and apply scale
                x0 = (el.x0 - self.x0) * scale
                y0 = (el.top - self.top) * scale
                x1 = (el.x1 - self.x0) * scale
                y1 = (el.bottom - self.top) * scale

                elem_dict = {
                    "id": idx,
                    "type": _json_safe(getattr(el, "type", "unknown")),
                    "x0": round(x0, 2),
                    "y0": round(y0, 2),
                    "x1": round(x1, 2),
                    "y1": round(y1, 2),
                    "width": round(x1 - x0, 2),
                    "height": round(y1 - y0, 2),
                }

                # Add requested / default attributes, reduced to JSON-safe values
                for attr_name in default_attrs:
                    if hasattr(el, attr_name):
                        elem_dict[attr_name] = _json_safe(getattr(el, attr_name))
                elements_json.append(elem_dict)
            except Exception as e:
                logger.warning(f"Error preparing element {idx} for region viewer: {e}")

        viewer_data = {"page_image": image_uri, "elements": elements_json}

        # ------------------------------------------------------------------
        # Instantiate the widget directly using the prepared data
        # ------------------------------------------------------------------
        return InteractiveViewerWidget(pdf_data=viewer_data)

    except Exception as e:
        logger.error(f"Error creating viewer for region {self.bbox}: {e}", exc_info=True)
        return None